In [None]:
"""
Script for preprocessing augmented text datasets.

This script performs the following steps:
1. As the data augmentation in LLAMA2_augmentation_classification.ipynb results in different files for each class,
    this script concatenates multiple CSV files for each category (violent crime, cybercrime, weapons trade, drugs trade).
2. Removes duplicate rows and text duplicates within each category.
3. Filters out rows where the text starts with specific patterns (e.g., 'Here') for further inspection.
    --> This part is manual, as it requires some inspection; the aim is to filter out instances where the Desired Format (see prompt)
        is repeated, or similar patterns like 'Here is one __ example'.
4. Cleans the text data by removing phone numbers, email addresses, non-ASCII characters, and uncommon punctuation.
5. Divides the cleaned datasets into train and trainval augmented sets based on provided train.csv and trainval.csv files.
6. Saves the resulting datasets as CSV files in the 'processed_datasets' folder for each category.

Usage:
- Make sure to customize the file paths, prefixes, and file numbers according to your dataset structure.
- Ensure that the 'train.csv' and 'trainval.csv' files are available for merging.

Note: The script assumes that the columns 'text', 'label', and 'snapshot_id' are present in the original datasets.
      The functions need to be called per each class individually.

"""


In [1]:
import re
from typing import List

import numpy
import pandas as pd

In [2]:
def concat_files(prefix: str, num_files: int) -> pd.DataFrame:
    """
    Stack the synthetic rows of the numbered CSV files ``{prefix}1.csv`` ... ``{prefix}{num_files}.csv``.

    Parameters:
    - prefix (str): Path prefix shared by the files; the file number and '.csv' are appended to it.
    - num_files (int): Highest file number to read (numbering starts at 1).

    The naming scheme (prefix + number) follows the one suggested in LLAMA2_augmentation_classification.ipynb.
    The augmented files hold both original and synthetic examples; only rows whose 'is synthetic'
    column equals True are kept, restricted to the 'text', 'label' and 'snapshot_id' columns,
    and rows with a missing value in any of those columns are dropped.

    Returns:
    pd.DataFrame: The selected rows of all files, in file order, with a fresh 0..n-1 index.
    """
    wanted_columns = ['text', 'label', 'snapshot_id']

    def _synthetic_rows(path: str) -> pd.DataFrame:
        # Read one file and keep its complete synthetic rows only.
        raw = pd.read_csv(path)
        is_synthetic = raw['is synthetic'] == True
        return raw.loc[is_synthetic, wanted_columns].dropna().copy()

    parts = [_synthetic_rows(f"{prefix}{number}.csv") for number in range(1, num_files + 1)]
    return pd.concat(parts, ignore_index=True)


In [3]:
# Path prefix and number of numbered CSV files for the violent crime class.
# These were previously commented out, which left `file_prefix`/`num_files` undefined on a fresh kernel.
# Adjust both values to your dataset layout.
file_prefix = "datasets_snapshot_id/aug_violent_crime_"
num_files = 35
df_aug_violent_crime = concat_files(file_prefix, num_files)

In [4]:
# Path prefix and number of numbered CSV files for the cybercrime class.
# These must be set in this cell; otherwise the values of a previously run class would be reused.
# Adjust both values to your dataset layout.
file_prefix = "datasets_snapshot_id/aug_cybercrime_"
num_files = 4
df_aug_cybercrime = concat_files(file_prefix, num_files)

In [5]:
# Path prefix and number of numbered CSV files for the weapons trade class.
# These must be set in this cell; otherwise the values of a previously run class would be reused.
# Adjust both values to your dataset layout.
file_prefix = "datasets_snapshot_id/aug_weapons_trade_"
num_files = 9
df_aug_weapons_trade = concat_files(file_prefix, num_files)

In [6]:
# Path prefix and number of numbered CSV files for the drugs trade class.
# These must be set in this cell; otherwise the values of a previously run class would be reused.
# Adjust both values to your dataset layout.
file_prefix = "datasets_snapshot_id/aug_drugs_trade_"
num_files = 2
df_aug_drugs_trade = concat_files(file_prefix, num_files)

Remove duplicates

In [7]:
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new DataFrame without rows that repeat an earlier row in every column.

    The first occurrence of each row is kept and the original index labels are preserved.
    """
    return df.drop_duplicates(keep='first')

def remove_text_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new DataFrame in which every value of the 'text' column appears only once.

    Only 'text' is compared; the first row carrying a given text is kept and the
    original index labels are preserved.
    """
    return df.drop_duplicates(subset='text', keep='first')

In [8]:
# Drop fully duplicated rows within each class.
(
    df_aug_violent_crime,
    df_aug_cybercrime,
    df_aug_weapons_trade,
    df_aug_drugs_trade,
) = map(
    remove_duplicates,
    (df_aug_violent_crime, df_aug_cybercrime, df_aug_weapons_trade, df_aug_drugs_trade),
)

In [10]:
# Drop rows whose 'text' repeats an earlier row of the same class.
(
    df_aug_violent_crime,
    df_aug_cybercrime,
    df_aug_weapons_trade,
    df_aug_drugs_trade,
) = map(
    remove_text_duplicates,
    (df_aug_violent_crime, df_aug_cybercrime, df_aug_weapons_trade, df_aug_drugs_trade),
)

Additional patterns 

In [11]:
def filter_by_start_texts(df: pd.DataFrame, start_texts: List[str]) -> pd.DataFrame:
    """
    Return the rows of `df` whose 'text' begins with any of `start_texts` (case-insensitive).

    Parameters:
    - df (pd.DataFrame): Frame with a 'text' column.
    - start_texts (List[str]): Prefixes to look for; they may have any length.

    Returns:
    pd.DataFrame: The matching rows with their original index labels, so the labels can be
    passed to `DataFrame.drop` after manual inspection. An empty `start_texts` matches nothing.
    """
    # Compare against the full prefix instead of a fixed 4-character slice, so prefixes
    # that are shorter or longer than 'Here' are matched as well.
    prefixes = tuple(start_text.lower() for start_text in start_texts)
    lowered = df['text'].str.lower()
    # Non-string entries (e.g. NaN) never match; astype(bool) keeps the mask boolean even when empty.
    condition = lowered.map(lambda text: isinstance(text, str) and text.startswith(prefixes)).astype(bool)
    return df[condition]
#Text entries starting with 'here' are inspected manually, and their indices are eventually dropped

start_texts = ['Here']

In [None]:
# Display the violent crime rows whose text starts with one of `start_texts`, for manual inspection.
filter_by_start_texts(df_aug_violent_crime, start_texts)

In [13]:
#df_aug_violent_crime = df_aug_violent_crime.drop([69, 461, 569]).reset_index(drop=True)
#Insert the indices of the entries to drop

In [None]:
# Display the cybercrime rows whose text starts with one of `start_texts`, for manual inspection.
filter_by_start_texts(df_aug_cybercrime, start_texts)

In [15]:
#df_aug_cybercrime = df_aug_cybercrime.drop([130]).reset_index(drop= True)
#Insert the indices of the entries to drop

In [None]:
# Display the weapons trade rows whose text starts with one of `start_texts`, for manual inspection.
filter_by_start_texts(df_aug_weapons_trade, start_texts)

In [17]:
#df_aug_weapons_trade = df_aug_weapons_trade.drop([102, 139, 331, 353, 433, 602, 714, 773]).reset_index(drop=True)
#Insert the indices of the entries to drop

In [None]:
# Display the drugs trade rows whose text starts with one of `start_texts`, for manual inspection.
filter_by_start_texts(df_aug_drugs_trade, start_texts)

In [19]:
#df_aug_drugs_trade = df_aug_drugs_trade.drop([136, 162, 163, 214, 254, 278, 425, 432, 446]).reset_index(drop=True)
#Insert the indices of the entries to drop

Extend original preprocessing steps

In [20]:
def clean_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    Performs text cleaning on a DataFrame containing a 'text' column.

    The cleaning process includes:
    1. Removing rows with less than 10 words.
    2. Removing phone numbers.
    3. Removing email addresses.
    4. Removing non-ASCII characters.
    5. Removing uncommon punctuation.

    Words are counted by splitting on single spaces, so consecutive spaces add to the count
    and other whitespace (e.g. newlines) does not separate words.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with a 'text' column of strings. It is not modified.

    Returns:
    pd.DataFrame: A new DataFrame with the cleaned 'text' column and an added 'text_len' column
    holding the word count of the text before cleaning.
    """
    # Substitutions applied to every text, in this order.
    patterns = (
        # Phone-number-like digit groups.
        r'\b(?:\+\d{1,2}\s?)?\(?(?:\d{1,4})?\)?[-.\s]?\d{1,5}[-.\s]?\d{1,5}[-.\s]?\d{1,9}\b',
        # Email addresses; the top-level domain class is letters only (it previously also accepted '|').
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b',
        # Non-ASCII characters.
        r'[^\x00-\x7F]+',
        # Anything that is not a word character, whitespace or common punctuation.
        r'[^\w\s.?!,:;\'"\d-]',
    )

    def _scrub(text: str) -> str:
        # Strip every pattern from a single text.
        for pattern in patterns:
            text = re.sub(pattern, '', text)
        return text

    # `assign` returns a new frame, so the caller's DataFrame does not gain a 'text_len' column.
    cleaned = df.assign(text_len=df['text'].apply(lambda x: len(x.split(' '))))
    cleaned = cleaned[cleaned['text_len'] >= 10].copy()
    cleaned['text'] = cleaned['text'].apply(_scrub)

    return cleaned

# Clean every class; the cleaned frames get the `_pro` suffix.
(
    df_aug_violent_crime_pro,
    df_aug_weapons_trade_pro,
    df_aug_cybercrime_pro,
    df_aug_drugs_trade_pro,
) = map(
    clean_text,
    (df_aug_violent_crime, df_aug_weapons_trade, df_aug_cybercrime, df_aug_drugs_trade),
)


In [None]:
"""
The following lines of code are aimed at dividing the overall processed augmented sets into training and trainval sets,
based on the 'snapshot_id', which identifies the original entry that was augmented. 

This is achieved by merging each augmented set with the original training set on snapshot_id and subsetting instances present in both sets.

These steps are repeated for each class. Finally, each training set is aggregated into the final set of augmented entries, by sampling the number of entries needed to reach the majority class.

Specify the path (based on configuration) and uncomment the line if saving is desired.

"""


Divide into train and trainval

In [22]:
# Load the train and trainval splits; the first CSV column is used as the DataFrame index.
# The merges below rely on a 'snapshot_id' column being present in both files.
df_train = pd.read_csv('train.csv', index_col=0)
df_trainval = pd.read_csv('trainval.csv', index_col=0)

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in train.csv.
df_merge_violent_crime = pd.merge(df_aug_violent_crime_pro, df_train, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_train_violent_crime = (
    df_merge_violent_crime
    .loc[df_merge_violent_crime['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_train_violent_crime.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in train.csv.
df_merge_cybercrime = pd.merge(df_aug_cybercrime_pro, df_train, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_train_cybercrime = (
    df_merge_cybercrime
    .loc[df_merge_cybercrime['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_train_cybercrime.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in train.csv.
df_merge_weapons_trade = pd.merge(df_aug_weapons_trade_pro, df_train, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_train_weapons_trade = (
    df_merge_weapons_trade
    .loc[df_merge_weapons_trade['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_train_weapons_trade.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in train.csv.
df_merge_drugs_trade = pd.merge(df_aug_drugs_trade_pro, df_train, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_train_drugs_trade = (
    df_merge_drugs_trade
    .loc[df_merge_drugs_trade['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_train_drugs_trade.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in trainval.csv.
df_merge_violent_crime = pd.merge(df_aug_violent_crime_pro, df_trainval, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_trainval_violent_crime = (
    df_merge_violent_crime
    .loc[df_merge_violent_crime['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_trainval_violent_crime.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in trainval.csv.
df_merge_cybercrime = pd.merge(df_aug_cybercrime_pro, df_trainval, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_trainval_cybercrime = (
    df_merge_cybercrime
    .loc[df_merge_cybercrime['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_trainval_cybercrime.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in trainval.csv.
df_merge_weapons_trade = pd.merge(df_aug_weapons_trade_pro, df_trainval, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_trainval_weapons_trade = (
    df_merge_weapons_trade
    .loc[df_merge_weapons_trade['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_trainval_weapons_trade.shape
#Check that shape is at least the required volume to reach the majority class

In [None]:
# Left-merge on snapshot_id; rows flagged 'both' are synthetic entries whose snapshot_id occurs in trainval.csv.
df_merge_drugs_trade = pd.merge(df_aug_drugs_trade_pro, df_trainval, how='left', on='snapshot_id', indicator=True)
# Keep the synthetic-side columns and rename them explicitly. Assigning a nested list to
# `.columns` would build a one-level MultiIndex instead of plain column names.
df_aug_trainval_drugs_trade = (
    df_merge_drugs_trade
    .loc[df_merge_drugs_trade['_merge'] == 'both', ['text_x', 'label_x', 'snapshot_id']]
    .rename(columns={'text_x': 'text', 'label_x': 'label'})
)
df_aug_trainval_drugs_trade.shape
#Check that shape is at least the required volume to reach the majority class

Aggregate into final set of augmented entries

In [None]:
# Draw a fixed-size, seeded sample from each class of the train split.
# `sample` raises if a class holds fewer rows than requested.
(
    df_aug_train_violent_crime,
    df_aug_train_weapons_trade,
    df_aug_train_drugs_trade,
    df_aug_train_cybercrime,
) = (
    class_frame.sample(n=sample_size, random_state=123).reset_index(drop=True).copy()
    for class_frame, sample_size in (
        (df_aug_train_violent_crime, 430),
        (df_aug_train_weapons_trade, 400),
        (df_aug_train_drugs_trade, 206),
        (df_aug_train_cybercrime, 116),
    )
)

# Stack the per-class samples into one frame with a fresh index.
augmented_train = pd.concat(
    [
        df_aug_train_violent_crime,
        df_aug_train_weapons_trade,
        df_aug_train_drugs_trade,
        df_aug_train_cybercrime,
    ],
    ignore_index=True,
)
#augmented_train.to_csv('augmented_train.csv')
#Name the file according to the configuration
#Uncommenting the line above will overwrite an existing file of that name


In [None]:
# Draw a fixed-size, seeded sample from each class of the trainval split.
# `sample` raises if a class holds fewer rows than requested.
(
    df_aug_trainval_violent_crime,
    df_aug_trainval_weapons_trade,
    df_aug_trainval_drugs_trade,
    df_aug_trainval_cybercrime,
) = (
    class_frame.sample(n=sample_size, random_state=123).reset_index(drop=True).copy()
    for class_frame, sample_size in (
        (df_aug_trainval_violent_crime, 575),
        (df_aug_trainval_weapons_trade, 535),
        (df_aug_trainval_drugs_trade, 275),
        (df_aug_trainval_cybercrime, 155),
    )
)

# Stack the per-class samples into one frame with a fresh index.
augmented_trainval = pd.concat(
    [
        df_aug_trainval_violent_crime,
        df_aug_trainval_weapons_trade,
        df_aug_trainval_drugs_trade,
        df_aug_trainval_cybercrime,
    ],
    ignore_index=True,
)
#augmented_trainval.to_csv('augmented_trainval.csv')
#Name the file according to the configuration
#Uncommenting the line above will overwrite an existing file of that name

Preparing samples for few-shot augmentation

In [None]:
"""
This code prepares samples for few-shot augmentation by selecting examples from the original training set and their corresponding augmented counterparts for specific categories (violent crime, cybercrime, weapons trade, drugs trade). 

The process involves the following steps:

1. Load Datasets:
   - Load the original training dataset.
   - Load the augmented datasets for different categories.

2. Sample Examples:
   - Sample a specified number (15) of examples from each augmented category to use in few-shot augmentation.

3. Merge Samples with Original Training Examples:
   - For each category, merge the sampled synthetic examples with their corresponding examples from the original training set based on the 'snapshot_id'.
   - Rename columns to distinguish between synthetic and original examples.

4. Optional: Save Samples to CSV:
   - The resulting samples, containing both synthetic and original examples, can be saved to CSV files.
     - Uncomment the relevant lines to save samples for each category to your preferred path.

The goal is to create datasets for few-shot augmentation, where examples from the original training set serve as input demonstrations, and the corresponding augmented examples serve as output demonstrations.

"""


In [10]:
#df_train = pd.read_csv('train.csv', index_col=0) #for few-shot (FS) prompting, we only sample examples from the train set

In [11]:
# Load the saved augmented training set. This line was previously commented out, which left
# `df_aug_train` undefined on a fresh kernel. Adjust the path to your configuration.
df_aug_train = pd.read_csv('augmented_train_quant.csv', index_col=0) #Specify path to augmented dataset
# Split the augmented set by class label. Note that these names re-bind the per-class
# frames created earlier in the notebook.
df_aug_violent_crime = df_aug_train[df_aug_train.label == 'Violent Crime'].copy()
df_aug_drugs_trade = df_aug_train[df_aug_train.label == 'Drugs and Narcotics Trade'].copy()
df_aug_cybercrime = df_aug_train[df_aug_train.label == 'Cybercrime'].copy()
df_aug_weapons_trade = df_aug_train[df_aug_train.label == 'Weapons Trade'].copy()

In [12]:
# Draw 15 synthetic violent crime rows with a fixed seed, then attach the training text that
# shares each row's snapshot_id; the two text columns get '_synthetic' / '_original' suffixes.
sample_violent_crime_train = df_aug_violent_crime.sample(n=15, random_state=123)
sample_violent_crime_all = sample_violent_crime_train[['text', 'snapshot_id', 'label']].merge(
    df_train[['text', 'snapshot_id']],
    how='left',
    on='snapshot_id',
    suffixes=('_synthetic', '_original'),
)

In [13]:
# Draw 15 synthetic cybercrime rows with a fixed seed, then attach the training text that
# shares each row's snapshot_id; the two text columns get '_synthetic' / '_original' suffixes.
sample_cybercrime_train = df_aug_cybercrime.sample(n=15, random_state=123)
sample_cybercrime_all = sample_cybercrime_train[['text', 'snapshot_id', 'label']].merge(
    df_train[['text', 'snapshot_id']],
    how='left',
    on='snapshot_id',
    suffixes=('_synthetic', '_original'),
)

In [14]:
# Draw 15 synthetic weapons trade rows with a fixed seed, then attach the training text that
# shares each row's snapshot_id; the two text columns get '_synthetic' / '_original' suffixes.
sample_weapons_trade_train = df_aug_weapons_trade.sample(n=15, random_state=123)
sample_weapons_trade_all = sample_weapons_trade_train[['text', 'snapshot_id', 'label']].merge(
    df_train[['text', 'snapshot_id']],
    how='left',
    on='snapshot_id',
    suffixes=('_synthetic', '_original'),
)

In [15]:
# Draw 15 synthetic drugs trade rows with a fixed seed, then attach the training text that
# shares each row's snapshot_id; the two text columns get '_synthetic' / '_original' suffixes.
sample_drugs_trade_train = df_aug_drugs_trade.sample(n=15, random_state=123)
sample_drugs_trade_all = sample_drugs_trade_train[['text', 'snapshot_id', 'label']].merge(
    df_train[['text', 'snapshot_id']],
    how='left',
    on='snapshot_id',
    suffixes=('_synthetic', '_original'),
)

In [16]:
#sample_violent_crime_all.to_csv('zs_sample_violent_crime_quant.csv')
#sample_cybercrime_all.to_csv('zs_sample_cybercrime_quant.csv')
#sample_weapons_trade_all.to_csv('zs_sample_weapons_trade_quant.csv')
#sample_drugs_trade_all.to_csv('zs_sample_drugs_trade_quant.csv')