In [1]:
import pandas as pd

dataset_dir = "data/train_test_val"

DATASET = {
    'train': pd.read_csv(dataset_dir + '/train.csv').reset_index(drop=True), 
    'test': pd.read_csv(dataset_dir + '/test.csv').reset_index(drop=True), 
    'val': pd.read_csv(dataset_dir + '/val.csv').reset_index(drop=True), 
}

In [2]:
LABELS = [label for label in DATASET['train'].keys() if label not in ['ID', 'Text']]
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}
LABELS

['Murder',
 'Homicide',
 'Robbery',
 'Physical Injuries',
 'Rape',
 'Theft',
 'Carnapping',
 'Others']

In [1]:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet
from parrot import Parrot
import torch

nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize Parrot paraphraser
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=torch.cuda.is_available())


[nltk_data] Downloading package wordnet to /home/syke/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/syke/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
def paraphrase_sentence(text):
    paraphrases = parrot.augment(input_phrase=text)
    if paraphrases:
        return paraphrases[0]  # Return the first paraphrase
    return text


In [3]:
def augment_dataset(dataset, num_samples_needed, augment_func):
    augmented_data = []
    current_data_size = len(dataset)
    
    while len(augmented_data) < num_samples_needed:
        for _, row in dataset.iterrows():
            text = row['Text']
            
            # Apply the augmentation function (synonym replacement or paraphrasing)
            augmented_text = augment_func(text)
            
            # Create a new row with the augmented text and the same label(s)
            new_row = row.copy()
            new_row['Text'] = augmented_text
            augmented_data.append(new_row)
            
            # Stop once we've generated enough samples
            if len(augmented_data) >= num_samples_needed:
                break

    return pd.DataFrame(augmented_data)


In [4]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

def synonym_replacement(text, n=2):
    words = text.split()
    new_text = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_text = [synonym if word == random_word else word for word in new_text]
            num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_text)


In [5]:
# Dataset paths
dataset_dir = "data/train_test_val"

# Load the datasets
DATASET = {
    'train': pd.read_csv(dataset_dir + '/train.csv').reset_index(drop=True), 
    'test': pd.read_csv(dataset_dir + '/test.csv').reset_index(drop=True), 
    'val': pd.read_csv(dataset_dir + '/val.csv').reset_index(drop=True), 
}

# Set the number of new samples needed
NUM_TRAIN = 600
NUM_TEST = 300
NUM_VAL = 100



In [6]:
# Augment the datasets
print("Augmenting training data...")
augmented_train = augment_dataset(DATASET['train'], NUM_TRAIN, synonym_replacement)
augmented_train_full = pd.concat([DATASET['train'], augmented_train], ignore_index=True)
augmented_train_full.to_csv(dataset_dir + '/augmented_train.csv', index=False)




Augmenting training data...


In [7]:
print("Augmenting test data...")
augmented_test = augment_dataset(DATASET['test'], NUM_TEST, paraphrase_sentence)
augmented_test_full = pd.concat([DATASET['test'], augmented_test], ignore_index=True)
augmented_test_full.to_csv(dataset_dir + '/augmented_test.csv', index=False)



Augmenting test data...




In [8]:

print("Augmenting validation data...")
augmented_val = augment_dataset(DATASET['val'], NUM_VAL, paraphrase_sentence)
augmented_val_full = pd.concat([DATASET['val'], augmented_val], ignore_index=True)
augmented_val_full.to_csv(dataset_dir + '/augmented_val.csv', index=False)

# Combine original and augmented datasets

# Save the augmented datasets

print("Augmentation complete and saved!")

Augmenting validation data...
Augmentation complete and saved!


In [6]:
import pandas as pd
import ast  # To safely evaluate the string representation of a tuple

# Path to your augmented CSV files
augmented_test_file = "data/train_test_val/augmented_test.csv"
augmented_val_file = "data/train_test_val/augmented_val.csv"

# Load the CSV files
# augmented_test = pd.read_csv(augmented_test_file)
augmented_val = pd.read_csv(augmented_val_file)

# Function to extract only the text from the tuple
def extract_text(text_with_score):
    # Safely evaluate the tuple string to convert it into a Python tuple
    try:
        text_tuple = ast.literal_eval(text_with_score)
        return text_tuple[0]  # Return only the text part
    except (ValueError, SyntaxError):
        # In case the value is not a tuple, just return it as is
        return text_with_score

# Apply the function to the 'Text' column (or the column containing the paraphrased text)
augmented_test['Text'] = augmented_test['Text'].apply(extract_text)
augmented_val['Text'] = augmented_val['Text'].apply(extract_text)

# Save the cleaned CSV files back
# augmented_test.to_csv("data/train_test_val/cleaned_augmented_test.csv", index=False)
augmented_val.to_csv("data/train_test_val/cleaned_augmented_val.csv", index=False)

print("Cleaned and saved the augmented test and validation datasets!")


Cleaned and saved the augmented test and validation datasets!


In [3]:
label_counts = {label: DATASET['train'][label].sum() for label in LABELS}

for label, count in label_counts.items():
    print(f"{label}: {count}")


Murder: 344
Homicide: 356
Robbery: 411
Physical Injuries: 351
Rape: 385
Theft: 316
Carnapping: 347
Others: 300


### Test label count

In [4]:
label_counts = {label: DATASET['test'][label].sum() for label in LABELS}

for label, count in label_counts.items():
    print(f"{label}: {count}")


Murder: 179
Homicide: 179
Robbery: 209
Physical Injuries: 181
Rape: 200
Theft: 158
Carnapping: 148
Others: 153


In [5]:
train_ids = set(DATASET['train']['Text'])
val_ids = set(DATASET['test']['Text'])
test_ids = set(DATASET['val']['Text'])

train_val_overlap = train_ids.intersection(val_ids)
train_test_overlap = train_ids.intersection(test_ids)
val_test_overlap = val_ids.intersection(test_ids)

print(f"Number of overlapping texts between train and val: {len(train_val_overlap)}")
print(f"Number of overlapping texts between train and test: {len(train_test_overlap)}")
print(f"Number of overlapping texts between val and test: {len(val_test_overlap)}")


print(len(DATASET['train']['Text']))
print(len(DATASET['test']['Text']))
print(len(DATASET['val']['Text']))


Number of overlapping texts between train and val: 134
Number of overlapping texts between train and test: 55
Number of overlapping texts between val and test: 28
2400
1200
400


# Word Frequency

In [6]:
import pandas as pd
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

STOP_WORDS = set(stopwords.words('english'))

def most_common_words(label, dataframe, n=30):
    """
    Given a label and a dataframe, returns the n most common words excluding stop words.

    Parameters:
    - label (str): The label to filter by (e.g., 'Murder', 'Robbery').
    - dataframe (pd.DataFrame): The dataframe containing the text data.
    - n (int): The number of most common words to return.

    Returns:
    - List of tuples with the most common words and their frequencies.
    """
    # Filter the dataframe by the given label where the label is 1
    label_data = dataframe[dataframe[label] == 1]

    # Combine all text entries into one large string
    all_text = ' '.join(label_data['Text'].tolist())

    # Convert text to lowercase
    all_text = all_text.lower()

    # Remove punctuation
    all_text = all_text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text into words
    words = word_tokenize(all_text)

    # Filter out stop words and words with length less than 2
    filtered_words = [word for word in words if word not in STOP_WORDS and len(word) > 1]

    # Count word frequencies
    word_freq = Counter(filtered_words)

    # Return the n most common words
    return word_freq.most_common(n)

for label in LABELS:
    print(f"\nMost common words for label '{label}':")
    common_words = most_common_words(label, DATASET['train'], n=20)
    
    
    for idx, (word, frequency) in enumerate(common_words, start=1):
        print(f" {idx}. {word}: {frequency}")


[nltk_data] Downloading package stopwords to /home/syke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/syke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Most common words for label 'Murder':
 1. saw: 154
 2. man: 140
 3. murder: 129
 4. house: 110
 5. shot: 110
 6. like: 106
 7. act: 90
 8. heard: 86
 9. one: 81
 10. calculated: 79
 11. planned: 77
 12. started: 74
 13. way: 74
 14. coldblooded: 73
 15. left: 72
 16. deliberate: 61
 17. guy: 60
 18. clear: 59
 19. found: 58
 20. made: 58

Most common words for label 'Homicide':
 1. man: 269
 2. saw: 172
 3. heard: 135
 4. head: 127
 5. one: 125
 6. hit: 116
 7. started: 109
 8. victim: 103
 9. fell: 83
 10. woman: 81
 11. trying: 75
 12. unintentional: 72
 13. tried: 71
 14. pulled: 70
 15. loud: 68
 16. like: 66
 17. grabbed: 65
 18. knife: 64
 19. floor: 64
 20. ran: 64

Most common words for label 'Robbery':
 1. man: 313
 2. using: 127
 3. leaving: 123
 4. saw: 123
 5. money: 105
 6. car: 103
 7. terrifying: 101
 8. wallet: 97
 9. valuables: 91
 10. heard: 90
 11. hand: 88
 12. tactics: 81
 13. left: 77
 14. house: 76
 15. gun: 74
 16. employing: 73
 17. one: 72
 18. menacing: 71
 

# Augmentation

In [7]:
import pandas as pd

# Paths to your original and cleaned augmented CSV files
dataset_dir = "data/train_test_val/"
original_train_file = dataset_dir + "train.csv"
original_test_file = dataset_dir + "test.csv"
original_val_file = dataset_dir + "val.csv"

cleaned_augmented_train_file = dataset_dir + "cleaned_augmented_train.csv"
cleaned_augmented_test_file = dataset_dir + "cleaned_augmented_test.csv"
cleaned_augmented_val_file = dataset_dir + "cleaned_augmented_val.csv"

# Load the original and cleaned augmented CSV files
original_train = pd.read_csv(original_train_file)
original_test = pd.read_csv(original_test_file)
original_val = pd.read_csv(original_val_file)

cleaned_augmented_train = pd.read_csv(cleaned_augmented_train_file)
cleaned_augmented_test = pd.read_csv(cleaned_augmented_test_file)
cleaned_augmented_val = pd.read_csv(cleaned_augmented_val_file)

# Concatenate the original and augmented data
new_train = pd.concat([original_train, cleaned_augmented_train], ignore_index=True)
new_test = pd.concat([original_test, cleaned_augmented_test], ignore_index=True)
new_val = pd.concat([original_val, cleaned_augmented_val], ignore_index=True)

# Save the merged datasets to new CSV files
# new_train.to_csv(dataset_dir + "new_train.csv", index=False)
# new_test.to_csv(dataset_dir + "new_test.csv", index=False)
new_val.to_csv(dataset_dir + "new_val.csv", index=False)

print("Merged datasets saved as new_train.csv, new_test.csv, and new_val.csv!")


Merged datasets saved as new_train.csv, new_test.csv, and new_val.csv!


In [8]:
import pandas as pd
import re

def preprocess(text):
    # Convert to lowercase
    text = re.sub(r'[A-Z]', lambda y: y.group(0).lower(), text)
    
    # Remove unimportant links
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove emojis (non-ASCII characters)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove usernames (mentions)
    text = re.sub(r'@\w+', '', text)

    # Remove punctuations and replace with space
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove hashtags (keeping the text but removing the # symbol)
    text = re.sub(r'#', '', text)
    
    return text

dataset_dir = "data/train_test_val/"
new_train_file = dataset_dir + "new_train.csv"
new_test_file = dataset_dir + "new_test.csv"
new_val_file = dataset_dir + "new_val.csv"

new_train = pd.read_csv(new_train_file)
new_test = pd.read_csv(new_test_file)
new_val = pd.read_csv(new_val_file)

new_train['Text'] = new_train['Text'].apply(preprocess)
new_test['Text'] = new_test['Text'].apply(preprocess)
new_val['Text'] = new_val['Text'].apply(preprocess)

# new_train.to_csv(dataset_dir + "preprocessed_train.csv", index=False)
# new_test.to_csv(dataset_dir + "preprocessed_test.csv", index=False)
new_val.to_csv(dataset_dir + "preprocessed_val.csv", index=False)

print("Preprocessing complete and saved as preprocessed_train.csv, preprocessed_test.csv, and preprocessed_val.csv!")


Preprocessing complete and saved as preprocessed_train.csv, preprocessed_test.csv, and preprocessed_val.csv!
