# Balancing data

## Import

In [1]:
import pandas as pd
import numpy as np

## Load and Clean Data

In [2]:
# Load the complete dataset from its CSV file into a DataFrame
all_data_csv_path = 'data/all_data.csv'
data = pd.read_csv(all_data_csv_path)

In [3]:
# Remove every column listed below from `data`; all other columns are kept.
#
# The result is rebound to `data` instead of mutating the frame in place, and
# errors='ignore' skips labels that are no longer present, so re-running this
# cell on an already-trimmed frame is a no-op instead of raising KeyError.
# NOTE: errors='ignore' also means a misspelled label is silently skipped, so
# keep this list in sync with the CSV header.
columns_to_drop = [
    'id', 'split', 'created_date', 'publication_id',
    'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
    'disagree', 'severe_toxicity', 'obscene', 'sexual_explicit',
    'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
    'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
    'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
    'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
    'latino', 'other_race_or_ethnicity', 'physical_disability',
    'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
    'other_disability', 'identity_annotator_count',
    'toxicity_annotator_count',
]
data = data.drop(columns=columns_to_drop, errors='ignore')

In [4]:
# Keep only the rows whose comment_text is present (not null)
has_comment_text = data['comment_text'].notna()
df_cleaned = data[has_comment_text]

# Build the training frame as a copy of the cleaned rows plus a binary label:
# toxic = 1 when toxicity >= 0.5, otherwise toxic = 0
df_train = df_cleaned.assign(
    toxic=lambda frame: np.where(frame['toxicity'] >= 0.50, 1, 0)
)

## Showing Unbalanced Data

In [5]:
# Class balance of the `toxic` label before any resampling
toxic_labels = df_train['toxic']
counts = toxic_labels.value_counts()
percentages = toxic_labels.value_counts(normalize=True) * 100

# One row per class: absolute count next to its share rendered as "xx.xx%"
result_df = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.map('{:.2f}%'.format),
})
print(result_df)

         Count Percentage
toxic                    
0      1839730     92.01%
1       159782      7.99%


## Undersampling

In [6]:
# Every class-1 row is kept and is meant to make up 40% of the result, so the
# number of class-0 rows to draw is class_1_count / 0.4 * 0.6 (truncated to int).
class_1_count = df_train['toxic'].value_counts().loc[1]
undersample_size_0 = int(class_1_count / 0.4 * 0.6)

# Split the frame by class
class_0_mask = df_train['toxic'] == 0
class_1_mask = df_train['toxic'] == 1

# Draw the class-0 subset with a fixed seed; take class 1 in full
class_0_sample = df_train[class_0_mask].sample(n=undersample_size_0, random_state=42)
class_1_rows = df_train[class_1_mask]

# Stack both parts under a fresh 0..n-1 index
combined = pd.concat([class_0_sample, class_1_rows], ignore_index=True)

# Shuffle all rows (frac=1 returns every row in random order) with a fixed seed
undersampled_data = combined.sample(frac=1, random_state=42)


In [7]:
# Class balance of the `toxic` label in the undersampled frame
undersampled_labels = undersampled_data['toxic']
counts = undersampled_labels.value_counts()
percentages = undersampled_labels.value_counts(normalize=True) * 100

# One row per class: absolute count next to its share rendered as "xx.xx%"
result_df = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.map('{:.2f}%'.format),
})
print(result_df)

        Count Percentage
toxic                   
0      239673     60.00%
1      159782     40.00%


In [8]:
# Write the undersampled frame to a CSV file, omitting the row index
undersampled_csv_path = 'data/undersampled_data_60_40.csv'
undersampled_data.to_csv(undersampled_csv_path, index=False)