In [2]:
# Install missing packages
%pip install scikit-learn

# Import libraries
import os

import pandas as pd
from sklearn.utils import resample

# Load dataset
# The CSV location can be overridden with the TWEET_HATE_CSV environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original path is used unchanged.
file_path = os.environ.get(
    'TWEET_HATE_CSV',
    r'C:\Users\Nino\Documents\hateDetection-ETH\TweethateETHMU.csv',
)

# latin1 maps every byte to a character, so decoding never fails even if the
# file is not really latin1 (mis-decoded characters are possible).
# NOTE(review): on_bad_lines='skip' silently drops rows with too many fields;
# switch to 'warn' to see how many rows are being discarded.
data_raw = pd.read_csv(file_path, encoding='latin1', on_bad_lines='skip')

# Clean column names
# Remove the stray double-quote / semicolon characters from the header so the
# columns can be selected by plain names below. This is applied to data_raw
# itself because the column selection relies on the cleaned names.
data_raw.columns = data_raw.columns.str.replace(r'[";]', '', regex=True)

# The only label values kept in the cleaned dataset.
VALID_LABELS = ['hate', 'no-hate']

# Select relevant columns, extract labels, remove invalid rows and duplicates.
# Built as a single chain (no inplace=True) so re-running the cell always
# derives data_cleaned from data_raw in the same way.
data_cleaned = (
    data_raw[['full_text', 'usernamelabel']]
    .rename(columns={'full_text': 'text', 'usernamelabel': 'label'})
    # Extract labels: the field may hold several ';'-separated parts; the
    # label is the last one. rsplit(...)[-1] returns the whole string when
    # there is no ';', so no separate check is needed. Surrounding whitespace
    # and double quotes are stripped so values such as ' hate' or 'hate"'
    # are not rejected as invalid.
    .assign(
        label=lambda df: (
            df['label']
            .fillna('')
            .astype(str)
            .str.rsplit(';', n=1)
            .str[-1]
            .str.strip(' \t\r\n"')
        )
    )
    # A row without text cannot be used for text classification.
    .dropna(subset=['text'])
    # Remove invalid labels BEFORE de-duplicating: drop_duplicates keeps the
    # first occurrence of each text, so de-duplicating first could keep a copy
    # with an invalid label and then lose the tweet entirely.
    .loc[lambda df: df['label'].isin(VALID_LABELS)]
    .drop_duplicates(subset='text')
)

# Check initial class distribution
initial_distribution = data_cleaned['label'].value_counts()
print("Initial Distribution:")
print(initial_distribution)

# Oversampling configuration: number of rows to draw per class and the seed
# shared by every random step so the output file is reproducible.
# NOTE(review): 700/300 keeps a roughly 70/30 class ratio, so the result is a
# larger fixed-size dataset rather than a 50/50 balanced one -- confirm this
# is the intended target.
RANDOM_STATE = 42
TARGET_SAMPLES = {'hate': 300, 'no-hate': 700}

# Separate classes
no_hate = data_cleaned[data_cleaned['label'] == 'no-hate']
hate = data_cleaned[data_cleaned['label'] == 'hate']

# Fail fast with a clear message: resampling an empty class would otherwise
# raise an opaque error from inside the random number generator.
for class_name, class_rows in (('hate', hate), ('no-hate', no_hate)):
    if class_rows.empty:
        raise ValueError(
            f"No rows labelled '{class_name}' after cleaning; cannot oversample."
        )

# Oversample classes
# Sampling with replacement duplicates existing rows.
# NOTE(review): if this file is later split into train/test sets, the same
# tweet can land in both; consider splitting before oversampling.
hate_upsampled = resample(
    hate,
    replace=True,
    n_samples=TARGET_SAMPLES['hate'],
    random_state=RANDOM_STATE,
)
no_hate_upsampled = resample(
    no_hate,
    replace=True,
    n_samples=TARGET_SAMPLES['no-hate'],
    random_state=RANDOM_STATE,
)

# Combine oversampled data
# sample(frac=1) shuffles all rows so the two classes are interleaved, and
# reset_index discards the duplicated original index values.
data_balanced = (
    pd.concat([no_hate_upsampled, hate_upsampled])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Check balanced distribution
balanced_distribution = data_balanced['label'].value_counts()
print("Balanced Distribution:")
print(balanced_distribution)

# Save the oversampled dataset
# Written relative to the kernel's working directory, as UTF-8 and without
# the DataFrame index column.
output_filename = 'Oversampled_Tweet_Dataset.csv'
data_balanced.to_csv(output_filename, encoding='utf-8', index=False)
print(f"Dataset saved as '{output_filename}'")


Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.0-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.2/11.1 MB 3.1 MB/s eta 0:00:04
   -- ------------------------------------- 0.8/11.1 MB 8.0 MB/s eta 0:00:02
   ----- ---------------------------------- 1.5/11.1 MB 10.5 MB/s eta 0:00:01
   -------- ------------------------------- 2.3/11.1 MB 12.0 MB/s eta 0:00:01
   ---------- ----------------------------- 2.9/11.1 MB 12.3 MB/s eta 0:00:01
   ------------ --------------------------- 3.6/11.1 MB 12.8 MB/s eta 0:00:01
   --------------- ------------------------ 4.4/11.1 MB 13.3 MB/s eta 0:00:01
   ------------------ --------------------- 5.2/11.1 MB 13.9 MB/s eta 0:00:01
   ------------------


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Nino\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Initial Distribution:
label
no-hate    257
hate        98
Name: count, dtype: int64
Balanced Distribution:
label
no-hate    700
hate       300
Name: count, dtype: int64
Dataset saved as 'Oversampled_Tweet_Dataset.csv'
