In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import os
import keras
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import LSTM, SpatialDropout1D, Activation, GlobalAveragePooling1D, Dense, Input, Dropout, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.metrics import ConfusionMatrixDisplay


%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [29]:
# Load dataset
def load_data():
  data=pd.read_csv('https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv')
  data=data.drop('Unnamed: 0', axis=1)
  data=data.drop(labels=['hate_speech', 'offensive_language', 'neither', 'count'], axis=1)
  return data


In [30]:
label={0:'hate_speech', 1:'offensive_language', 2:'neither'}

 Cleaning tweets

In [31]:
# Data cleaning
def clean_text(data):
  for i in range(len(data)):
    txt = data.loc[i]["tweet"]
    txt=re.sub(r'@[A-Z0-9a-z_:]+','',txt) #replace username-tags
    txt=re.sub(r'^[RT]+','',txt) #replace RT-tags
    txt = re.sub('https?://[A-Za-z0-9./]+','',txt) #replace URLs
    txt=re.sub("[^a-zA-Z]", " ",txt) #replace hashtags
    data.at[i,"tweet"]=txt
  return data

In [32]:
# Splitting data into train-test with 15 as testset
def split_data(data):
  X=data['tweet']
  Y=data['class']
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=20)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=20)
  return X_train, y_train, X_test, y_test, X_val, y_val


In [33]:
data = load_data()


Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


In [38]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [40]:
data

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


In [41]:
data = clean_text(data)


In [42]:
data[0:10]

Unnamed: 0,class,tweet
0,2,RT As a woman you shouldn t complain abou...
1,1,RT boy dats cold tyga dwn bad for cuf...
2,1,RT Dawg RT You ever fuck a bitch...
3,1,RT she look like a tranny
4,1,RT The shit you hear about me m...
5,1,The shit just blows me cl...
6,1,I can not just sit up and HATE on anot...
7,1,cause I m tired of you big bitches...
8,1,amp you might not get ya bitch back amp ...
9,1,hobbies include fighting Mariam bitch


In [None]:
def clean_text(data):
  for i in range(len(data)):
    txt = data.loc[i]["tweet"]
    txt=re.sub(r'@[A-Z0-9a-z_:]+','',txt) #replace username-tags
    txt=re.sub(r'^[RT]+','',txt) #replace RT-tags
    txt = re.sub('https?://[A-Za-z0-9./]+','',txt) #replace URLs
    txt=re.sub("[^a-zA-Z]", " ",txt) #replace hashtags
    data.at[i,"tweet"]=txt
  return data

In [None]:
X_train, y_train, X_test, y_test, X_val, y_val = split_data(data)

In [45]:
print(X_train)
print(y_train)

17993      I hate when faggots talk shit on here then b...
15344                                          Dumb bitch 
2605                   lets record a song bitch right nowb
16792       I m a joke having hoes ain t shit  I m just...
17331      It s not as easy as you d think to find an a...
                               ...                        
10795    I need to pick between two wigs  The agony    ...
14591                     I feel like Tim Ross is a nigger
14728      Horrible rapper    Tyga   Most trash rapper ...
18597      Contrary to belief  us queers don       t go...
24099    it means that im following orders and being su...
Name: tweet, Length: 16852, dtype: object
17993    0
15344    1
2605     1
16792    1
17331    2
        ..
10795    2
14591    0
14728    0
18597    1
24099    0
Name: class, Length: 16852, dtype: int64


### Making train csv file

In [64]:
df_train = y_train.to_frame().join(X_train)

In [65]:
print(df_train)


       class                                              tweet
17993      0    I hate when faggots talk shit on here then b...
15344      1                                        Dumb bitch 
2605       1                lets record a song bitch right nowb
16792      1     I m a joke having hoes ain t shit  I m just...
17331      2    It s not as easy as you d think to find an a...
...      ...                                                ...
10795      2  I need to pick between two wigs  The agony    ...
14591      0                   I feel like Tim Ross is a nigger
14728      0    Horrible rapper    Tyga   Most trash rapper ...
18597      1    Contrary to belief  us queers don       t go...
24099      0  it means that im following orders and being su...

[16852 rows x 2 columns]


In [66]:
df_tr=df_train.reset_index(drop=True)

In [67]:
print(df_tr)

       class                                              tweet
0          0    I hate when faggots talk shit on here then b...
1          1                                        Dumb bitch 
2          1                lets record a song bitch right nowb
3          1     I m a joke having hoes ain t shit  I m just...
4          2    It s not as easy as you d think to find an a...
...      ...                                                ...
16847      2  I need to pick between two wigs  The agony    ...
16848      0                   I feel like Tim Ross is a nigger
16849      0    Horrible rapper    Tyga   Most trash rapper ...
16850      1    Contrary to belief  us queers don       t go...
16851      0  it means that im following orders and being su...

[16852 rows x 2 columns]


In [68]:
frequency_df = df_tr.groupby('class').size()
print(frequency_df)

class
0      973
1    13020
2     2859
dtype: int64


In [78]:
df_tr.to_csv('train_data_hs.csv')

### make test csv file

In [75]:
df_test = y_test.to_frame().join(X_test)
df_te=df_test.reset_index(drop=True)

In [76]:
print(df_te)

      class                                              tweet
0         1                            more than you run pussy
1         1    Them shits was dry as a bitch  Had to ask fo...
2         1    Some bitches be praying for the day you leav...
3         1    Thats when you say  this bitch aint got no j...
4         0  You a lame ass nigga still cuffin these hoes m...
...     ...                                                ...
3713      1  Ugly bitches know to get out the crib before t...
3714      1                         Dat colored guy be braisin
3715      1                                Scarlett Johansson 
3716      1   Just straight ignorant bitches  WhereTheyHeadsAt
3717      1    Niggas madd gay RT  teachers giving niggas p...

[3718 rows x 2 columns]


In [77]:
frequency_dft = df_te.groupby('class').size()
print(frequency_dft)

class
0     199
1    2894
2     625
dtype: int64


In [79]:
df_te.to_csv('test_data_hs.csv')

### make validation csv file

In [80]:
df_val = y_val.to_frame().join(X_val)
df_va=df_val.reset_index(drop=True)

In [81]:
print(df_va)

      class                                              tweet
0         1      yea u know these lil bitches be stalking a...
1         2    Fit lads  Nice gear    these scally lads n t...
2         1                      I got bad bitches on stand by
3         1  With no babymama drama   side hoes all that   ...
4         1      I think my wish came true  where y all hat...
...     ...                                                ...
4208      1                        sockin bitches in they mouf
4209      1                                     prairie nigger
4210      1     He s been drinking too much of that shitty ...
4211      1  Having a bitch ain t goofy  Having bitch dat a...
4212      1        Picture time outside of metro  fucking fags

[4213 rows x 2 columns]


In [83]:
frequency_dfv = df_va.groupby('class').size()
print(frequency_dfv)

class
0     258
1    3276
2     679
dtype: int64


In [84]:
df_va.to_csv('val_data_hs.csv')

### checking if indexes in test, train or val overlap

In [91]:
ixs = y_train.index.intersection(y_test.index)
y_train.loc[ixs]

Series([], Name: class, dtype: int64)