In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support


## Loading Data

__dataset is split into train and test, with 11000 tweets in training set and the rest in test dataset__

In [3]:

# df=pd.read_csv("final_dataset.csv")
# test_data = df[11000:].copy()
# data = df[:11000].copy()
# print(data)

df=pd.read_csv("final_dataset.csv", names=['Tweet Id', 'Tweets', 'User Id', 'Screen Name', 'Class'])
test_data = df[11000:].copy()
data = df[:11000].copy()
# print(data)
print(data["Tweets"].size)

11000


## Data Preprocessing

In [4]:
data.isnull().values.any()

False

In [5]:
def tweet_processing(raw_tweet):
    letters_only=re.sub("[^a-zA-Z]"," ",raw_tweet)
    words=letters_only.lower().split()
    stops=set(stopwords.words("english"))
    m_w=[w for w in words if not w in stops]
    return (" ".join(m_w))

In [6]:
num_tweets=data["Tweets"].size
clean_tweet=[]
for i in range(0,num_tweets):
    clean_tweet.append(tweet_processing(data["Tweets"][i]))
data["Tweets"]=clean_tweet 


num_tweets_test=test_data["Tweets"].size
clean_tweet_test=[]
for i in range(num_tweets,num_tweets+num_tweets_test):
    clean_tweet_test.append(tweet_processing(test_data["Tweets"][i]))
test_data["Tweets"]=clean_tweet_test

## Baseline model : SVM

In [7]:
X_train, X_test_svm, Y_train, Y_test_svm = train_test_split(df.Tweets, df.Class, test_size=0.2)

In [8]:
vectorizer=CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000)

train_data_features=vectorizer.fit_transform(X_train)
train_data_features=train_data_features.toarray()

test_data_features=vectorizer.transform(X_test_svm)
test_data_features=test_data_features.toarray()

In [9]:
#SVM with linear kernel
clf=svm.SVC(kernel='linear',C=1.0)
print ("Training")
clf.fit(train_data_features,Y_train)

print ("Testing")
predicted=clf.predict(test_data_features)
accuracy=np.mean(predicted==Y_test_svm)
print ("Accuracy: ",accuracy)

Training
Testing
Accuracy:  0.8574580759046778


__precision,recall,F1 score for svm model__

In [10]:
from sklearn.metrics import precision_recall_fscore_support
score_svm=precision_recall_fscore_support(Y_test_svm, predicted, average='weighted')
print(score_svm)

(0.8534028329692774, 0.8574580759046778, 0.8544796030819005, None)


## Deep Learning Models

__creating one hot vector for classes(labels).__ 

In [11]:
data['sexism']=0
data['racism']=0
data['none']=0

data['sexism'] = np.where(data['Class'] == 'sexism', 1, 0)
data['racism'] = np.where(data['Class'] == 'racism', 1, 0)
data['none'] = np.where(data['Class'] == 'none', 1, 0)
#data.head()
columns=['sexism','racism','none']
y=data[columns].values
#print(y.shape)


test_data['sexism']=0
test_data['racism']=0
test_data['none']=0

test_data['sexism'] = np.where(test_data['Class'] == 'sexism', 1, 0)
test_data['racism'] = np.where(test_data['Class'] == 'racism', 1, 0)
test_data['none'] = np.where(test_data['Class'] == 'none', 1, 0)
#data.head()
columns=['sexism','racism','none']
y_test=test_data[columns].values

__Tokenizing words__

In [12]:
from nltk.tokenize import word_tokenize
#text1 = "It's true that the chicken was the best bamboozler in the known multiverse."
#tokens = word_tokenize(data['Tweets'])
data['tokenized_sents'] = data.apply(lambda column: word_tokenize(column['Tweets']), axis=1)
test_data['tokenized_sents'] = test_data.apply(lambda column: word_tokenize(column['Tweets']), axis=1)
df['tokenized_sents'] = df.apply(lambda column: word_tokenize(column['Tweets']), axis=1)


__Creating word embeddings__

In [13]:
from gensim.models import Word2Vec
vocab_size = 22194
# define training data
sentences = df['tokenized_sents']
# train model
model = Word2Vec(sentences,size=200,window =4,min_count=1,sg=1)
#print(model)
words = list(model.wv.vocab)
#print(len(words))
model.save('model.bin')

#model = Word2Vec.load('model.bin')
#print(model)


__creating embedding matrix__

In [14]:
embedding_matrix = np.zeros((vocab_size,200))

for i in range(0,len(words)):
    embedding_vector = model[words[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_matrix[vocab_size-1]= np.random.normal(scale=0.6, size=(200, ))
    
#print(embedding_matrix)

  after removing the cwd from sys.path.


__Using Keras to train LSTM model__

In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import LeakyReLU
from keras.models import load_model

__resizing each tweet to size 50__ 

In [16]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tokenized_sents'])
sequences = tokenizer.texts_to_sequences(data['tokenized_sents'])
X_t = pad_sequences(sequences, maxlen=50)

#print(vocab_size)
tokenizer.fit_on_texts(test_data['tokenized_sents'])
test_sequences = tokenizer.texts_to_sequences(test_data['tokenized_sents'])
X_test = pad_sequences(test_sequences, maxlen=50)


__calculating a user's tendency towards racism, sexism, neutrality by taking the ratio of number of tweets marked as a particular label and total number of tweets of that user. This is done for each label(sexism, racism and neutral)__

In [17]:
count1=[]
print(data['User Id'].values)
for i in data['User Id'].unique():
    #print(i)
    count1.append((data['User Id']== i).sum())
#print(count1)
neutral_count=[]
sexism_count=[]
racism_count=[]
for i in data['User Id'].unique():
    neutral_count.append(len(data[(data['User Id']==i) & (data['Class']=='none')]))
    sexism_count.append(len(data[(data['User Id']==i) & (data['Class']=='sexism')]))
    racism_count.append(len(data[(data['User Id']==i) & (data['Class']=='racism')])) 
    
count_test=[]
#print(data['User Id'].values)
for i in test_data['User Id'].unique():
    #print(i)
    count_test.append((test_data['User Id']== i).sum())
#print(count1)
neutral_count_test=[]
sexism_count_test=[]
racism_count_test=[]
for i in test_data['User Id'].unique():
    #neutral_count.append((data['User Id']==i) & (data['Class']== 'none').sum())
    neutral_count_test.append(len(test_data[(test_data['User Id']==i) & (test_data['Class']=='none')]))
    sexism_count_test.append(len(test_data[(test_data['User Id']==i) & (test_data['Class']=='sexism')]))
    racism_count_test.append(len(test_data[(test_data['User Id']==i) & (test_data['Class']=='racism')])) 
    


['User Id' '949380854' '297877558' ... '29424561' '289846547' '29424561']


In [18]:
X_len = len(data['User Id'].unique())

In [19]:
ratio_sexism=[]
ratio_neutral=[]
ratio_racism=[]

for i in range(0,X_len):
    ratio_sexism.append(sexism_count[i]/count1[i])
    ratio_racism.append(racism_count[i]/count1[i])
    ratio_neutral.append(neutral_count[i]/count1[i])
        
#make a column for ratio of eachc class

ratio_sexism_test=[]
ratio_neutral_test=[]
ratio_racism_test=[]

for i in range(0,len(test_data['User Id'].unique())):
    ratio_sexism_test.append(sexism_count_test[i]/count_test[i])
    ratio_racism_test.append(racism_count_test[i]/count_test[i])
    ratio_neutral_test.append(neutral_count_test[i]/count_test[i])


In [20]:
j=0
for i in data['User Id'].unique():
    
    #print(i)
    data.loc[data['User Id'] == i,'tendency_sexual'] = ratio_sexism[j]
    data.loc[data['User Id'] == i,'tendency_racism'] = ratio_racism[j]
    data.loc[data['User Id'] == i,'tendency_neutral'] = ratio_neutral[j]
    j=j+1

k=0
for i in test_data['User Id'].unique():
    
    #print(i)
    test_data.loc[test_data['User Id'] == i,'tendency_sexual'] = ratio_sexism_test[k]
    test_data.loc[test_data['User Id'] == i,'tendency_racism'] = ratio_racism_test[k]
    test_data.loc[test_data['User Id'] == i,'tendency_neutral'] = ratio_neutral_test[k]
    k=k+1

In [21]:
# tendency_sexism=data['tendency_sexual'].as_matrix()
# tendency_racism=data['tendency_racism'].as_matrix()
# tendency_neutral=data['tendency_neutral'].as_matrix()

tendency_sexism=data['tendency_sexual'].to_numpy()
tendency_racism=data['tendency_racism'].to_numpy()
tendency_neutral=data['tendency_neutral'].to_numpy()


tendency_sexism=tendency_sexism.reshape(len(tendency_sexism),1)
tendency_racism=tendency_sexism.reshape(len(tendency_racism),1)
tendency_neutral=tendency_sexism.reshape(len(tendency_neutral),1)
print(tendency_sexism.shape)

# tendency_sexism_test=test_data['tendency_sexual'].as_matrix()
# tendency_racism_test=test_data['tendency_racism'].as_matrix()
# tendency_neutral_test=test_data['tendency_neutral'].as_matrix()

tendency_sexism_test=test_data['tendency_sexual'].to_numpy()
tendency_racism_test=test_data['tendency_racism'].to_numpy()
tendency_neutral_test=test_data['tendency_neutral'].to_numpy()


tendency_sexism_test=tendency_sexism_test.reshape(len(tendency_sexism_test),1)
tendency_racism_test=tendency_sexism_test.reshape(len(tendency_racism_test),1)
tendency_neutral_test=tendency_sexism_test.reshape(len(tendency_neutral_test),1)
#print(tendency_sexism.shape)

(11000, 1)


__concatenating word vectors with tendencies of users calculated above for each label__

In [22]:
#print(X_t)
X1_t=np.concatenate((X_t, tendency_sexism), axis=1)
X1_t=np.concatenate((X1_t, tendency_racism), axis=1)
X1_t=np.concatenate((X1_t, tendency_neutral), axis=1)
print(X1_t.shape)


X1_test=np.concatenate((X_test, tendency_sexism_test), axis=1)
X1_test=np.concatenate((X1_test, tendency_racism_test), axis=1)
X1_test=np.concatenate((X1_test, tendency_neutral_test), axis=1)


(11000, 53)


__Training LSTM Model__

In [23]:
inp = Input(shape=(53,))
x = Embedding(vocab_size,200,weights=[embedding_matrix])(inp)
#print(x.values)
x = (LSTM(50, return_sequences=True, dropout=0.2, recurrent_dropout=0.4))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50)(x)
x=LeakyReLU(alpha=0.02)(x)
x = Dropout(0.2)(x)
x = Dense(3)(x)
x=LeakyReLU(alpha=0.02)(x)
model1 = Model(inputs=inp, outputs=x)
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.fit(X1_t,y, batch_size=32, epochs=5, validation_split=0.1);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
#saving model
model1.save('model_lstm.h5')  

__training a bideirectional LSTM model__

In [25]:
inp_bi = Input(shape=(53,))
x_bi = Embedding(vocab_size,200,weights=[embedding_matrix])(inp_bi)
#print(x.values)
x_bi = Bidirectional(LSTM(50, return_sequences=True, dropout=0.2, recurrent_dropout=0.4))(x_bi)
x_bi = GlobalMaxPool1D()(x_bi)
x_bi = Dense(50)(x_bi)
x_bi=LeakyReLU(alpha=0.02)(x_bi)
x_bi = Dropout(0.2)(x_bi)
x_bi = Dense(3)(x_bi)
x_bi=LeakyReLU(alpha=0.02)(x_bi)
model_bi = Model(inputs=inp_bi, outputs=x_bi)
model_bi.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bi.fit(X1_t,y, batch_size=32, epochs=5, validation_split=0.1);
model_bi.save('model_bi.h5') 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


__Calculating  precision,recall,F1score, of LSTM and bidirectional model__

In [26]:
predicted_lstm = model1.predict(X1_test)
predicted_bi = model_bi.predict(X1_test)

In [27]:
dataframe_lstm = pd.DataFrame.from_records(predicted_lstm)
dataframe_bi=pd.DataFrame.from_records(predicted_bi)

__predicted class is stored in a column of dataframe__

__bidirectional lstm results__

__get label function creates appropriate labels according to  predictions__

In [28]:

def get_label(df):
    if ((df[0] >df[1]) & (df[0] > df[2])):
        return 'Sexism'
    elif ((df[1] >df[0]) & (df[1] > df[2])):
        return 'Racism'
    else:
        return 'none'


dataframe_lstm['predClass'] = dataframe_lstm.apply(lambda row: get_label(row), axis=1)
dataframe_bi['predClass'] = dataframe_bi.apply(lambda row: get_label(row), axis=1)



In [29]:
print(dataframe_bi['predClass'])

0        none
1        none
2        none
3        none
4      Sexism
        ...  
321      none
322      none
323      none
324    Sexism
325    Sexism
Name: predClass, Length: 326, dtype: object


In [30]:
print(dataframe_lstm['predClass'])

0        none
1        none
2        none
3        none
4      Sexism
        ...  
321      none
322      none
323      none
324    Sexism
325      none
Name: predClass, Length: 326, dtype: object


__LSTM score__

In [31]:
score_lstm=precision_recall_fscore_support(test_data['Class'],dataframe_lstm['predClass'],average='weighted')
print(score_lstm)

(0.8813461367475771, 0.7975460122699386, 0.8373546772541526, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


__Bidirectional LSTM score__

In [31]:
score_bi=precision_recall_fscore_support(test_data['Class'],dataframe_bi['predClass'],average='weighted')
print(score_bi)

(0.0, 0.0, 0.0, None)


__precision recall and f1 scores for svm and deep learning models__:


In [32]:
summary = [[score_svm[0],score_svm[1],score_svm[2]], [score_lstm[0],score_lstm[1],score_lstm[2] ],[score_bi[0],score_bi[1],score_bi[2]]]
score=pd.DataFrame(summary, columns=["Precision", "Recall","F1-score"])
score.rename(index={0:'SVM',1:'LSTM',2:'Bidirectional LSTM'}, inplace=True)
print(score)

NameError: name 'score_bi' is not defined