In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import nltk, spacy
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS as SW
from wordcloud import STOPWORDS
from collections import Counter
import enum

pd.options.display.float_format = '{:,.0f}'.format

In [3]:
in_file = Path.cwd().parents[0] / 'Processed_datasets' / 'tweets_long_no_duplicates.csv'
my_stopwords = stopwords.words('english')
stopwords = set(STOPWORDS).union(my_stopwords) #preparing stopwards list
custom_stopwords = ['hi', '\n', '\n\n', '&amp;', ' ', '.', '-',
                    'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
nlp = spacy.load('en_core_web_sm', exclude=['ner', 'tok2vec', 'tagger', 'paerser', 'senter', 'lemmatizer', 'attribute_ruler']) # using only for stopwords
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


In [4]:
df_long = pd.read_csv(in_file, engine='python', usecols=['Tokens', 'Label']) #not using unnecessary columns

In [5]:
print(df_long.head(15))
df_long.count()
print(df_long['Label'].value_counts())

                                               Tokens  Label
0   muslim mob violence against hindus in banglade...      1
1   islamophobia is like the idea of naziphobia is...      1
2   finally all caught up and that sudden death co...      0
3   please please start using is your discernment ...      0
4   as soon as isis chased all the minorities out ...      0
5   islam invaded and conquered of christiandom be...      1
6   do you approve of your pedophile prophet rapin...      1
7   problem with vile muslims is that they try to ...      1
8            tend to talk about it much personal info      0
9   cool next time when a woman talks to him about...      0
10  our judges are about to turn the heat up in th...      0
11  lol this you walk by putting one foot in front...      0
12  said wanted sorbet now and they should tell us...      0
13  this fucking potato is blowing my mind duck fa...      0
14  omg this churner feels like razor blades on my...      0
0    15214
1     7500
Na

In [6]:
#setting up baseline pipeline
from nltk.tokenize import TweetTokenizer #I chose to tokenize with this, as it gets rid of @ handlers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler


#tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
#vect = TfidfVectorizer(max_df=.9, min_df=25,
#                       strip_accents='unicode', tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
#sampler_train = RandomUnderSampler(random_state=42) #undersampling to rebalance dataset
#sampler_dev = RandomUnderSampler(random_state=50) #different samplers for each dataset
#xs, ys = vect.fit_transform(df_long['Tokens'].values), df_long['Label'].values
#x_train, x_dev, y_train, y_dev = train_test_split(
#    xs, ys, train_size=.85, random_state=42, stratify=ys)
#x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
#x_dev, y_dev = sampler_dev.fit_resample(x_train, y_train)
#print(Counter(y_train), Counter(y_dev)) #checking how many labels of each class are in each set 


In [39]:
def make_dataset(d_frame, stopwords=True, tfidf=True, train=True): #function for creating dataset, paramaters for flexibility
    """Helper function for construction the classes later on - accepts a dataframe 
    as an input and 3 boolean and ouputs a tuple of either the test or dev set
    The code block essentially repeates itself with minor argument tweaks"""
    df = d_frame
    if stopwords and tfidf and train:                                                       #all included
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42)                                  #undersampling  object to rebalance dataset
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif not stopwords and tfidf and train: #no stopwords
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train    
    elif stopwords and not tfidf and train: #no tfidf
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,
                       strip_accents='unicode', tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif not stopwords and not tfidf and train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,
                       strip_accents='unicode', tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif stopwords and tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25,
                       strip_accents='unicode', tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_train, y_train)
        return x_dev, y_dev
    elif not stopwords and tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_train, y_train)
        return x_dev, y_dev
    elif stopwords and not tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stopwords=ALL_STOP_WORDS)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_train, y_train)
        return x_dev, y_dev
    else:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df_long['Tokens'].values), df_long['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_train, y_train)
        return x_dev, y_dev    


        


        
        
    

In [40]:
#all_param_train = Dataset(df_long)
#no_param_train = Dataset(df_long, stopwords=False, tfidf=False, train=True)
#train_set = all_param_train.make_dataset()
#print(train_set[0:10])

#print(type(no_param_train.make_dataset()))

all_param_train = make_dataset(df_long)
all_param_dev = make_dataset(df_long, train=False)
no_param_train = make_dataset(df_long, stopwords=False, tfidf=False)
no_param_dev = make_dataset(df_long, stopwords=False, tfidf=False, train=False)
print(type(all_param_train), all_param_train)
print(type(all_param_dev[0]))
print(type(no_param_dev[0]))

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


<class 'tuple'> (<12750x1454 sparse matrix of type '<class 'numpy.float64'>'
	with 63969 stored elements in Compressed Sparse Row format>, array([0, 0, 0, ..., 1, 1, 1], dtype=int64))
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>


In [35]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

        

In [42]:
class ClassifierBot():
    
    def __init__(self, trainset, devset, bot):
        self.train = trainset
        self.dev = devset
        self.bot = bot
        
    def classify(self):
        self.bot.fit(self.train[0], self.train[1])
        #self.bot.predict(['islam invaded and conquered of christiandom before any christian crusades in response the writer is a liar'])
        prediction = self.bot.predict(self.dev[0])
        return classification_report(self.dev[1], prediction)
        


In [43]:
sgd_clf = SGDClassifier()
bot1 = ClassifierBot(all_param_train, all_param_dev, sgd_clf)
bot_no_params = ClassifierBot(no_param_train, no_param_dev, sgd_clf)
print(bot1.classify())
print(bot_no_params.classify())
print(sgd_clf)
print(bot1)

              precision    recall  f1-score   support

           0       0.75      0.84      0.79      6375
           1       0.82      0.72      0.77      6375

    accuracy                           0.78     12750
   macro avg       0.79      0.78      0.78     12750
weighted avg       0.79      0.78      0.78     12750

              precision    recall  f1-score   support

           0       0.77      0.85      0.81      6375
           1       0.83      0.75      0.79      6375

    accuracy                           0.80     12750
   macro avg       0.80      0.80      0.80     12750
weighted avg       0.80      0.80      0.80     12750

SGDClassifier()
<__main__.ClassifierBot object at 0x000001C0200A9F88>


In [46]:
#classification report
from sklearn.metrics import accuracy_score,recall_score,precision_score,recall_score,f1_score
import warnings
warnings.filterwarnings('ignore')

def report(prediction, real_labels, data):
    print("classification report as follows: ")
    print(f'   Accuracy: {accuracy_score(prediction, real_labels)}')
    print(f'   Precision: {precision_score(prediction, real_labels,average="macro")}')
    print(f'   recall: {recall_score(prediction, real_labels,average="macro")}')
    print(f'   F1 measure: {f1_score(prediction, real_labels,average="macro")}')
    print('Show 5 example of correctly classified datapoint: ')
    if data[prediction==real_labels].shape[0] > 5:
        display(data[prediction==real_labels].iloc[:5,:])
    else:
        display(data[prediction==real_labels])
    print('Show 5 example of wrongly classified datapoint: ')
    if data[prediction!=real_labels].shape[0] > 5:
        display(data[prediction!=real_labels].iloc[:5,:])
    else:
        display(data[prediction!=real_labels])

In [51]:
class ClassifierBot():
    
    def __init__(self, trainset, devset, bot):
        self.train = trainset
        self.dev = devset
        self.bot = bot
        
    def classify(self):
        self.bot.fit(self.train[0], self.train[1])
        #self.bot.predict(['islam invaded and conquered of christiandom before any christian crusades in response the writer is a liar'])
        prediction = self.bot.predict(self.dev[0])
        return report(prediction, self.dev[1]. self.dev)
        


In [52]:
sgd_clf = SGDClassifier()
bot1 = ClassifierBot(all_param_train, all_param_dev, sgd_clf)
bot_no_params = ClassifierBot(no_param_train, no_param_dev, sgd_clf)
print(bot1.classify())
print(bot_no_params.classify())
print(sgd_clf)
print(bot1)

AttributeError: 'numpy.ndarray' object has no attribute 'self'

In [125]:
#Baseline: logistic regression!
#Same as Question 4
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.linear_model import LogisticRegression

vectorizer=CountVectorizer(stop_words='english',max_features=100)
transformer=TfidfTransformer()

vectorizer.fit(train_set['Tokens'].to_list())
train_x = vectorizer.transform(train_set['Tokens'].to_list())

transformer.fit(train_x)
train_x = transformer.transform(train_x)

train_y = train_set['label']
lr = LogisticRegression(multi_class="multinomial", solver="newton-cg")
lr.fit(train_x, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
val_x = vectorizer.transform(validation_set['Tokens'])
val_x = transformer.transform(val_x)

predict = lr.predict(val_x)
report(predict, validation_set['label'], validation_set[['Tokens','Label']])

classification report as follows: 
   Accuracy: 0.7987216443163304
   Precision: 0.6614977192076772
   recall: 0.7667356382049939
   F1 measure: 0.6835940478433892
Show 5 example of correctly classified datapoint: 


Unnamed: 0,Tokens,Label
23121,rt strategic vote kat food truly awful #mkr,0
35928,pancakes proof deity love us #mkr,0
45316,sick see fuck asshole bitch make chain latters...,1
42230,i'm try get insight trans issue definitely gro...,0
40128,make,0


Show 5 example of wrongly classified datapoint: 


Unnamed: 0,Tokens,Label
48658,lol ralph guy still moi era,1
28396,single men cannot adopt,1
25704,shame katie nikki kat go #mkr,1
44924,muslim brotherhood usa hundred years liken say...,0
7346,rt #mosul christian pastor #paul_jacob sentenc...,1


In [127]:
##############END OF TF-IDF##################

In [128]:
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_x, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [129]:
val_x = vectorizer.transform(validation_set['Tokens'])
val_x = transformer.transform(val_x)

predict = clf.predict(val_x)
report(predict, validation_set['label'], validation_set[['Tokens','Label']])

classification report as follows: 
   Accuracy: 0.7894472991602958
   Precision: 0.6323198258641166
   recall: 0.7665798483965244
   F1 measure: 0.6501280393689739
Show 5 example of correctly classified datapoint: 


Unnamed: 0,Tokens,Label
23121,rt strategic vote kat food truly awful #mkr,0
35928,pancakes proof deity love us #mkr,0
42230,i'm try get insight trans issue definitely gro...,0
40128,make,0
3114,wut,0


Show 5 example of wrongly classified datapoint: 


Unnamed: 0,Tokens,Label
45316,sick see fuck asshole bitch make chain latters...,1
48658,lol ralph guy still moi era,1
28396,single men cannot adopt,1
25704,shame katie nikki kat go #mkr,1
44924,muslim brotherhood usa hundred years liken say...,0


In [130]:
#################END OF MULTINOMINAL NAIVE BAYES######

In [131]:
#Neural Network
from sklearn.neural_network import MLPClassifier
ntwk = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(10,30,10,5), random_state=1,batch_size=128,max_iter=400)
ntwk.fit(train_x, train_y)

MLPClassifier(activation='relu', alpha=1e-05, batch_size=128, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 30, 10, 5), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=400,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [132]:
val_x = vectorizer.transform(validation_set['Tokens'])
val_x = transformer.transform(val_x)

predict = ntwk.predict(val_x)
report(predict, validation_set['label'], validation_set[['Tokens','Label']])

classification report as follows: 
   Accuracy: 0.8168943476626144
   Precision: 0.7084592624109877
   recall: 0.7811589138333501
   F1 measure: 0.7309291098045785
Show 5 example of correctly classified datapoint: 


Unnamed: 0,Tokens,Label
23121,rt strategic vote kat food truly awful #mkr,0
35928,pancakes proof deity love us #mkr,0
45316,sick see fuck asshole bitch make chain latters...,1
42230,i'm try get insight trans issue definitely gro...,0
40128,make,0


Show 5 example of wrongly classified datapoint: 


Unnamed: 0,Tokens,Label
48658,lol ralph guy still moi era,1
30437,kill muslims oppose kill ezidis christians non...,0
28396,single men cannot adopt,1
44924,muslim brotherhood usa hundred years liken say...,0
7346,rt #mosul christian pastor #paul_jacob sentenc...,1
