In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import nltk, spacy
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
from collections import Counter
from gensim.parsing.preprocessing import STOPWORDS as SW

pd.options.display.float_format = '{:,.0f}'.format

In [4]:
in_file = Path.cwd().parents[0] / 'Processed_datasets' / 'tweets_long_no_duplicates.csv'
my_stopwords = stopwords.words('english')
stopwords = set(STOPWORDS).union(my_stopwords) #preparing stopwards list
custom_stopwords = ['hi', '\n', '\n\n', '&amp;', ' ', '.', '-',
                    'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
nlp = spacy.load('en_core_web_sm', exclude=['ner', 'tok2vec', 'tagger', 'paerser', 'senter', 'lemmatizer', 'attribute_ruler']) # using only for stopwords
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)

In [5]:
df_long = pd.read_csv(in_file, engine='python', usecols=['Tokens', 'Label']) #not using unnecessary columns

In [6]:
print(df_long.head(15))
df_long.count()
print(df_long['Label'].value_counts())

                                               Tokens  Label
0   muslim mob violence against hindus in banglade...      1
1   islamophobia is like the idea of naziphobia is...      1
2   finally all caught up and that sudden death co...      0
3   please please start using is your discernment ...      0
4   as soon as isis chased all the minorities out ...      0
5   islam invaded and conquered of christiandom be...      1
6   do you approve of your pedophile prophet rapin...      1
7   problem with vile muslims is that they try to ...      1
8            tend to talk about it much personal info      0
9   cool next time when a woman talks to him about...      0
10  our judges are about to turn the heat up in th...      0
11  lol this you walk by putting one foot in front...      0
12  said wanted sorbet now and they should tell us...      0
13  this fucking potato is blowing my mind duck fa...      0
14  omg this churner feels like razor blades on my...      0
0    15214
1     7500
Na

In [7]:
#setting up baseline pipeline
from nltk.tokenize import TweetTokenizer #I chose to tokenize with this, as it gets rid of @ handlers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler


In [80]:
from sklearn.pipeline import make_pipeline

In [8]:
def make_dataset(d_frame, stopwords=True, tfidf=True, train=True): #function for creating dataset, paramaters for flexibility
    """Helper function for construction the classes later on - accepts a dataframe 
    as an input and 3 booleans and ouputs a tuple of either the test or dev set
    The code block essentially repeates itself with minor argument tweaks
    When stopwords=True then stopwords are removed, when tfidf=True then tfidf     is implemented and when train-True then train set is returned"""
    df = d_frame
    if stopwords and tfidf and train:                                                       #all included
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train # x_train is sparse matrix, y_train is array
    elif not stopwords and tfidf and train: #no stopwords
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train    
    elif stopwords and not tfidf and train: #no tfidf
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,
                       strip_accents='unicode', tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif not stopwords and not tfidf and train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,
                       strip_accents='unicode', tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif stopwords and tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25,
                       strip_accents='unicode', tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_dev, y_dev)
        return x_dev, y_dev
    elif not stopwords and tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_dev, y_dev)
        return x_dev, y_dev
    elif stopwords and not tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df['Tokens'].values), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_dev, y_dev)
        return x_dev, y_dev
    else:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25,strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_dev = RandomUnderSampler(random_state=50) 
        xs, ys = vect.fit_transform(df_long['Tokens'].values), df_long['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85,
                                                          random_state=42, stratify=ys)
        x_dev, y_dev = sampler_dev.fit_resample(x_dev, y_dev)
        return x_dev, y_dev    


        
def make_scaled_dataset(df, stopwords=True, tfidf=True, train=True):
    '''Same as above function except with added functionality of StandardScalar'''
    if stopwords and tfidf and train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif stopwords and tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_dev, y_dev = sampler_train.fit_resample(x_dev, y_dev) 
        return x_dev, y_dev
    elif not stopwords and tfidf and train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif not stopwords and tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_dev, y_dev = sampler_train.fit_resample(x_dev, y_dev) 
        return x_dev, y_dev
    elif stopwords and not tfidf and train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif stopwords and not tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize, stop_words=ALL_STOP_WORDS)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_dev, y_dev = sampler_train.fit_resample(x_dev, y_dev) 
        return x_dev, y_dev
    elif not stopwords and not tfidf and train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_train, y_train = sampler_train.fit_resample(x_train, y_train) 
        return x_train, y_train
    elif not stopwords and not tfidf and not train:
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = CountVectorizer(max_df=.9, min_df=25, strip_accents='unicode',
                               tokenizer=tknzr.tokenize)
        sampler_train = RandomUnderSampler(random_state=42) #undersampling  object to rebalance dataset
        scalar = StandardScaler(with_mean=False)
        xs, ys = scalar.fit_transform(vect.fit_transform(df['Tokens'].values)), df['Label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(xs, ys, train_size=.85, #creating train and dev sets with stratified sampling
                                                          random_state=42, stratify=ys) 
        x_dev, y_dev = sampler_train.fit_resample(x_dev, y_dev) 
        return x_dev, y_dev

def make_dataset_iterators(df):
    """Creates a tuple of generators. First index holds all train sets while second holds all dev sets.
        Makes running the multiple datasets through a model more convenient """
    all_param_train = make_dataset(df)  # non scaled variation of the dataset
    # dev set counterpart of above
    all_param_dev = make_dataset(df, train=False)
    stopwords_no_tfidf_train = make_dataset(df, tfidf=False)
    stopwords_no_tfidf_dev = make_dataset(
        df, tfidf=False, train=False)  # devset counterpart of above
    no_stop_train = make_dataset(df, stopwords=False)
    no_stop_dev = make_dataset(df, stopwords=False, train=False)
    no_stopwords_no_tfidf_train = make_dataset(
        df, stopwords=False, tfidf=False)
    no_stopwords_no_tfidf_dev = make_dataset(
        df, stopwords=False, tfidf=False, train=False)

    scaled_all_param_train = make_scaled_dataset(df) 
    scaled_all_param_dev = make_scaled_dataset(df, train=False)
    scaled_stop_no_tfidf_train = make_scaled_dataset(df, tfidf=False)
    scaled_stop_no_tfidf_dev = make_scaled_dataset(
        df, tfidf=False, train=False)
    scaled_no_stop_tfidf_train = make_scaled_dataset(df, stopwords=False)
    scaled_no_stop_tfidf_dev = make_dataset(df, stopwords=False, train=False)
    scaled_no_stop_no_tfidf_train = make_dataset(
        df, stopwords=False, tfidf=False)
    scaled_no_stop_no_tfidf_dev = make_dataset(
        df, stopwords=False, tfidf=False, train=False)

    list_train_sets = [all_param_train, stopwords_no_tfidf_train, no_stop_train, no_stopwords_no_tfidf_train, scaled_all_param_train,       #make lists of previously created train                                                                                                                                                  datasets
                       scaled_stop_no_tfidf_train, scaled_no_stop_no_tfidf_train, scaled_no_stop_tfidf_train, scaled_no_stop_no_tfidf_train] # make lists of previously created dev sets
    list_dev_sets = [all_param_dev, stopwords_no_tfidf_dev, no_stop_dev, no_stopwords_no_tfidf_dev, scaled_all_param_dev,
                     scaled_stop_no_tfidf_dev, scaled_no_stop_no_tfidf_dev, scaled_no_stop_tfidf_dev, scaled_no_stop_no_tfidf_dev]

    train_gen, dev_gen = iter(list_train_sets), iter(list_dev_sets) #convert them both to iterators to save space 
    return train_gen, dev_gen





In [89]:
tuple_of_generators = make_dataset_iterators(df_long)
print(type(tuple_of_generators))
print(tuple_of_generators[0])
print(tuple_of_generators[1])

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


<class 'tuple'>
<list_iterator object at 0x000002026AA0D2C8>
<list_iterator object at 0x000002026A68DD88>


In [79]:
print(next(tuple_of_generators[1]))

(<2250x1454 sparse matrix of type '<class 'numpy.float64'>'
	with 11944 stored elements in Compressed Sparse Row format>, array([0, 0, 0, ..., 1, 1, 1], dtype=int64))


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

In [None]:
class ClassifierBot():
    
    def __init__(self, trainset, devset, bot):
        self.train = trainset
        self.dev = devset
        self.bot = bot
        
    def classify(self):
        fitted_bot = self.bot.fit(self.train[0], self.train[1])
        prediction = fitted_bot.predict(self.dev[0])
        report = classification_report(self.dev[1], prediction, output_dict=True)
        print(report) # print statement to easily see the performance of each classifier
        return fitted_bot, prediction


In [74]:
def bots(iter_train, iter_dev, clf):
    tup_lst, dev_lst = [], [] # lst for storing
    for i in range(0, 9): #iterating through all 9 datasets
        current_dev_set = next(iter_dev) #storing the devset sequence for reuse with LIME
        print(type(current_dev_set))
        bot = ClassifierBot(next(iter_train), current_dev_set, clf)
        bot_pred_tup = bot.classify()  
        #print(type(bot_pred_tup))
        tup_lst.append(bot_pred_tup)
        dev_lst.append(current_dev_set)
    return iter(tup_lst), iter(dev_lst)


In [88]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline


def create_pipeline(clf, dataset):
    """Function for Pipeline with whatever classifier we want"""
    temp_lst = []
    for i in range(0, 2):
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(
            max_df=.9, min_df=25, strip_accents='unicode', tokenizer=tknzr.tokenize)
        scaler = StandardScaler(with_mean=False)
        sampler = RandomUnderSampler(random_state=42)
        pipeline = Pipeline([('vect', vect), ('scale', scaler),
                             ('sampler', sampler), ('clf', clf)])
        param_grid = [{
            'vect__stop_words': [None, ALL_STOP_WORDS]
        }]
        grid_srch = GridSearchCV(
            estimator=pipeline, param_grid=param_grid, refit=True, n_jobs=-1)
        current_dev_set, current_train_set = next(dataset[1]), next(dataset[0])
        grid_srch.fit(current_train_set[0], current_train_set[1])
        prediction = grid_srch.predict(current_dev_set[0])
        report = classification_report(
            current_dev_set[1], prediction, output_dict=True)
        best_est = grid_srch.best_estimator_
        tup = (best_est, prediction)
        temp_lst.append(tup)
    return temp_lst


In [90]:
tup_sgd = create_pipeline(SVC(random_state=42), tuple_of_generators)



AttributeError: lower not found

In [77]:
tup_bots_sgd = bots(tuple_of_generators[0], tuple_of_generators[1], SGDClassifier(random_state=42))

<class 'tuple'>
{'0': {'precision': 0.7259430331023865, 'recall': 0.8382222222222222, 'f1-score': 0.778052805280528, 'support': 1125}, '1': {'precision': 0.8086225026288117, 'recall': 0.6835555555555556, 'f1-score': 0.7408477842003853, 'support': 1125}, 'accuracy': 0.7608888888888888, 'macro avg': {'precision': 0.7672827678655991, 'recall': 0.760888888888889, 'f1-score': 0.7594502947404567, 'support': 2250}, 'weighted avg': {'precision': 0.7672827678655991, 'recall': 0.7608888888888888, 'f1-score': 0.7594502947404567, 'support': 2250}}
<class 'tuple'>
{'0': {'precision': 0.7126168224299065, 'recall': 0.8133333333333334, 'f1-score': 0.759651307596513, 'support': 1125}, '1': {'precision': 0.782608695652174, 'recall': 0.672, 'f1-score': 0.7230989956958394, 'support': 1125}, 'accuracy': 0.7426666666666667, 'macro avg': {'precision': 0.7476127590410402, 'recall': 0.7426666666666667, 'f1-score': 0.7413751516461762, 'support': 2250}, 'weighted avg': {'precision': 0.7476127590410403, 'recall':

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
#Quickly Verifying whether we are accessing the correct objects

print(type(tup_bots_sgd))
print(type(tup_bots_sgd[1]))
for i in range(0, 2):
    print(type(next(tup_bots_sgd[1])))
#for i in range(0, 2):
#    tup = next(tup_bots_sgd)
#    print(type(tup[0]), type(tup[1]))


<class 'tuple'>
<class 'list_iterator'>
<class 'tuple'>
<class 'tuple'>


In [12]:
#Neural Network Classifier 
from sklearn.neural_network import MLPClassifier


In [None]:
tup_bots_MLP = bots(tuple_of_generators[0], tuple_of_generators[1], MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10,30,10,5), random_state=42, batch_size=128, max_iter=1000))

In [22]:
#Support Vector Classifier without Gridsearch


#tup_bots_svc = bots(tuple_of_generators[0], tuple_of_generators[1], SVC())


{'0': {'precision': 0.7183863460046548, 'recall': 0.8231111111111111, 'f1-score': 0.767191383595692, 'support': 1125}, '1': {'precision': 0.7929240374609782, 'recall': 0.6773333333333333, 'f1-score': 0.7305848513902207, 'support': 1125}, 'accuracy': 0.7502222222222222, 'macro avg': {'precision': 0.7556551917328165, 'recall': 0.7502222222222222, 'f1-score': 0.7488881174929563, 'support': 2250}, 'weighted avg': {'precision': 0.7556551917328165, 'recall': 0.7502222222222222, 'f1-score': 0.7488881174929563, 'support': 2250}}
<class 'dict'>
{'0': {'precision': 0.7123893805309734, 'recall': 0.8586666666666667, 'f1-score': 0.7787182587666264, 'support': 1125}, '1': {'precision': 0.8221476510067114, 'recall': 0.6533333333333333, 'f1-score': 0.7280832095096582, 'support': 1125}, 'accuracy': 0.756, 'macro avg': {'precision': 0.7672685157688424, 'recall': 0.756, 'f1-score': 0.7534007341381423, 'support': 2250}, 'weighted avg': {'precision': 0.7672685157688424, 'recall': 0.756, 'f1-score': 0.75340

  _warn_prf(average, modifier, msg_start, len(result))


{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1125}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1125}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 2250}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 2250}}
<class 'dict'>
{'0': {'precision': 0.7105064247921391, 'recall': 0.8355555555555556, 'f1-score': 0.7679738562091503, 'support': 1125}, '1': {'precision': 0.8004314994606256, 'recall': 0.6595555555555556, 'f1-score': 0.7231968810916178, 'support': 1125}, 'accuracy': 0.7475555555555555, 'macro avg': {'precision': 0.7554689621263824, 'recall': 0.7475555555555555, 'f1-score': 0.745585368650384, 'support': 2250}, 'weighted avg': {'precision': 0.7554689621263824, 'recall': 0.7475555555555555, 'f1-score': 0.745585368650384, 'support': 2250}}
<class 'dict'>


In [17]:
#RandomForestClassifier without Gridsearch

from sklearn.ensemble import RandomForestClassifier


tup_bots_forest = bots(tuple_of_generators[0], tuple_of_generators[1], RandomForestClassifier(n_jobs=-1, random_state=42))

{'0': {'precision': 0.7456896551724138, 'recall': 0.7688888888888888, 'f1-score': 0.7571115973741794, 'support': 1125}, '1': {'precision': 0.7614678899082569, 'recall': 0.7377777777777778, 'f1-score': 0.7494356659142213, 'support': 1125}, 'accuracy': 0.7533333333333333, 'macro avg': {'precision': 0.7535787725403353, 'recall': 0.7533333333333333, 'f1-score': 0.7532736316442004, 'support': 2250}, 'weighted avg': {'precision': 0.7535787725403355, 'recall': 0.7533333333333333, 'f1-score': 0.7532736316442004, 'support': 2250}}
<class 'dict'>
{'0': {'precision': 0.73502722323049, 'recall': 0.72, 'f1-score': 0.7274360125729681, 'support': 1125}, '1': {'precision': 0.725609756097561, 'recall': 0.7404444444444445, 'f1-score': 0.7329520457545093, 'support': 1125}, 'accuracy': 0.7302222222222222, 'macro avg': {'precision': 0.7303184896640255, 'recall': 0.7302222222222222, 'f1-score': 0.7301940291637388, 'support': 2250}, 'weighted avg': {'precision': 0.7303184896640255, 'recall': 0.73022222222222

############################## LIME ###################
Need explainer object, instance of test data = tuple_of_generators[1][whichever dataset performed best], instance of the classifier rf = pull from list tup_bots_forst[0][whichever dataset peformed best]

In [41]:
from lime.lime_text import LimeTextExplainer
class_names = ['Hateful', 'Non-Hateful']
exp = LimeTextExplainer(class_names=class_names)

def best_val_set(tup, r):
    pass

def best_clf(tup, r):
    """Input tuple of (classifiers_generator, predictions_generator) and index 
    of classifier to extract from tuple -> outputs desired classifier"""
    lst_of_classifiers = [] # empty list for storing
    for i in range(0, r): # iteration for looping through sequence
        classifier = next(tup[0]) # access each classifier object
        lst_of_classifiers.append(classifier) #append it to empty list
    return lst_of_classifiers.pop() #pop out the final entry to the list - desired classifier

In [32]:
best_rf = best_clf(tup_bots_forest, 1) #best performing classifier
print(best_rf)

<__main__.ClassifierBot object at 0x0000020267B36A88>


In [30]:
print(tup_bots_forest[0])

<list_iterator object at 0x0000020267C8BB88>


In [27]:
def hi(r):
    lst = [1, 2, 3, 4, 5, 6, 7]
    print(lst[0])
    new_lst = []
    for i in range(0, r):
        clf = lst.pop()
        new_lst.append(clf)
    return new_lst

In [28]:
print(hi(3))
print(hi(0))
print(hi(1))

1
[7, 6, 5]
1
[]
1
[7]


In [46]:
#classification report
from sklearn.metrics import accuracy_score,recall_score,precision_score,recall_score,f1_score
import warnings
warnings.filterwarnings('ignore')

def report(prediction, real_labels, data):
    print("classification report as follows: ")
    print(f'   Accuracy: {accuracy_score(prediction, real_labels)}')
    print(f'   Precision: {precision_score(prediction, real_labels,average="macro")}')
    print(f'   recall: {recall_score(prediction, real_labels,average="macro")}')
    print(f'   F1 measure: {f1_score(prediction, real_labels,average="macro")}')
    print('Show 5 example of correctly classified datapoint: ')
    if data[prediction==real_labels].shape[0] > 5:
        display(data[prediction==real_labels].iloc[:5,:])
    else:
        display(data[prediction==real_labels])
    print('Show 5 example of wrongly classified datapoint: ')
    if data[prediction!=real_labels].shape[0] > 5:
        display(data[prediction!=real_labels].iloc[:5,:])
    else:
        display(data[prediction!=real_labels])

In [132]:
val_x = vectorizer.transform(validation_set['Tokens'])
val_x = transformer.transform(val_x)

predict = ntwk.predict(val_x)
report(predict, validation_set['label'], validation_set[['Tokens','Label']])

classification report as follows: 
   Accuracy: 0.8168943476626144
   Precision: 0.7084592624109877
   recall: 0.7811589138333501
   F1 measure: 0.7309291098045785
Show 5 example of correctly classified datapoint: 


Unnamed: 0,Tokens,Label
23121,rt strategic vote kat food truly awful #mkr,0
35928,pancakes proof deity love us #mkr,0
45316,sick see fuck asshole bitch make chain latters...,1
42230,i'm try get insight trans issue definitely gro...,0
40128,make,0


Show 5 example of wrongly classified datapoint: 


Unnamed: 0,Tokens,Label
48658,lol ralph guy still moi era,1
30437,kill muslims oppose kill ezidis christians non...,0
28396,single men cannot adopt,1
44924,muslim brotherhood usa hundred years liken say...,0
7346,rt #mosul christian pastor #paul_jacob sentenc...,1
