In [130]:
import pandas as pd  #Imports for converting dataframe to train_dev splits
import numpy as np
from pathlib import Path
import nltk, spacy
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
from collections import Counter
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.model_selection import train_test_split 

pd.options.display.float_format = '{:,.0f}'.format
in_file = Path.cwd().parents[0] / 'Processed_datasets' / 'final_db.csv'
in_file2 = Path.cwd().parents[0] / 'Processed_datasets' / 'mixed.csv'
my_stopwords = stopwords.words('english')
stopwords = set(STOPWORDS).union(my_stopwords) #preparing stopwards list
custom_stopwords = ['hi', '\n', '\n\n', '&amp;', ' ', '.', '-',
                    'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
nlp = spacy.load('en_core_web_sm', exclude=['ner', 'tok2vec', 'tagger', 'paerser', 'senter', 'lemmatizer', 'attribute_ruler']) # using only for stopwords
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


In [136]:
df_long = pd.read_csv(in_file, engine='python', usecols=['Text', 'oh_label'], encoding='utf-8', dtype={'Tokens' : 'string', 'Label' : 'float64'}) #not using unnecessary columns

In [137]:
print(df_long.head(15))
df_long.count()
#print(df_long['Text'].value_counts())

                                                 Text  oh_label
0   @AAlwuhaib1977 Muslim mob violence against Hin...         1
1              @Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG         0
2   @jncatron @isra_jourisra @AMPalestine Islamoph...         1
3   Finally I'm all caught up, and that sudden dea...         0
4              @carolinesinders @herecomesfran *hugs*         0
5   Please, PLEASE start using "is your discernmen...         0
6   @aymannathem As soon as ISIS chased all the mi...         0
7   @Ali_Gharib @MaxBlumenthal Glad you like it. h...         0
8   @HuffPostRelig Islam invaded and conquered 2/3...         1
9   @semzyxx Do you approve of your pedophile prop...         1
10  @watan71969 @geeky_zekey Problem with vile Mus...         1
11  @Skawtnyc @athenahollow @twoscooters i don't t...         0
12  @dylanw that's cool. next time when a woman ta...         0
13  RT @hadi_elis: Erdogan's Egyptian Nightmare \r...         0
14  RT @mykitchenrules: Our judges are a

Text        25596
oh_label    25596
dtype: int64

In [138]:
labels = df_long.iloc[:, 1].values
for l in labels[:10]:
    print(l)
    print(type(l))
toks = df_long.iloc[:, 0].values
for l in toks[:10]:
    print(l)
    print(type(l))

1.0
<class 'numpy.float64'>
0.0
<class 'numpy.float64'>
1.0
<class 'numpy.float64'>
0.0
<class 'numpy.float64'>
0.0
<class 'numpy.float64'>
0.0
<class 'numpy.float64'>
0.0
<class 'numpy.float64'>
0.0
<class 'numpy.float64'>
1.0
<class 'numpy.float64'>
1.0
<class 'numpy.float64'>
@AAlwuhaib1977 Muslim mob violence against Hindus in Bangladesh continues in 2014. #Islam http://t.co/C1JBWJwuRc
<class 'str'>
@Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG
<class 'str'>
@jncatron @isra_jourisra @AMPalestine Islamophobia is like the idea of Naziphobia. Islam is a religion of hate and it must be outlawed.
<class 'str'>
Finally I'm all caught up, and that sudden death cook off looks like it's gonna be intense #MKR
<class 'str'>
@carolinesinders @herecomesfran *hugs*
<class 'str'>
Please, PLEASE start using "is your discernment blunted by steroids" to mean "are you on DRUGS?" from now on. DEAD
<class 'str'>
@aymannathem As soon as ISIS chased all the minorities out of Mosul, the Sunni Arabs were happy to

In [139]:
#setting up baseline pipeline
#from nltk.tokenize import TweetTokenizer #I chose to tokenize with this, as it gets rid of @ handlers
from sklearn.feature_extraction.text import TfidfVectorizer #easy idf and stopword removal
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler #moderately imbalanced dataset so randomundersampling - approx 2:1 ratio
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import make_pipeline

In [140]:


PARAM_GRID = [{
        'tfidfvectorizer__stop_words': [None, ALL_STOP_WORDS],
        'svc__kernel' : ['sigmoid', 'rbf'], # for SVC
        #'clf__loss' : ['hinge', 'squared_hinge'],
        #'clf__activation' : ['tanh', 'relu', 'logistic'], #for MLP
        #'clf__hidden_layer_sizes' : [(10, 20, 30), (15, 25, 10), (40, 30, 25)],
        'tfidfvectorizer__use_idf' : [True, False],
    }]


def pipe(df, clf, params, standardise=True):
    if standardise == True:
        xs, ys = df['Text'].values, df['oh_label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(
            xs, ys, train_size=.85, random_state=42, stratify=ys)
        #tknzr = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True)
        vect = TfidfVectorizer(
            max_df=.9, min_df=25, strip_accents='unicode', tokenizer=lambda x: x)
        scaler = StandardScaler(with_mean=False)
        sampler = RandomUnderSampler(random_state=42)
        pipeline = make_pipeline(vect, scaler, sampler, clf)
        grid_srch = GridSearchCV(
            estimator=pipeline, param_grid=params, refit=True, n_jobs=-1)
        grid_srch.fit(x_train, y_train)  # fit the grid_search object
        prediction = grid_srch.predict(x_dev)  # Obtain predictions and save them
        # obtain classification report of preds
        report = classification_report(y_dev, prediction, output_dict=True)
        best_est = grid_srch.best_estimator_
        print(report)
        return best_est, prediction, x_dev, y_dev
    xs, ys = df['Text'].values, df['oh_label'].values
    x_train, x_dev, y_train, y_dev = train_test_split(
         xs, ys, train_size=.85, random_state=42, stratify=ys)
    #tknzr = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True)
    vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode', tokenizer=lambda x: x)
    sampler = RandomUnderSampler(random_state=42)
    pipeline = make_pipeline(vect, sampler, clf)
    grid_srch = GridSearchCV(
        estimator=pipeline, param_grid=params, refit=True, n_jobs=-1)
    fitted = grid_srch.fit(x_train, y_train)  # fit the grid_search object
    prediction = grid_srch.predict(x_dev)  # Obtain predictions and save them
    # obtain classification report of preds
    report = classification_report(y_dev, prediction, output_dict=True)
    best_est = fitted.best_estimator_
    print(report)
    return best_est, prediction, x_dev, y_dev, x_train, y_train


In [None]:
from sklearn.neural_network import MLPClassifier
tup_MLP = make_pipeline(df_long, MLPClassifier(random_state=42), PARAM_GRID)

In [62]:
tup_svr = make_pipeline(df_long, SVR(), PARAM_GRID)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [17]:
tup_linear_svc = make_pipeline(df_long, LinearSVC(random_state=42), PARAM_GRID)

  'stop_words.' % sorted(inconsistent))


{'0': {'precision': 0.8294172932330827, 'recall': 0.7731055628558914, 'f1-score': 0.8002720471548402, 'support': 2283}, '1': {'precision': 0.5953125, 'recall': 0.6773333333333333, 'f1-score': 0.6336798336798337, 'support': 1125}, 'accuracy': 0.7414906103286385, 'macro avg': {'precision': 0.7123648966165413, 'recall': 0.7252194480946124, 'f1-score': 0.716975940417337, 'support': 3408}, 'weighted avg': {'precision': 0.7521379820865985, 'recall': 0.7414906103286385, 'f1-score': 0.7452790189390589, 'support': 3408}}




In [16]:
tup_linear_svc_standardised = make_pipeline(df_long, LinearSVC(random_state=42), PARAM_GRID, standardise=False)

  'stop_words.' % sorted(inconsistent))


{'0': {'precision': 0.8294172932330827, 'recall': 0.7731055628558914, 'f1-score': 0.8002720471548402, 'support': 2283}, '1': {'precision': 0.5953125, 'recall': 0.6773333333333333, 'f1-score': 0.6336798336798337, 'support': 1125}, 'accuracy': 0.7414906103286385, 'macro avg': {'precision': 0.7123648966165413, 'recall': 0.7252194480946124, 'f1-score': 0.716975940417337, 'support': 3408}, 'weighted avg': {'precision': 0.7521379820865985, 'recall': 0.7414906103286385, 'f1-score': 0.7452790189390589, 'support': 3408}}




In [None]:
tup_svc = pipe(df_long, SVC(random_state=42, probability=True), PARAM_GRID, standardise=True)


In [25]:
# scores from above macro avg': {'precision': 0.6366069808850666, 'recall': 0.6505482579709425, 'f1-score': 0.6387982413691843, 'support': 3797
best_est = tup_svc[0]
best_clf = best_est[-1]
devset_x, devset_y = tup_svc[2], tup_svc[3] #accessing examples from the dataset for LIME
print(best_est, best_clf)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.9, min_df=25, strip_accents='unicode',
                                 tokenizer=<function pipe.<locals>.<lambda> at 0x00000256D2075EE8>)),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('randomundersampler', RandomUnderSampler(random_state=42)),
                ('svc', SVC(probability=True, random_state=42))]) SVC(probability=True, random_state=42)


############################## LIME ###################
Need explainer object, instance of test data = tuple_of_generators[1][whichever dataset performed best], instance of the classifier rf = pull from list tup_bots_forst[0][whichever dataset peformed best]

In [105]:
print(len(devset_y))
print(len(devset_x))
#print(best_clf.predict_proba(['Asshole', 'hope', 'you', "/'" , 're', 'the', 'worst', 'among', 'us', '!', 'fuck', 'your', 'religion']))
print(best_est)
print(best_est.classes_)


3797
3797
Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.9, min_df=25, strip_accents='unicode',
                                 tokenizer=<function pipe.<locals>.<lambda> at 0x00000256D2075EE8>)),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('randomundersampler', RandomUnderSampler(random_state=42)),
                ('svc', SVC(probability=True, random_state=42))])
[0. 1.]


In [37]:
from lime.lime_text import LimeTextExplainer

In [128]:
#print(tup_svc[2][2334])
#print(tup_svc[3][2334])


print(devset_x[10])

def use_lime(est, dev_x, dev_y):
    idx = 2830
#    class_names = [0, 1]
    tweet = dev_x[idx]
    label = dev_y[idx]
    print(tweet)
    print(type(tweet), type(label))
    explainer = LimeTextExplainer()
    exp = explainer.explain_instance(tweet, est.predict_proba, num_features=6)
    print('predicted probability: %d ' % est.predict_proba([tweet])[0, 1])#% est.predict_proba([tweet])[1, 1])
    print('true class: %d ' % label)
    print('tweet: %s' % tweet)
    return exp.as_list()

['sure', "i'm", 'speak', 'year', 'first', 'conference', 'ever', 'present', '👍']


In [129]:
use_lime(best_est, devset_x, devset_y)

['rt', 'eat', 'army', 'food', 'tin', 'look', 'better', 'less', 'like', 'dick', '#mkr']
<class 'str'> <class 'numpy.float64'>
predicted probability: 0 
true class: 0 
tweet: ['rt', 'eat', 'army', 'food', 'tin', 'look', 'better', 'less', 'like', 'dick', '#mkr']


[('less', 0.07492011761675238),
 ('like', 0.06401058769513568),
 ('tin', 0.05888178161450022),
 ('dick', 0.033709179581919047),
 ('army', -0.03204422228664202),
 ('mkr', -0.029460393139246824)]

In [82]:
def recompute(clf, tup_5, explainer_instance):
    """Recompute by retraining a classifier on the same trainingset -> takes an untrained clf and the entire tuple - returns explainer              instance"""
    vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode', tokenizer=lambda x: x)
    sampler = RandomUnderSampler(random_state=42)
    scaler = StandardScaler(with_mean=False)
    train_x, train_y = train_test_split(vect.fit_transform(tup_5[3], tup_5[4], train_size=.85, random_state=42, stratify=tup_5[4]))
    #vect, sampler, scaler, 
    pass

In [42]:
print(ALL_STOP_WORDS)

{'thus', 'latter', "that'll", 'i', 'hereafter', '’s', '‘ve', "she's", 'again', 'on', 'already', "mustn't", 'enough', 'why', "we'd", 'full', 'ain', 'had', 'how', 'been', 'just', 'less', 'whole', 'almost', 'keep', 'seems', '\n', 'then', 'beside', "can't", 'everywhere', 'six', 'regarding', 'co', 'wherein', 'else', 'found', 'now', 'my', 've', 'un', "she'll", 'did', 'beforehand', 'ten', 'about', 'wouldn', "'d", 'besides', 'that', 'yet', "we'll", "shouldn't", 'become', 'became', 'first', 'before', 'neither', 'am', 'put', 'seemed', 'i’m', 'isn', 'former', 'if', 'front', 'him', "he's", 'each', 'into', 'both', 'sometime', 'thence', 'hasnt', 'fire', 'fifteen', 'com', 'used', 'anything', 'con', 'becomes', 'mostly', 'we', 'us', 'an', 'eg', "how's", 'weren', 'though', "you'll", '’m', 'it', "that's", 'o', 'needn', 'only', 'whatever', 'never', 'without', 'perhaps', "i'll", 'most', 'whenever', 'through', 'last', 'can', 'was', 'meanwhile', 'towards', 'unless', 'itself', 'one', 'their', 'they', 'like', 

In [12]:
#Neural Network Classifier 



In [None]:
tup_bots_MLP = bots(tuple_of_generators[0], tuple_of_generators[1], MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10,30,10,5), random_state=42, batch_size=128, max_iter=1000))

In [22]:
#Support Vector Classifier without Gridsearch


#tup_bots_svc = bots(tuple_of_generators[0], tuple_of_generators[1], SVC())


{'0': {'precision': 0.7183863460046548, 'recall': 0.8231111111111111, 'f1-score': 0.767191383595692, 'support': 1125}, '1': {'precision': 0.7929240374609782, 'recall': 0.6773333333333333, 'f1-score': 0.7305848513902207, 'support': 1125}, 'accuracy': 0.7502222222222222, 'macro avg': {'precision': 0.7556551917328165, 'recall': 0.7502222222222222, 'f1-score': 0.7488881174929563, 'support': 2250}, 'weighted avg': {'precision': 0.7556551917328165, 'recall': 0.7502222222222222, 'f1-score': 0.7488881174929563, 'support': 2250}}
<class 'dict'>
{'0': {'precision': 0.7123893805309734, 'recall': 0.8586666666666667, 'f1-score': 0.7787182587666264, 'support': 1125}, '1': {'precision': 0.8221476510067114, 'recall': 0.6533333333333333, 'f1-score': 0.7280832095096582, 'support': 1125}, 'accuracy': 0.756, 'macro avg': {'precision': 0.7672685157688424, 'recall': 0.756, 'f1-score': 0.7534007341381423, 'support': 2250}, 'weighted avg': {'precision': 0.7672685157688424, 'recall': 0.756, 'f1-score': 0.75340

  _warn_prf(average, modifier, msg_start, len(result))


{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1125}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1125}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 2250}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 2250}}
<class 'dict'>
{'0': {'precision': 0.7105064247921391, 'recall': 0.8355555555555556, 'f1-score': 0.7679738562091503, 'support': 1125}, '1': {'precision': 0.8004314994606256, 'recall': 0.6595555555555556, 'f1-score': 0.7231968810916178, 'support': 1125}, 'accuracy': 0.7475555555555555, 'macro avg': {'precision': 0.7554689621263824, 'recall': 0.7475555555555555, 'f1-score': 0.745585368650384, 'support': 2250}, 'weighted avg': {'precision': 0.7554689621263824, 'recall': 0.7475555555555555, 'f1-score': 0.745585368650384, 'support': 2250}}
<class 'dict'>


In [17]:
#RandomForestClassifier without Gridsearch

from sklearn.ensemble import RandomForestClassifier


tup_bots_forest = bots(tuple_of_generators[0], tuple_of_generators[1], RandomForestClassifier(n_jobs=-1, random_state=42))

{'0': {'precision': 0.7456896551724138, 'recall': 0.7688888888888888, 'f1-score': 0.7571115973741794, 'support': 1125}, '1': {'precision': 0.7614678899082569, 'recall': 0.7377777777777778, 'f1-score': 0.7494356659142213, 'support': 1125}, 'accuracy': 0.7533333333333333, 'macro avg': {'precision': 0.7535787725403353, 'recall': 0.7533333333333333, 'f1-score': 0.7532736316442004, 'support': 2250}, 'weighted avg': {'precision': 0.7535787725403355, 'recall': 0.7533333333333333, 'f1-score': 0.7532736316442004, 'support': 2250}}
<class 'dict'>
{'0': {'precision': 0.73502722323049, 'recall': 0.72, 'f1-score': 0.7274360125729681, 'support': 1125}, '1': {'precision': 0.725609756097561, 'recall': 0.7404444444444445, 'f1-score': 0.7329520457545093, 'support': 1125}, 'accuracy': 0.7302222222222222, 'macro avg': {'precision': 0.7303184896640255, 'recall': 0.7302222222222222, 'f1-score': 0.7301940291637388, 'support': 2250}, 'weighted avg': {'precision': 0.7303184896640255, 'recall': 0.73022222222222

In [32]:
best_rf = best_clf(tup_bots_forest, 1) #best performing classifier
print(best_rf)

<__main__.ClassifierBot object at 0x0000020267B36A88>


In [30]:
print(tup_bots_forest[0])

<list_iterator object at 0x0000020267C8BB88>


In [27]:
def hi(r):
    lst = [1, 2, 3, 4, 5, 6, 7]
    print(lst[0])
    new_lst = []
    for i in range(0, r):
        clf = lst.pop()
        new_lst.append(clf)
    return new_lst

In [28]:
print(hi(3))
print(hi(0))
print(hi(1))

1
[7, 6, 5]
1
[]
1
[7]


In [46]:
#classification report
from sklearn.metrics import accuracy_score,recall_score,precision_score,recall_score,f1_score
import warnings
warnings.filterwarnings('ignore')

def report(prediction, real_labels, data):
    print("classification report as follows: ")
    print(f'   Accuracy: {accuracy_score(prediction, real_labels)}')
    print(f'   Precision: {precision_score(prediction, real_labels,average="macro")}')
    print(f'   recall: {recall_score(prediction, real_labels,average="macro")}')
    print(f'   F1 measure: {f1_score(prediction, real_labels,average="macro")}')
    print('Show 5 example of correctly classified datapoint: ')
    if data[prediction==real_labels].shape[0] > 5:
        display(data[prediction==real_labels].iloc[:5,:])
    else:
        display(data[prediction==real_labels])
    print('Show 5 example of wrongly classified datapoint: ')
    if data[prediction!=real_labels].shape[0] > 5:
        display(data[prediction!=real_labels].iloc[:5,:])
    else:
        display(data[prediction!=real_labels])

In [132]:
val_x = vectorizer.transform(validation_set['Tokens'])
val_x = transformer.transform(val_x)

predict = ntwk.predict(val_x)
report(predict, validation_set['label'], validation_set[['Tokens','Label']])

classification report as follows: 
   Accuracy: 0.8168943476626144
   Precision: 0.7084592624109877
   recall: 0.7811589138333501
   F1 measure: 0.7309291098045785
Show 5 example of correctly classified datapoint: 


Unnamed: 0,Tokens,Label
23121,rt strategic vote kat food truly awful #mkr,0
35928,pancakes proof deity love us #mkr,0
45316,sick see fuck asshole bitch make chain latters...,1
42230,i'm try get insight trans issue definitely gro...,0
40128,make,0


Show 5 example of wrongly classified datapoint: 


Unnamed: 0,Tokens,Label
48658,lol ralph guy still moi era,1
30437,kill muslims oppose kill ezidis christians non...,0
28396,single men cannot adopt,1
44924,muslim brotherhood usa hundred years liken say...,0
7346,rt #mosul christian pastor #paul_jacob sentenc...,1
