In [21]:
import pandas as pd  #Imports for converting dataframe to train_dev splits
import numpy as np
from pathlib import Path
import nltk, spacy
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
from collections import Counter
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.model_selection import train_test_split 

pd.options.display.float_format = '{:,.0f}'.format
in_file = Path.cwd().parents[0] / 'Processed_datasets' / 'final_db.csv'
my_stopwords = stopwords.words('english')
stopwords = set(STOPWORDS).union(my_stopwords) #preparing stopwards list
custom_stopwords = ['hi', '\n', '\n\n', '&amp;', ' ', '.', '-',
                    'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
nlp = spacy.load('en_core_web_sm', exclude=['ner', 'tok2vec', 'tagger', 'paerser', 'senter', 'lemmatizer', 'attribute_ruler']) # using only for stopwords
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
CHUNK_SIZE = 1000

In [22]:
df_long = pd.read_csv(in_file, engine='python', usecols=['Text', 'oh_label'], encoding='utf-8') #not using unnecessary columns

In [23]:
print(df_long.head(15))
df_long.count()
#print(df_long['Text'].value_counts())

                                                 Text  oh_label
0   @AAlwuhaib1977 Muslim mob violence against Hin...         1
1              @Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG         0
2   @jncatron @isra_jourisra @AMPalestine Islamoph...         1
3   Finally I'm all caught up, and that sudden dea...         0
4              @carolinesinders @herecomesfran *hugs*         0
5   Please, PLEASE start using "is your discernmen...         0
6   @aymannathem As soon as ISIS chased all the mi...         0
7   @Ali_Gharib @MaxBlumenthal Glad you like it. h...         0
8   @HuffPostRelig Islam invaded and conquered 2/3...         1
9   @semzyxx Do you approve of your pedophile prop...         1
10  @watan71969 @geeky_zekey Problem with vile Mus...         1
11  @Skawtnyc @athenahollow @twoscooters i don't t...         0
12  @dylanw that's cool. next time when a woman ta...         0
13  RT @hadi_elis: Erdogan's Egyptian Nightmare \n...         0
14  RT @mykitchenrules: Our judges are a

Text        25596
oh_label    25596
dtype: int64

In [24]:
#setting up baseline pipeline
from nltk.tokenize import TweetTokenizer #I chose to tokenize with this, as it gets rid of @ handlers
from sklearn.feature_extraction.text import TfidfVectorizer #easy idf and stopword removal
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler #moderately imbalanced dataset so randomundersampling - approx 2:1 ratio
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

In [25]:


PARAM_GRID = [{
        'vect__stop_words': [None, ALL_STOP_WORDS],
        #'clf__kernel' : ['sigmoid', 'rbf'],
        #'clf__loss' : ['hinge', 'squared_hinge'],
       # 'clf__activation' : ['tanh', 'relu', 'logistic'],
        #'clf__hidden_layer_sizes' : [(10, 20, 30), (15, 25, 10), (40, 30, 25)],
        'vect__use_idf' : [True, False]
    }]


def make_pipeline(df, clf, params, standardise=True):
    if standardise == True:
        xs, ys = df['Text'].values, df['oh_label'].values
        x_train, x_dev, y_train, y_dev = train_test_split(
            xs, ys, train_size=.85, random_state=42, stratify=ys)
        tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
        vect = TfidfVectorizer(
            max_df=.9, min_df=25, strip_accents='unicode', tokenizer=tknzr.tokenize)
        scaler = StandardScaler(with_mean=False)
        sampler = RandomUnderSampler(random_state=42)
        pipeline = Pipeline([('vect', vect), ('scale', scaler),
                             ('sampler', sampler), ('clf', clf)])
        grid_srch = GridSearchCV(
            estimator=pipeline, param_grid=params, refit=True, n_jobs=-1)
        grid_srch.fit(x_train, y_train)  # fit the grid_search object
        prediction = grid_srch.predict(x_dev)  # Obtain predictions and save them
        # obtain classification report of preds
        report = classification_report(y_dev, prediction, output_dict=True)
        best_est = grid_srch.best_estimator_
        print(report)
        return best_est, prediction, x_dev, y_dev
    xs, ys = df['Text'].values, df['oh_label'].values
    x_train, x_dev, y_train, y_dev = train_test_split(
         xs, ys, train_size=.85, random_state=42, stratify=ys)
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=True)
    vect = TfidfVectorizer(max_df=.9, min_df=25, strip_accents='unicode', tokenizer=tknzr.tokenize)
    scaler = StandardScaler(with_mean=False)
    sampler = RandomUnderSampler(random_state=42)
    pipeline = Pipeline([('vect', vect), ('scale', scaler),
                             ('sampler', sampler), ('clf', clf)])
    grid_srch = GridSearchCV(
        estimator=pipeline, param_grid=params, refit=True, n_jobs=-1)
    grid_srch.fit(x_train, y_train)  # fit the grid_search object
    prediction = grid_srch.predict(x_dev)  # Obtain predictions and save them
    # obtain classification report of preds
    report = classification_report(y_dev, prediction, output_dict=True)
    best_est = grid_srch.best_estimator_
    print(report)
    return best_est, prediction, x_dev, y_dev


In [10]:
from sklearn.neural_network import MLPClassifier
tup_MLP = make_pipeline(df_long, MLPClassifier(random_state=42), PARAM_GRID)



{'0.0': {'precision': 0.8696453247351451, 'recall': 0.7203357497138497, 'f1-score': 0.7879799666110184, 'support': 2621}, '1.0': {'precision': 0.560814859197124, 'recall': 0.7678424938474159, 'f1-score': 0.6481994459833795, 'support': 1219}, 'accuracy': 0.7354166666666667, 'macro avg': {'precision': 0.7152300919661345, 'recall': 0.7440891217806328, 'f1-score': 0.7180897062971989, 'support': 3840}, 'weighted avg': {'precision': 0.7716077368469035, 'recall': 0.7354166666666667, 'f1-score': 0.7436069315471925, 'support': 3840}}


In [20]:
tup_MLP

(Pipeline(steps=[('vect',
                  TfidfVectorizer(max_df=0.9, min_df=25, strip_accents='unicode',
                                  tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f89f1ca1760>>)),
                 ('scale', StandardScaler(with_mean=False)),
                 ('sampler', RandomUnderSampler(random_state=42)),
                 ('clf',
                  MLPClassifier(activation='logistic',
                                hidden_layer_sizes=(15, 25, 10),
                                random_state=42))]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array(['.@BlackOpal80 Like, if #GamerGate really just hated women, why bother with GG?  Need NO permission to hate. They could just hate/play games.',
        '"@londonmum\\xa0 Go out for another cigarette.\\xa0 You are having nicotine withdrawal"',
        '"Respectfully, I think you\'re wrong Alain. It is genetic. If physiological traits are determined by race, then wha

In [17]:
classi = MLPClassifier

import warnings

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, random_state=1,
                    learning_rate_init=.1)

# this example won't converge because of CI's time constraints, so we catch the
# warning and are ignore it here
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning,
                            module="sklearn")
    mlp.fit(X_train, y_train)

print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))

fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
               vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()


ModuleNotFoundError: No module named 'VisualizeNN'

In [16]:
tup_svr = make_pipeline(df_long, SVR(), PARAM_GRID)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
tup_linear_svc = make_pipeline(df_long, LinearSVC(random_state=42), PARAM_GRID)

In [None]:
tup_linear_svc_standardised = make_pipeline(df_long, LinearSVC(random_state=42), PARAM_GRID, standardise=False)

In [None]:
print(len(tup_linear_svc[1]))
print(type(tup_linear_svc[0]))
print(tup_linear_svc[0].predict(['You are an absolute assohle. Hope your religion goes and dies!']))

In [None]:
tup_svc_standardised = make_pipeline(df_long, SVC(random_state=42), PARAM_GRID)


In [None]:
tup_svc = make_pipeline(df_long, SVC(random_state=42), PARAM_GRID, standardise=False)
print(tup_svc[0])
print(type(tup_svc[0][-1])) # tup_svc[0] is the best estimator, the [-1] accesses the kernel

############################## LIME ###################
Need explainer object, instance of test data = tuple_of_generators[1][whichever dataset performed best], instance of the classifier rf = pull from list tup_bots_forst[0][whichever dataset peformed best]

In [12]:
print(tup_svc[2][2334])
print(tup_svc[3][2334])

fitted_clf = tup_svc[0][-1]
print(fitted_clf)
devset_x, devset_y = tup_svc[2], tup_svc[3] #accessing examples from the dataset for LIME


def use_lime(clf, dev_x, dev_y):
    from lime.lime_text import LimeTextExplainer
    class_names = ['Hateful', 'Non-Hateful']
    explainer = LimeTextExplainer(class_names=class_names)
    idx = 2334
    exp = explainer.explain_instance(dev_x[idx], clf.predict, num_features=6)
    tweet = dev_y[idx]
    label = dev_y[idx]
    print('probability = ', clf.predict(dev_x[idx]))
    print('true class: %d ' % label)
    print('tweet: %s' % tweet)


NameError: name 'tup_svc' is not defined

In [None]:
use_lime(fitted_clf, devset_x, devset_y)

In [None]:
#Neural Network Classifier 



In [None]:
tup_bots_MLP = bots(tuple_of_generators[0], tuple_of_generators[1], MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10,30,10,5), random_state=42, batch_size=128, max_iter=1000))

In [None]:
#Support Vector Classifier without Gridsearch


#tup_bots_svc = bots(tuple_of_generators[0], tuple_of_generators[1], SVC())


In [None]:
#RandomForestClassifier without Gridsearch

from sklearn.ensemble import RandomForestClassifier


tup_bots_forest = bots(tuple_of_generators[0], tuple_of_generators[1], RandomForestClassifier(n_jobs=-1, random_state=42))

In [None]:
best_rf = best_clf(tup_bots_forest, 1) #best performing classifier
print(best_rf)

In [None]:
print(tup_bots_forest[0])

In [None]:
def hi(r):
    lst = [1, 2, 3, 4, 5, 6, 7]
    print(lst[0])
    new_lst = []
    for i in range(0, r):
        clf = lst.pop()
        new_lst.append(clf)
    return new_lst

In [None]:
print(hi(3))
print(hi(0))
print(hi(1))

In [None]:
#classification report
from sklearn.metrics import accuracy_score,recall_score,precision_score,recall_score,f1_score
import warnings
warnings.filterwarnings('ignore')

def report(prediction, real_labels, data):
    print("classification report as follows: ")
    print(f'   Accuracy: {accuracy_score(prediction, real_labels)}')
    print(f'   Precision: {precision_score(prediction, real_labels,average="macro")}')
    print(f'   recall: {recall_score(prediction, real_labels,average="macro")}')
    print(f'   F1 measure: {f1_score(prediction, real_labels,average="macro")}')
    print('Show 5 example of correctly classified datapoint: ')
    if data[prediction==real_labels].shape[0] > 5:
        display(data[prediction==real_labels].iloc[:5,:])
    else:
        display(data[prediction==real_labels])
    print('Show 5 example of wrongly classified datapoint: ')
    if data[prediction!=real_labels].shape[0] > 5:
        display(data[prediction!=real_labels].iloc[:5,:])
    else:
        display(data[prediction!=real_labels])

In [None]:
val_x = vectorizer.transform(validation_set['Tokens'])
val_x = transformer.transform(val_x)

predict = ntwk.predict(val_x)
report(predict, validation_set['label'], validation_set[['Tokens','Label']])