# Scripts for model training and testing
## This script is adapted from the replication code for the results in Davidson et al. 2017, "Automated Hate Speech Detection and the Problem of Offensive Language"
## original paper: https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665
## original code: https://github.com/t-davidson/hate-speech-and-offensive-language

In [None]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

## Loading the data

In [None]:
# Load the combined, labelled tweet dataset (path is relative to this notebook).
df = pd.read_csv("../data/combined_data.csv")

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Summary statistics of the DataFrame (pandas `describe`).
df.describe()

In [None]:
# List the column names; see the "Columns key" note that follows.
df.columns

### Columns key:
class = class label for majority of CF users.

    0 - normal
    1 - bad

tweet = raw tweet text


In [None]:
# Histogram of the class labels, to eyeball the class balance.
df['class'].hist()

In [None]:
# Raw tweet text column; this is the input to the TF-IDF vectorizer and to
# the hand-crafted feature extraction below.
tweets=df.tweet

## Feature generation

In [None]:
# Tokens dropped by the vectorizer: NLTK's English stop words plus a few
# Twitter-specific noise tokens.
other_exclusions = ["#ff", "ff", "rt"]
stopwords = nltk.corpus.stopwords.words("english")
stopwords += other_exclusions

# Shared Porter stemmer used by `tokenize`.
stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Normalise a raw tweet before tokenisation.

    The substitutions below are applied in order, each one on the result of
    the previous step:
    1) runs of whitespace are collapsed to a single space
    2) the standalone retweet marker 'RT' is removed
    3) runs of '!' are collapsed to a single '!'
    4) urls are replaced with URL
    5) mentions (@user) are removed
    6) hashtags are replaced with HASHTAG

    This allows us to get standardized counts of urls and hashtags
    without caring about the specific people mentioned.

    Args:
        text_string: Raw tweet text.

    Returns:
        The cleaned text as a string.
    """
    # Raw strings keep the regex escapes (\s, \w, \(...) out of Python's own
    # string-escape processing.
    space_pattern = r'\s+'
    punc_pattern = r'!+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'
    # Word boundaries so that only the retweet marker is removed, not the
    # letters "RT" inside another upper-case word.
    retweet_pattern = r'\bRT\b'

    # Every step must consume the previous step's output; restarting from
    # `text_string` would silently discard the earlier substitutions.
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(retweet_pattern, '', parsed_text)  # remove RT
    parsed_text = re.sub(punc_pattern, '!', parsed_text)
    parsed_text = re.sub(giant_url_regex, 'URL', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)  # remove mention
    parsed_text = re.sub(hashtag_regex, 'HASHTAG', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Lower-case a tweet, keep only its alphabetic runs and stem each one.

    Anything that is not an ASCII letter acts as a separator, so punctuation,
    digits and whitespace are all dropped. Returns a list of stemmed tokens.
    """
    pieces = re.split("[^a-zA-Z]+", tweet.lower())
    # Splitting can leave empty strings at either end; skip them.
    return [stemmer.stem(piece) for piece in pieces if piece]

def basic_tokenize(tweet):
    """Lower-case a tweet and split it into unstemmed tokens.

    Letters and the punctuation marks . , ! ? are kept inside tokens; every
    other character acts as a separator. Returns a list of strings.
    """
    pieces = re.split("[^a-zA-Z.,!?]+", tweet.lower())
    # Splitting can leave empty strings at either end; skip them.
    return [piece for piece in pieces if piece]

# TF-IDF vectorizer over word 1- to 3-grams.
# Each tweet is first cleaned by `preprocess`, then split and stemmed by
# `tokenize`.
# NOTE(review): the stop-word list is unstemmed while the tokens are stemmed,
# so some stop words may slip through — confirm this is intended.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,  # no row normalisation of the TF-IDF weights
    decode_error='replace',
    max_features=10000,  # cap on the vocabulary size
    min_df=5,
    max_df=0.75
    )

In [None]:
# Silence FutureWarning messages from here on.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
#Construct tfidf matrix and get relevant scores
# `.toarray()` densifies the matrix so that it can later be concatenated with
# the hand-crafted feature array.
tfidf = vectorizer.fit_transform(tweets).toarray()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and fall back to the old name on older releases.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None)
if _get_feature_names is None:
    _get_feature_names = vectorizer.get_feature_names

vocab = {v: i for i, v in enumerate(_get_feature_names())}  # term -> column index
idf_vals = vectorizer.idf_
idf_dict = {i: idf_vals[i] for i in vocab.values()}  # keys are indices; values are IDF scores
print(stopwords)

In [None]:
#Now get other features
# VADER sentiment analyzer instance used by `other_features`.
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Count the URLs in a tweet.

    The text is normalised in these steps:
    1) runs of whitespace are collapsed to a single space
    2) urls are replaced with URLHERE
    3) mentions are removed
    4) hashtags are replaced with HASHTAGHERE

    Only the number of URLHERE placeholders left after these steps is
    reported; mentions and hashtags are not counted.

    Args:
        text_string: Raw tweet text.

    Returns:
        The number of URLs found, as an int.
    """
    # Raw strings keep the regex escapes (\s, \w, \(...) out of Python's own
    # string-escape processing.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    # mention & hashtag are ignored in the result, but the substitutions are
    # kept because they can swallow a placeholder glued to '@' or '#'.
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return parsed_text.count('URLHERE')

def other_features(tweet):
    """Return the hand-crafted (non-TF-IDF) features for one tweet.

    These include sentiment scores, text and readability scores, as well as
    a Twitter-specific URL count.

    Args:
        tweet: Raw tweet text.

    Returns:
        A list of 14 numbers, in this order: FKRA, FRE, syllable count,
        average syllables per word, number of characters inside words,
        total characters of the raw tweet, number of raw terms, number of
        words, number of unique words, VADER neg / pos / neu / compound
        scores, and the URL count.
    """
    sentiment = sentiment_analyzer.polarity_scores(tweet)

    text = preprocess(tweet)  # Get text only
    words = text.split()

    syllables = textstat.syllable_count(text)
    # Sum the length of each word. Iterating over the cleaned *string* would
    # visit single characters and simply yield its total length, spaces
    # included.
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words)
    # The 0.001 offsets avoid a division by zero for tweets with no words.
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words))

    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

    twitter_objs = count_twitter_objs(tweet)
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs]
    return features

def get_feature_array(tweets):
    """Stack the `other_features` vector of every tweet into a NumPy array.

    Args:
        tweets: Iterable of raw tweet strings.

    Returns:
        A NumPy array with one row of features per tweet.
    """
    rows = [other_features(tweet) for tweet in tweets]
    return np.array(rows)

In [None]:
# Column names for the hand-crafted features, in the same order as the list
# returned by `other_features`.
other_features_names = [
    "FKRA",
    "FRE",
    "num_syllables",
    "avg_syl_per_word",
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
    "num_urls",
]

In [None]:
# Hand-crafted feature array: one row of `other_features` values per tweet.
feats = get_feature_array(tweets)

In [None]:
#Now join them all up
# Column-wise concatenation: TF-IDF columns first, hand-crafted features last.
M = np.concatenate([tfidf,feats],axis=1)

In [None]:
# Shape of the combined feature matrix.
M.shape

In [None]:
#Finally get a list of variable names
# Invert the term -> column-index mapping into a list ordered by column.
variables = [''] * len(vocab)
for term, column in vocab.items():
    variables[column] = term

# Same column order as M: TF-IDF terms first, then the hand-crafted features.
feature_names = variables + other_features_names

# Running the model

The best model was selected using a GridSearch with 5-fold CV.

In [None]:
# Feature matrix as a DataFrame and the integer class labels as the target.
X = pd.DataFrame(M)
y = df['class'].astype(int)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows as a test set; the seed is fixed via random_state.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
# Key of the classifier to train; it is passed to both `a` (estimator) and
# `b` (parameter grid) below, so it must be a key they both know.
my_model = 'rf'

def a(x):
    """Return a new, default-configured estimator for the model key `x`.

    Known keys: 'dtc', 'lr', 'lsvc', 'mlp', 'knn', 'rf', 'gnb'.
    Raises KeyError for any other key.
    """
    estimator_classes = {
        'dtc': DecisionTreeClassifier,
        'lr': LogisticRegression,
        'lsvc': LinearSVC,
        'mlp': MLPClassifier,
        'knn': KNeighborsClassifier,
        'rf': RandomForestClassifier,
        'gnb': GaussianNB,
    }
    chosen_class = estimator_classes[x]
    return chosen_class()

In [None]:
# Two-step pipeline: feature selection driven by an L2-penalised logistic
# regression, followed by the classifier chosen through `my_model`.
pipe = Pipeline(
        [ ('select', SelectFromModel(LogisticRegression(class_weight='balanced',C=0.01, penalty='l2'))),
        ('model', a(my_model))])

In [None]:
# Parameter grids for GridSearchCV, one per model key. The `model__` prefix
# routes each parameter to the pipeline step named 'model'. Values listed in
# the trailing comments are alternatives that are currently not searched.

# Decision tree
param_grid_dtc = [{
    'model__class_weight': ["balanced"],
    'model__criterion': ["gini"], # , "entropy"
    'model__splitter': ["best"], # , "random"
    'model__max_depth': [50], # None, 25 , 75, 100
}]

# Logistic regression
param_grid_lr = [{ 
    'model__class_weight': ["balanced"],
    'model__penalty': ['l1'], # 'l2', 
    'model__C': [0.05], # , 0.5, 0.1
    'model__solver': ['liblinear'], # , "lbfgs", 'sag', "newton-cg"
}]

# Linear SVC
param_grid_lsvc = [{
    'model__class_weight': ["balanced"],
    'model__penalty': ['l2'],
    'model__loss': ['hinge'], #'squared_hinge'
    'model__multi_class': ['ovr'], # 'crammer_singer'
    'model__C': [0.01], # 0.05, 0.1, 0.5, 1
    'model__max_iter': [2000],
}]

# NOTE(review): this grid is not referenced by `b` and `a` has no matching
# model key, so it is currently unused.
param_grid_cnb = [{
    'model__alpha': [0, 0.1, 0.5, 1, 1.5, 2],
}] # negative

# Multi-layer perceptron
param_grid_mlp = [{
    'model__alpha': [0.1], # 0.0001, 
    'model__activation': ['logistic'],# , 'tanh', 'relu' done
    'model__batch_size': [32], # 64, , 128
    'model__solver': ['sgd'], #'lbfgs','adam',  done
    'model__learning_rate': ['adaptive'],
    'model__max_iter': [2000],
}]

# k-nearest neighbours
param_grid_knn = [{
    'model__n_neighbors': [5], # 1, 3, , 7 done
    'model__weights': ['distance'], # 'uniform', done
    'model__algorithm': ['ball_tree'], # 'auto', , 'kd_tree' done
    'model__p': [1], # , 2, 3, 4 done
    'model__n_jobs': [-1],
}]

# Empty grid: the model is fitted with its default parameters only.
param_empty = [{}]

def b(x):
    """Return the GridSearchCV parameter grid for the model key `x`.

    'rf' and 'gnb' map to the empty grid, i.e. default parameters only.
    Raises KeyError for an unknown key.
    """
    grids_by_model = {
        'dtc': param_grid_dtc,
        'lr': param_grid_lr,
        'lsvc': param_grid_lsvc,
        'mlp': param_grid_mlp,
        'knn': param_grid_knn,
        'rf': param_empty,
        'gnb': param_empty,
    }
    return grids_by_model[x]

In [None]:
# Grid search over the selected model's parameter grid, scored with 5-fold
# stratified cross-validation on the training split.
# NOTE(review): shuffle=True without random_state means the folds differ
# between runs — set a seed if reproducibility matters.
grid_search = GridSearchCV(pipe, 
                           b(my_model),
                           cv=StratifiedKFold(n_splits=5, shuffle=True).split(X_train, y_train), 
                           verbose=2,
                           )

In [None]:
# Run the grid search on the training split.
model = grid_search.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_preds = model.predict(X_test)

## Evaluating the results

In [None]:
# Text classification report for the held-out predictions, with 5 decimals.
report = classification_report( y_test, y_preds, digits=5 )

***Note: Results in paper are from best model retrained on the entire dataset (see the other notebook). Here the results are reported after using cross-validation and only for the held-out set.***

In [None]:
# `time` is used by the result-logging cell below to timestamp each run.
import time
# Print the held-out report followed by the grid-search summary.
print(report)
print(grid_search.best_params_)
print('Best score:  ', grid_search.best_score_)
print('CV results:  ', grid_search.cv_results_)

In [None]:
# save result to file
# Append this run's summary to the per-model log file. The `with` block
# closes the file even if one of the writes raises.
with open('final_output/lr+{}_grid_output.txt'.format(my_model), 'a') as f:
    f.write('------------------------------------\n')
    f.write(str(grid_search.best_params_) + '\n')
    f.write(str(grid_search.cv_results_['params']) + '\n')
    f.write(str(time.time()) + '\n')  # timestamp (seconds since the epoch)
    f.write('------------------------------------\n')
    f.write(report)
    # Terminate the line so the score is not fused with the separator below.
    f.write('Best score:  ' + str(grid_search.best_score_) + '\n')
    f.write('------------------------------------\n')

In [None]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix = confusion_matrix(y_test,y_preds)
# matrix_proportions = np.zeros((2,2))
# for i in range(0,2):
#     matrix_proportions[i,:] = confusion_matrix[i,:]/float(confusion_matrix[i,:].sum())
# names=['Normal','Bad']
# confusion_df = pd.DataFrame(matrix_proportions, index=names,columns=names)
# plt.figure(figsize=(5,5))
# seaborn.heatmap(confusion_df,annot=True,annot_kws={"size": 12},cmap='gist_gray_r',cbar=False, square=True,fmt='.2f')
# plt.ylabel(r'True categories',fontsize=14)
# plt.xlabel(r'Predicted categories',fontsize=14)
# plt.tick_params(labelsize=12)

# #Uncomment line below if you want to save the output
# plt.savefig('final_output/lr+{}_grid_confusion'.format(my_model) + str(time.time()) + '.pdf')

In [None]:
#True distribution
# Histogram of the true labels over the whole dataset (not only the test split).
y.hist()

In [None]:
# Histogram of the predicted labels for the held-out test split.
pd.Series(y_preds).hist()