### Imports

In [1]:
import pandas as pd
import nltk, re, string
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import scipy.sparse as sp
from sklearn import svm
from nltk.corpus import stopwords

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
import itertools
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier

### Preprocess Data

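Read `train.csv` into a DataFrame and drop the rows that have no comment.
- For training: shuffle the rows and split them 80:20 into training and validation partitions
- For testing: train on all of `train.csv` and evaluate on `impermium_verification_labels.csv`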

In [2]:
def load_Dataset(run="train", seed=None):
    """Load the comment data and return a ``(df_train, df_test)`` pair.

    Rows of ``train.csv`` whose ``Comment`` is null are dropped, and the
    remaining rows are shuffled as whole rows so every comment keeps its label.

    Parameters
    ----------
    run : str
        ``"train"`` splits the shuffled rows 80:20 into a training frame and a
        validation frame.  ``"test"`` uses every row for training and reads
        the evaluation frame from ``impermium_verification_labels.csv``.
    seed : int or None
        Optional seed for the shuffle; pass a fixed value for a reproducible
        split.  ``None`` keeps the split random.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(df_train, df_test)``.

    Raises
    ------
    ValueError
        If ``run`` is neither ``"train"`` nor ``"test"``.
    """
    if run not in ("train", "test"):
        raise ValueError("run must be 'train' or 'test', got %r" % (run,))

    df = pd.read_csv("train.csv")
    df = df[df["Comment"].notnull()]
    # The previous `df.apply(np.random.permutation)` threw its result away, so
    # the data was never shuffled; it also permuted each column independently,
    # which would have separated comments from their labels.  Shuffle rows.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    if run == "train":
        cut = round(0.8 * len(df))
        df_train = df[:cut]
        df_test = df[cut:]
    else:
        df_train = df
        df_test = pd.read_csv("impermium_verification_labels.csv")
    return df_train, df_test

### Tokenization

This function takes a text and returns its tokens after the following steps:
* Tokenize with nltk's `TweetTokenizer` (user handles stripped, elongated words shortened)
* Lemmatize each token with `WordNetLemmatizer`
* Stem the lemmatized tokens with `PorterStemmer`

In [3]:
def build_tokens(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens.

    The text is split with nltk's ``TweetTokenizer`` (``strip_handles=True``,
    ``reduce_len=True``); each token is lemmatized with ``WordNetLemmatizer``
    and the result is stemmed with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        One processed token per token produced by the tokenizer.
    """
    tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    # Build the lemmatizer and stemmer once instead of once per token.
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokenizer.tokenize(text)]

### Feature Extraction

In [4]:
#fn to load the bad words file which will be used to normalize the text
def loadBW():
    """Read ``badwords.txt`` and return a ``{variant: replacement}`` dict.

    Each line is stripped, lower-cased and split on ``","``.  Only lines that
    yield exactly two fields are used; both fields are stripped of surrounding
    whitespace.  Later lines override earlier ones with the same key.

    Returns
    -------
    dict of str to str
    """
    bwMap = dict()
    # `with` guarantees the file is closed; the handle used to be leaked.
    with open("badwords.txt", "r") as f:
        for line in f:
            # Named `fields` rather than `sp` to avoid shadowing scipy.sparse.
            fields = line.strip().lower().split(",")
            if len(fields) == 2:
                bwMap[fields[0].strip()] = fields[1].strip()
    return bwMap

#fn to preprocess and normalize the text
def normalize(f):
    """Lower-case and normalize every string in ``f``.

    Each string goes through these steps, in order:

    1. lower-casing;
    2. literal escape residue (backslash followed by ``n``, ``t``, ``xa0``,
       ``xc2``) replaced by a space;
    3. slang and contraction expansion (``" u "`` -> ``" you "``,
       ``"won't"`` -> ``"will not"``, ...);
    4. space-delimited replacement of every key from ``loadBW()`` by its value;
    5. crude suffix stripping (``ies`` -> ``y``, trailing ``s`` and ``ing``
       removed, ``"tard "`` removed);
    6. runs of two or more symbols from ``*$%&#@`` after a space -> ``xexp``,
       space-delimited digit runs -> ``DD``, and ``<...>`` tags removed.

    Parameters
    ----------
    f : iterable of str
        Texts to normalize.

    Returns
    -------
    list of str
        Normalized texts, in input order.
    """
    # Plain (non-regex) substitutions.  Order matters: e.g. "won't" must be
    # expanded before the generic "'t" rule.
    literal_rules = [
        ("\\n", " "), ("\\t", " "), ("\\xa0", " "), ("\\xc2", " "),
        (" u ", " you "), (" em ", " them "), (" da ", " the "),
        (" yo ", " you "), (" ur ", " you "),
        ("won't", "will not"), ("can't", "cannot"), ("i'm", "i am"),
        (" im ", " i am "), ("ain't", "is not"),
        ("'ll", " will"), ("'t", " not"), ("'ve", " have"),
        ("'s", " is"), ("'re", " are"), ("'d", " would"),
    ]
    # Bad-word variants are only replaced when surrounded by spaces.
    literal_rules.extend(
        (" " + key + " ", " " + value + " ") for key, value in loadBW().items()
    )

    def normalize_one(x):
        x = x.lower()
        for old, new in literal_rules:
            x = x.replace(old, new)

        # stemming; the strip() after each step is significant because the
        # patterns anchor on spaces and end-of-string
        x = re.sub("ies( |$)", "y ", x).strip()
        x = re.sub("s( |$)", " ", x).strip()
        x = re.sub("ing( |$)", " ", x).strip()
        x = x.replace("tard ", " ")

        x = re.sub(" [*$%&#@][*$%&#@]+", " xexp ", x).strip()
        x = re.sub(" [0-9]+ ", " DD ", x).strip()
        # Raw string: "\S" in a normal string literal is an invalid escape.
        x = re.sub(r"<\S*>", "", x).strip()
        return x

    return [normalize_one(x) for x in f]

In [5]:
# create tfidf vectors with ch2 selection
def ngrams(train, test, mn=1, mx=1, nm=500, analyzer_char=False, do_norm=False, y_train=None):
    """Build TF-IDF n-gram features and keep the ``nm`` best by chi-squared.

    Parameters
    ----------
    train, test : iterable of str
        Training and test texts.  The vectorizer and the selector are fitted
        on ``train`` only.
    mn, mx : int
        Smallest and largest n-gram size.
    nm : int
        Number of features to keep.  Selection is skipped when the vocabulary
        already has ``nm`` features or fewer.
    analyzer_char : bool
        Use character n-grams instead of word n-grams.
    do_norm : bool
        Run ``normalize`` on both ``train`` and ``test`` before vectorizing.
    y_train : array-like or None
        Class labels of ``train`` used to score features.  When omitted the
        function keeps its previous behaviour and scores features against the
        training texts themselves; a warning is issued because that ranking
        ignores the class labels.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` sparse feature matrices.
    """
    if do_norm:
        train = normalize(train)
        # The test texts must go through the same preprocessing as the
        # training texts, otherwise they are matched against the wrong vocabulary.
        test = normalize(test)

    analyzer_type = 'char' if analyzer_char else 'word'

    # ngram_range is (min_n, max_n); the bounds were previously swapped,
    # which only worked because every caller passed mn == mx.
    vectorizer = TfidfVectorizer(ngram_range=(mn, mx), sublinear_tf=True, analyzer=analyzer_type)

    X_train = vectorizer.fit_transform(train)
    X_test = vectorizer.transform(test)

    if nm < X_train.shape[1]:
        if y_train is None:
            import warnings
            warnings.warn(
                "ngrams() called without y_train: chi2 feature selection is "
                "being fitted against the training texts instead of class labels",
                stacklevel=2,
            )
            y_train = train
        ch2 = SelectKBest(chi2, k=nm)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    return X_train, X_test

# ngrams for the word following "you are"
def specialCases(train, test):
    """Unigram TF-IDF features for the text that follows "you are".

    For every comment, the pieces that follow each occurrence of
    ``"you are"``, ``"you're"`` or ``" ur "`` are normalized, cut at the first
    ``.`` or ``?``, and joined into one string.  Those strings, built for both
    ``train`` and ``test``, are passed to ``ngrams`` (unigrams, up to 100
    features, with normalization).

    Previously the extracted text was computed for ``train`` only and then
    discarded, so the returned features were plain unigrams of the full
    comments.

    Parameters
    ----------
    train, test : iterable of str
        Training and test comments.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as returned by ``ngrams``.
    """
    def text_after_you_are(texts):
        # Per comment: the pieces following each "you are"-style marker.
        pieces_per_text = [
            x.lower().replace("you are"," SSS ").replace("you're"," SSS ").replace(" ur ", " SSS ").split("SSS")[1:]
            for x in texts
        ]
        # normalize() treats every string independently, so normalizing all
        # pieces in one call gives the same result as one call per comment.
        flat = iter(normalize([piece for pieces in pieces_per_text for piece in pieces]))

        out = []
        for pieces in pieces_per_text:
            fts = " "
            for _ in pieces:
                # Keep only the first clause (up to the first "." or "?").
                w = next(flat).strip().replace("?",".").split(".")
                fts = fts + " " + w[0]
            out.append(fts)
        return out

    X_train, X_test = ngrams(text_after_you_are(train), text_after_you_are(test), 1, 1, 100, do_norm=True)
    return X_train, X_test


In [6]:
# tfidf feature extraction and chi2 selection
def feature_extraction(df_train, df_test):
    """Build the stacked feature matrices for the ``Comment`` columns.

    Six TF-IDF n-gram blocks (word 1-, 2- and 3-grams; character 4-, 5- and
    3-grams), each reduced by ``ngrams``, are followed by the
    ``specialCases`` block and concatenated column-wise in that order.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` sparse matrices produced by ``sp.hstack``.
    """
    # An earlier variant used a single TfidfVectorizer over word 1-3 grams
    # with build_tokens as tokenizer; the per-n-gram blocks below are used now.
    train_text = df_train["Comment"]
    test_text = df_test["Comment"]

    # (n-gram size, features kept, character-level?)
    block_specs = [
        (1, 2000, False),
        (2, 4000, False),
        (3, 100, False),
        (4, 1000, True),
        (5, 1000, True),
        (3, 2000, True),
    ]

    train_blocks = []
    test_blocks = []
    for size, keep, by_char in block_specs:
        block_train, block_test = ngrams(train_text, test_text, size, size, keep, analyzer_char=by_char)
        train_blocks.append(block_train)
        test_blocks.append(block_test)

    special_train, special_test = specialCases(train_text, test_text)
    train_blocks.append(special_train)
    test_blocks.append(special_test)

    return sp.hstack(train_blocks), sp.hstack(test_blocks)

### Classifier

In [10]:
def classify_train(clf_type, X_train, train_category):
    """Create the classifier named by ``clf_type``, fit it and return it.

    Parameters
    ----------
    clf_type : str
        One of ``"logreg"``, ``"svm_rbf"``, ``"svm_linear"``, ``"sgd"``,
        ``"nb"``, ``"nn"`` or ``"ensemble"`` (soft-voting combination of a
        logistic regression and a linear-kernel SVC).
    X_train : array-like or sparse matrix
        Training features.
    train_category : array-like
        Training labels.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``clf_type`` is not one of the supported names (previously the
        function silently returned ``None``).
    """
    if clf_type == "logreg":
        clf = linear_model.LogisticRegression(C=8.25, max_iter=3000, tol=1e-8)
    elif clf_type == "svm_rbf":
        clf = svm.SVC(kernel='rbf', gamma=0.8, C=1, decision_function_shape="ovr", probability=True)
    elif clf_type == "svm_linear":
        clf = svm.SVC(kernel='linear', probability=True)
    elif clf_type == "sgd":
        # `n_iter` no longer exists in scikit-learn; max_iter with tol=None
        # runs the same fixed number of epochs.
        clf = linear_model.SGDClassifier(max_iter=2000, tol=None, loss='modified_huber',
                                         penalty='elasticnet', n_jobs=-1)
    elif clf_type == "nb":
        clf = MultinomialNB()
    elif clf_type == "nn":
        clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(500,150),
                            max_iter=50000, random_state=1)
    elif clf_type == "ensemble":
        # Ensemble of different classifiers, combined by soft voting.
        clf1 = linear_model.LogisticRegression(C=3)
        clf3 = svm.SVC(C=0.3, kernel='linear', probability=True)
        # The estimator label 'svm_rbf' is kept unchanged even though the
        # kernel is linear, so existing lookups by name keep working.
        clf = VotingClassifier(estimators=[('lr',clf1),('svm_rbf',clf3)], voting="soft")
    else:
        raise ValueError("unknown clf_type: %r" % (clf_type,))

    clf.fit(X_train, train_category)
    return clf

##### `classify_predict` takes a trained classifier and a set of features and returns the classifier's predictions; `check_val_score` returns the accuracy of predictions against the true labels

In [11]:
def classify_predict(clf, X_test):
    """Return the predictions of the fitted classifier ``clf`` for ``X_test``."""
    return clf.predict(X_test)

def check_val_score(predictions, true_vals):
    """Return the accuracy of ``predictions`` measured against ``true_vals``."""
    accuracy = metrics.accuracy_score(true_vals, predictions)
    return accuracy

## Getting it all together

### Validation run

In [12]:
# Validation run: load_Dataset() uses its default run="train", so df_test is
# the held-out 20% slice of train.csv, not the verification file.
df_train, df_test = load_Dataset()
X_train, X_test = feature_extraction(df_train, df_test)
# Fit the soft-voting ensemble on the training features and Insult labels.
clf = classify_train("ensemble", X_train, df_train.Insult)

In [None]:
# Predict on the held-out features and print the accuracy against their labels.
predictions = classify_predict(clf, X_test)
print(check_val_score(predictions, df_test.Insult))

### Final run 

In [None]:
# Final run: run="test" trains on all of train.csv and reads df_test from
# impermium_verification_labels.csv. This overwrites the validation-run
# df_train, df_test, X_train and X_test.
df_train, df_test = load_Dataset("test")
X_train, X_test = feature_extraction(df_train, df_test)

In [None]:
# Refit the ensemble on the full training features; replaces the validation-run clf.
clf = classify_train("ensemble", X_train, df_train.Insult)


In [None]:
# Predict on the verification features and print the accuracy against df_test.Insult.
predictions = classify_predict(clf, X_test)
print(check_val_score(predictions, df_test.Insult))