### Imports

In [1]:
import pandas as pd
import nltk, re, string
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import svm
from nltk.corpus import stopwords

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
import itertools
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier

### Preprocess Data

Read the csv file and make a dataframe.
- For training: Randomize and Divide it into 80:20 partitions

In [12]:
def load_Dataset(run="train"):
    df = pd.read_csv("train.csv")
    df = df[df["Comment"].notnull()]
    df.apply(np.random.permutation)
    if run=="train":
        df_train = df[:round(0.8*len(df))]
        df_test = df[round(0.8*len(df)):]
    elif run=="test":
        df_train = df
        df_test = pd.read_csv("test_with_solutions_2.csv")
    return df_train, df_test

### Tokenization

This function takes a text and does the following to return the tokens:
* Use nltk's TweetTokenizer to get tokens
* Use wordNetLemmatizer for lemmatization
* Use porterStemmer to stem the resulting tokens

In [3]:
def build_tokens(text):
    tweetTokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tweetTokenizer.tokenize(text)
    tokens = [nltk.WordNetLemmatizer().lemmatize(token) for token in tokens]
    tokens= [nltk.PorterStemmer().stem(token) for token in tokens]
    return tokens

### Feature Extraction

In [4]:
# tfidf feature extraction and chi2 selection
def feature_extraction(df_train, df_test):
    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,3), max_df= 0.5, analyzer= "word", tokenizer= build_tokens ,min_df=10,max_features=10000) #current best for max_features = 4000   
#     count_vectorizer = CountVectorizer(analyzer="word", tokenizer=build_tokens, ngram_range=(1,3), max_features=1000)
    
    X_train = vectorizer.fit_transform(df_train["Comment"]).todense()
    X_test = vectorizer.transform(df_test["Comment"]).todense()

    ch2 = SelectKBest(chi2, k = 'all') #current best for k=2300(0.8815625)
    X_train = ch2.fit_transform(X_train, df_train.Comment)
    X_test = ch2.transform(X_test)
    
    ####### Debug run #######
    # feature_names = vectorizer.get_feature_names()
    # feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    
    return X_train, X_test

### Classifier

In [5]:
# 
def classify_train(clf_type, X_train, train_category):
    if clf_type == "logreg":
#         logreg = linear_model.LogisticRegression(tol=1e-8, penalty='l2', C=4, max_iter=1000)
        logreg = linear_model.LogisticRegression(C=8.25, max_iter=3000, tol=1e-8)
        logreg.fit(X_train, train_category)
        return logreg
    elif clf_type == "svm_rbf":
        clf = svm.SVC(kernel='rbf', gamma=0.8, C=1, decision_function_shape="ovr", probability=True)
        clf.fit(X_train, train_category)
        return clf
    elif clf_type == "svm_linear":
        clf = svm.SVC(kernel = 'linear', probability = True)
        clf.fit(X_train, train_category)
        return clf
    elif clf_type == "sgd":
        clf = linear_model.SGDClassifier(n_iter=2000,loss = 'modified_huber', penalty = 'elasticnet', n_jobs=-1)
        clf.fit(X_train,train_category)
        return clf
    elif clf_type == "nb":
        clf = MultinomialNB()
        clf.fit(X_train,train_category)
        return clf
    elif clf_type == "nn":
        clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(500,150), max_iter=50000, random_state=1)
        clf.fit(X_train,train_category)
        return clf
    # ensemble of different classifiers. We used a soft voting measure to combine the output
    elif clf_type == "ensemble":
        clf1 = linear_model.LogisticRegression(C=8.25, max_iter=3000, tol=1e-8)
        clf3 = svm.SVC(kernel='rbf', gamma=0.8, C=1, decision_function_shape="ovr",probability=True)
        clf4 = linear_model.SGDClassifier(n_iter=2000,loss = 'modified_huber', penalty = 'elasticnet', n_jobs=-1)
        eclf = VotingClassifier(estimators=[('lr',clf1),('svm_rbf',clf3), ('sgd' , clf4)], voting="soft")
        eclf = eclf.fit(X_train,train_category)
        return eclf

##### This function takes a trained classifier and a set of features as input and returns the prediction of the classifier

In [6]:
def classify_predict(clf, X_test):
    predictions = clf.predict(X_test)
    return predictions

def check_val_score(predictions, true_vals):
    return metrics.accuracy_score(true_vals,predictions)

## Getting it all together

### Validation run

In [7]:
df_train, df_test = load_Dataset()
X_train, X_test = feature_extraction(df_train, df_test)
clf = classify_train("ensemble", X_train, df_train.Insult)

In [8]:
predictions = classify_predict(clf, X_test)
print(check_val_score(predictions, df_test.Insult))

0.850443599493


### Final run 

In [14]:
df_train, df_test = load_Dataset("test")
X_train, X_test = feature_extraction(df_train, df_test)

In [15]:
clf = classify_train("ensemble", X_train, df_train.Insult)


0.611832611833


In [16]:
predictions = classify_predict(clf, X_test)
print(check_val_score(predictions, df_test.Insult))

0.611832611833
