### Imports

In [25]:
import pandas as pd
import nltk, re, string
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import svm
from nltk.corpus import stopwords

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
import itertools
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier

import re, collections
from nltk.corpus import words as w

### Preprocess Data

Read the csv file and make a dataframe.
- For training: Randomize and Divide it into 80:20 partitions

In [26]:
def load_Dataset(run="train"):
    df = pd.read_csv("train.csv")
    df = df[df["Comment"].notnull()]
    df.apply(np.random.permutation)
    if run=="train":
        df_train = df[:round(0.8*len(df))]
        df_test = df[round(0.8*len(df)):]
    elif run=="test":
        df_train = df
        df_test = pd.read_csv("test_with_solutions_2.csv")
    return df_train, df_test

### Tokenization

This function takes a text and does the following to return the tokens:
* Use nltk's TweetTokenizer to get tokens
* Use wordNetLemmatizer for lemmatization
* Use porterStemmer to stem the resulting tokens

In [27]:
def words(text):
    return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1   
    return model

with open("big.txt", "r") as big:
    word_corpus = big.read()
for word in w.words():
    word_corpus += word
    

NWORDS = train(words(word_corpus))
with open("list_of_abuses.txt", "r") as abuse_list:
    abuses = abuse_list.read().split()
    for abuse in abuses:
        NWORDS[abuse] = 100

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
#     print(word)
    s = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in s if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in s for c in alphabet if b]
    inserts    = [a + c + b     for a, b in s for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words):
    try:
        return [int(w) for w in words] #to take care of purely numeric words
    except:
        return set(w for w in words if w.lower() in NWORDS)

def correct(word):
    if word[0] not in alphabet: 
        return word
    else:
        word = re.sub(r'(.)\1+', r'\1\1', word)
        candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
        return max(candidates, key=NWORDS.get)

In [28]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w!@#$%^&*]+)', # To group symbols together
    r'(?:[\w_]+)', # other words
    
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    tokens = tokens_re.findall(s)
    for i in range(len(tokens)):
        clean_token = correct(tokens[i])
        tokens[i] = clean_token
    return tokens
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [29]:
def build_tokens(text):
#     tweetTokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
#     tokens = tweetTokenizer.tokenize(text)
    tokens = preprocess(text, lowercase=True)
    tokens = [nltk.WordNetLemmatizer().lemmatize(token) for token in tokens]
    tokens= [nltk.PorterStemmer().stem(token) for token in tokens]
    return tokens

In [32]:
build_tokens("Your a retard go post your head up your f%&k")

['your', 'a', 'retard', 'go', 'post', 'your', 'head', 'up', 'your', 'fuck']

### Feature Extraction

In [33]:
# tfidf feature extraction and chi2 selection
def feature_extraction(df_train, df_test):
    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,3), max_df= 0.5, analyzer= "word", tokenizer= build_tokens ,min_df=10,max_features=10000) #current best for max_features = 4000   
#     count_vectorizer = CountVectorizer(analyzer="word", tokenizer=build_tokens, ngram_range=(1,3), max_features=1000)
    
    X_train = vectorizer.fit_transform(df_train["Comment"]).todense()
    X_test = vectorizer.transform(df_test["Comment"]).todense()

    ch2 = SelectKBest(chi2, k = 'all') #current best for k=2300(0.8815625)
    X_train = ch2.fit_transform(X_train, df_train.Comment)
    X_test = ch2.transform(X_test)
    
    ####### Debug run #######
    # feature_names = vectorizer.get_feature_names()
    # feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    
    return X_train, X_test

### Classifier

In [34]:
# 
def classify_train(clf_type, X_train, train_category):
    if clf_type == "logreg":
#         logreg = linear_model.LogisticRegression(tol=1e-8, penalty='l2', C=4, max_iter=1000)
        logreg = linear_model.LogisticRegression(C=8.25, max_iter=3000, tol=1e-8)
        logreg.fit(X_train, train_category)
        return logreg
    elif clf_type == "svm_rbf":
        clf = svm.SVC(kernel='rbf', gamma=0.8, C=1, decision_function_shape="ovr", probability=True)
        clf.fit(X_train, train_category)
        return clf
    elif clf_type == "svm_linear":
        clf = svm.SVC(kernel = 'linear', probability = True)
        clf.fit(X_train, train_category)
        return clf
    elif clf_type == "sgd":
        clf = linear_model.SGDClassifier(n_iter=2000,loss = 'modified_huber', penalty = 'elasticnet', n_jobs=-1)
        clf.fit(X_train,train_category)
        return clf
    elif clf_type == "nb":
        clf = MultinomialNB()
        clf.fit(X_train,train_category)
        return clf
    elif clf_type == "nn":
        clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(500,150), max_iter=50000, random_state=1)
        clf.fit(X_train,train_category)
        return clf
    # ensemble of different classifiers. We used a soft voting measure to combine the output
    elif clf_type == "ensemble":
        clf1 = linear_model.LogisticRegression(C=8.25, max_iter=3000, tol=1e-8)
        clf3 = svm.SVC(kernel='rbf', gamma=0.8, C=1, decision_function_shape="ovr",probability=True)
        clf4 = linear_model.SGDClassifier(n_iter=2000,loss = 'modified_huber', penalty = 'elasticnet', n_jobs=-1)
        eclf = VotingClassifier(estimators=[('lr',clf1),('svm_rbf',clf3), ('sgd' , clf4)], voting="soft")
        eclf = eclf.fit(X_train,train_category)
        return eclf

##### This function takes a trained classifier and a set of features as input and returns the prediction of the classifier

In [35]:
def classify_predict(clf, X_test):
    predictions = clf.predict(X_test)
    return predictions

def check_val_score(predictions, true_vals):
    return metrics.accuracy_score(true_vals,predictions)

## Getting it all together

### Validation run

In [37]:
df_train, df_test = load_Dataset()
X_train, X_test = feature_extraction(df_train, df_test)

In [38]:
clf = classify_train("ensemble", X_train, df_train.Insult)

In [39]:
predictions = classify_predict(clf, X_test)
print(check_val_score(predictions, df_test.Insult))

0.842839036755


### Final run 

In [40]:
df_train, df_test = load_Dataset("test")
X_train, X_test = feature_extraction(df_train, df_test)

In [41]:
clf = classify_train("ensemble", X_train, df_train.Insult)

In [42]:
predictions = classify_predict(clf, X_test)
print(check_val_score(predictions, df_test.Insult))

0.608946608947
