In [1]:
import os
import time

import numpy as np
import pandas as pd
import progressbar
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

In [49]:
# Read the preprocessed train/test splits and the separately shipped test labels.
train = pd.read_csv('./preprocessed/train.csv')
# Missing cells in the test split are replaced with empty strings right after loading.
test = pd.read_csv('./preprocessed/test.csv').fillna('')
test_labels = pd.read_csv('./dataset/test_labels.csv')

# One shape per line, in load order.
print(train.shape, test.shape, test_labels.shape, sep='\n')

(159571, 10)
(153164, 4)
(153164, 7)


In [7]:
def clf(C=2, solver='lbfgs', **kwargs):
    """Build an unfitted LogisticRegression with this notebook's defaults.

    ``C`` and ``solver`` are explicit parameters (rather than values hard-wired
    into the call) so a caller can override them without triggering a
    "multiple values for keyword argument" TypeError.

    Parameters
    ----------
    C : value passed as LogisticRegression's ``C`` (default 2).
    solver : value passed as LogisticRegression's ``solver`` (default 'lbfgs').
    **kwargs : any further LogisticRegression keyword arguments, e.g. ``max_iter``.

    Returns
    -------
    LogisticRegression
        A new, unfitted estimator.
    """
    return LogisticRegression(C=C, solver=solver, **kwargs)

In [14]:
def training(X, classifier, labels=None, **kwargs):
    """Fit one binary classifier per label column and report hold-out ROC AUC.

    The rows of ``X`` are split 70/30 with a fixed seed; one estimator per
    label is fitted on the 70% part and scored on the remaining 30% using the
    predicted probability of the positive class.

    Parameters
    ----------
    X : feature matrix whose rows line up with the rows of the label frame.
    classifier : callable returning a fresh, unfitted estimator that offers
        ``fit`` and ``predict_proba`` (e.g. the notebook's ``clf`` factory).
    labels : optional DataFrame containing the six label columns. When omitted
        the notebook-level ``train`` frame is used, as before.
    **kwargs : forwarded unchanged to ``classifier``.

    Returns
    -------
    (clfs, roc_auc_scores)
        The fitted estimators and their ROC AUC scores, both ordered like the
        label list below.

    Side effects
    ------------
    Prints one score line per label followed by the mean score.
    """
    test_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    label_frame = train if labels is None else labels

    # The split depends only on the row count and the fixed seed, so it is the
    # same for every label: do it once for all label columns instead of
    # re-splitting X inside the loop.
    X_train, X_test, y_train, y_test = train_test_split(
        X, label_frame[test_cols], test_size=0.3, random_state=101)

    roc_auc_scores = []
    clfs = []
    for eval_col in test_cols:
        # `model`, not `clf`: avoids shadowing the notebook-level clf() factory.
        model = classifier(**kwargs).fit(X_train, y_train[eval_col])
        predicted = model.predict_proba(X_test)[:, 1]

        roc_auc_scores.append(metrics.roc_auc_score(y_test[eval_col], predicted))
        clfs.append(model)

    for col, score in zip(test_cols, roc_auc_scores):
        print("Score for {:<10} is {:<10}".format(col, score))
    print("Score :", np.mean(roc_auc_scores))
    return clfs, roc_auc_scores

In [3]:
# A token is a run of ASCII letters and/or digits; every other character
# (punctuation, whitespace, non-ASCII) is not part of any token.
token_regex = r'[a-zA-Z0-9]+'
# Keep the bound `tokenize` method so it can be passed around as a plain callable
# (it is handed to TfidfVectorizer's `tokenizer` argument).
tokenizer = RegexpTokenizer(token_regex).tokenize

In [4]:
# TF-IDF over lower-cased, accent-stripped unigrams and bigrams produced by the
# custom tokenizer; the vocabulary is capped at 100000 features.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer,
    lowercase=True,
    strip_accents='unicode',
    ngram_range=(1, 2),
    max_features=100000,
)

In [5]:
# Fit the vectorizer on the training comments and vectorize them in the same call.
# NOTE(review): `comment_stem` is presumably the stemmed comment text written by
# the preprocessing step — confirm against the preprocessing notebook.
text = tfidf.fit_transform(train['comment_stem'])

In [6]:
# Size of the fitted vocabulary. `vocabulary_` maps each term to its column
# index, so its length is the number of TF-IDF features. It is available in
# every scikit-learn release, whereas get_feature_names() was removed in 1.2,
# and it avoids building the full list of feature names just to count it.
num_features = len(tfidf.vocabulary_)
print("Num Features : ", num_features)

Num Features :  100000


In [15]:
# Train one classifier per label on the TF-IDF matrix. `max_iter=600` travels
# through training()'s **kwargs into clf() and from there to LogisticRegression.
trained_clfs, scores = training(text, clf, max_iter=600)

Score for toxic      is 0.975133898127067
Score for severe_toxic is 0.9848534035920431
Score for obscene    is 0.9850845071108659
Score for threat     is 0.9873402246798102
Score for insult     is 0.9798890140128306
Score for identity_hate is 0.9769750074770984
Score : 0.9815460091666193


In [52]:
# Classifier for the first label in training()'s column list, i.e. 'toxic'.
cc = trained_clfs[0]

In [53]:
# Boolean mask over the test rows whose 'toxic' label is not -1.
# NOTE(review): -1 presumably marks rows that were never scored — confirm
# against the dataset description.
selection = test_labels['toxic'] != -1

In [54]:
# Vectorize only the selected test comments with the already-fitted TF-IDF
# (transform, not fit_transform, so the training vocabulary is reused).
test_input = tfidf.transform(test.loc[selection, 'comment_stem'])

In [58]:
# Hard class predictions (not probabilities) from the 'toxic' classifier on the
# selected test rows.
aa = cc.predict(test_input)

In [57]:
# Ground-truth 'toxic' labels for the selected test rows.
test_labels.loc[selection, 'toxic']

5         0
7         0
11        0
13        0
14        0
         ..
153150    0
153151    0
153154    0
153155    1
153156    0
Name: toxic, Length: 63978, dtype: int64

In [59]:
# Accuracy of the 'toxic' classifier on the selected test rows. scikit-learn's
# signature is accuracy_score(y_true, y_pred), so the ground truth goes first
# (accuracy is symmetric, so the reported value is unchanged).
metrics.accuracy_score(test_labels.loc[selection, 'toxic'], aa)

0.9377442245771984

In [63]:
# Accuracy of every per-label classifier on the selected test rows, plus the
# unweighted mean across labels.
test_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Filter the label frame once instead of once per loop iteration.
selected_labels = test_labels[selection]
res = 0
# trained_clfs is ordered like test_cols (training() appends in that order), so
# pair them directly. Predictions stay local to the call, so the global `aa`
# holding the 'toxic' predictions is no longer overwritten by this cell.
for label, model in zip(test_cols, trained_clfs):
    # y_true first, y_pred second, matching accuracy_score's signature.
    acc = metrics.accuracy_score(selected_labels[label], model.predict(test_input))
    res += acc
    print(label, acc)

print("Result : ", res/len(test_cols))
    

toxic 0.9377442245771984
severe_toxic 0.9931695270249148
obscene 0.9672231079433555
threat 0.9969677076495045
insult 0.9646440964081403
identity_hate 0.9905123636249961
Result :  0.9750435045380182


In [64]:
# Bundle to persist: the six fitted classifiers keyed by label name, together
# with the fitted TF-IDF vectorizer needed to turn raw text into their input.
save_dic = {
    "models": {
        label: trained_clfs[position]
        for position, label in enumerate(
            ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
        )
    },
    "transform": tfidf,
}

In [65]:
# Echo the bundle to eyeball what is about to be pickled.
save_dic

{'models': {'toxic': LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=600,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False),
  'severe_toxic': LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=600,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False),
  'obscene': LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=600,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, 

In [66]:
import pickle

In [67]:
# Persist the model bundle. The target directory is created first so that a
# fresh checkout, where ./pickles may not exist yet, does not fail inside open().
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
model_path = "./pickles/model.pickle"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as model_file:
    pickle.dump(save_dic, model_file)