In [18]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import   TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

classes=['toxic','severe_toxic','obscene','threat','insult','identity_hate']
df=pd.read_csv('./Data/train.csv')

train, test = train_test_split(df, random_state=4, test_size=0.25, shuffle=True)
summary=""

In [19]:
X_train = train.comment_text
X_test = test.comment_text
summary+="Train Samples: {}\nTest Samples: {}\n\n".format(len(X_train),len(X_test))
test.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
92960,f88f17ce73d55ad9,"Mark Teixiera\nPer WP:CRYSTAL, we only change ...",0,0,0,0,0,0
39944,6a9f4af6d51fc9e2,"""\n\n Reblock \n\nReblock that IP you unblocke...",0,0,0,0,0,0
73371,c44f6d09834d1058,"""::::Censorship isn't an answer. Some things s...",0,0,0,0,0,0


In [20]:
clf_multinomial_bayes = Pipeline([
    ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
        ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
 ],verbose=True)
clf_decision_tree = Pipeline([
    ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
        ('clf', OneVsRestClassifier(DecisionTreeClassifier(max_depth=4))),
 ],verbose=True)

clf_svm = Pipeline([
    ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
        ('clf', OneVsRestClassifier(SVC(gamma='auto',kernel='sigmoid'))),
 ],verbose=True)

classifiers={'Multinomial Naive Bayes':clf_multinomial_bayes,'Support Vector Machine':clf_svm,'Decision Tree':clf_decision_tree}
classifiers

{'Multinomial Naive Bayes': Pipeline(memory=None,
          steps=[('vect',
                  TfidfVectorizer(analyzer='word', binary=False,
                                  decode_error='strict',
                                  dtype=<class 'numpy.float64'>,
                                  encoding='utf-8', input='content',
                                  lowercase=True, max_df=1.0, max_features=None,
                                  min_df=1, ngram_range=(1, 1), norm='l2',
                                  preprocessor=None, smooth_idf=True,
                                  stop_words=frozenset({'a', 'about', 'above',
                                                        'across', 'after',
                                                        'afterwards...
                                                        'amongst', 'amoungst',
                                                        'amount', 'an', 'and',
                                                        'an

In [21]:
def get_overall_accuracy(test_df):
    all_corrects=0
    for row in range(len(test_df)):
        current_row=test_df.iloc[row]
        wrong=False
        for label in classes:
            if current_row[label] != current_row['predicted_'+label]:
                wrong=True
                break
        if not wrong:
            all_corrects+=1
    return all_corrects/len(test)

In [None]:
for each_clf in classifiers:
    print("For classifier {}".format(each_clf))
    copy_test=test.copy()
    overall_acc=0.0
    for each_class in classes:
        clf=classifiers[each_clf]
        clf.fit(X_train,train[each_class])
        predicted = clf.predict(X_test)
        
        copy_test['predicted_'+each_class]=predicted
        
        ind_cor_rate=np.mean(predicted == test[each_class])
        overall_acc+=ind_cor_rate
        summary+="Rate of '{}' classifier for class '{}' is {}\n".format(each_clf,each_class,ind_cor_rate)
        print("Accuracy for class '{}' is: {}".format(each_class,ind_cor_rate))
    
    overall_acc/=len(classes)
    summary+='\nOverall class accuracy is {}\n'.format(overall_acc)
    summary+='Overall all class accuracy for classifier {} is {}\n\n'.format(each_clf,get_overall_accuracy(copy_test))

For classifier Multinomial Naive Bayes
[Pipeline] .............. (step 1 of 2) Processing vect, total=   5.6s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Accuracy for class 'toxic' is: 0.9215401198205199
[Pipeline] .............. (step 1 of 2) Processing vect, total=   5.9s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
Accuracy for class 'severe_toxic' is: 0.9902739829042689
[Pipeline] .............. (step 1 of 2) Processing vect, total=   5.6s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Accuracy for class 'obscene' is: 0.9517960544456421
[Pipeline] .............. (step 1 of 2) Processing vect, total=   6.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Accuracy for class 'threat' is: 0.9973930263454741
[Pipeline] .............. (step 1 of 2) Processing vect, total=   6.3s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Accuracy for class 'insult' is: 0.9530744

In [None]:
#save txt
summary_out = open("summary.txt", "w")
summary_out.write(summary)
summary_out.close()