In [1]:
# plot graphics in the notebook 
%matplotlib inline
# support operations for large, multi-dimensional arrays and matrices
import numpy as np
# make experiments reproducible
np.random.seed(12345)
# extension of main plotting library matplotlib
import seaborn as sns
# main library for plotting
import matplotlib.pyplot as plt
# set style
plt.style.use("fivethirtyeight")
# set default size of plots
plt.rcParams['figure.figsize'] = 20, 16
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score #accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from scipy.sparse import hstack

In [3]:
# Label columns of train.csv; one classifier is trained per entry further down.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both CSV files and replace missing values in every column with a
# single space.
train = pd.read_csv('../data/train.csv')
train = train.fillna(' ')
test = pd.read_csv('../data/test.csv')
test = test.fillna(' ')

In [73]:
# Show the five most frequent lower-cased, whitespace-separated tokens across
# all train and test comments. The token series is built here from
# `train`/`test` directly so this cell works on a fresh kernel run from top to
# bottom instead of relying on variables that are only assigned further down.
comment_tokens = pd.Series(
    pd.concat([train['comment_text'], test['comment_text']])
    .str.cat(sep=' ')
    .lower()
    .split()
)
comment_tokens.value_counts().head()

the    902050
to     532403
of     406605
a      399980
and    397026
dtype: int64

In [61]:
# Join every train and test comment into one lower-cased string and split it
# on whitespace, giving a Series with one token per row. The comment columns
# are concatenated here directly because `all_text` is only assigned in a
# later cell and does not exist yet when the notebook runs top to bottom.
one_string = pd.Series(
    pd.concat([train['comment_text'], test['comment_text']])
    .str.cat(sep=' ')
    .lower()
    .split()
)

In [4]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [27]:
# Comment text of each split, plus both stacked together (train rows first)
# for steps that need to see every comment.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [7]:
# Word-level TF-IDF features. The vocabulary and IDF weights are learned from
# the train and test comments together, so both feature matrices share the
# same columns.
# NOTE(review): because the IDF statistics include the test comments (text
# only, no labels), the cross-validation features have already seen test text.
word_vectorizer = TfidfVectorizer(analyzer='word', # token = word
                    ngram_range=(1,1), # only unigrams are used, (1,2) - unigrams/bigrams, ..., etc.
                    stop_words='english', # drop the built-in English stop words; None keeps every token
                    max_df=1.0, # don't filter words by their frequency
                    smooth_idf=True,
                    norm='l2' # euclidean norm is used by default
                    )
# Only the fitted state is needed here; fit_transform() would also build a
# TF-IDF matrix for all_text that is never used.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [8]:
# Start the submission frame with the test ids; one prediction column per
# label is added to it later.
submission = pd.DataFrame.from_dict(
    {'id': test['id']},
    orient='columns',
)

In [20]:
%%time
scores= []

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.7, class_weight='balanced', random_state=17)
    cv_score = np.mean(cross_val_score(classifier, train_word_features, train_target, scoring='roc_auc', cv=5))
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)
    classifier.fit(train_word_features, train_target)
    submission[class_name] = classifier.predict_proba(test_word_features)[:, 1]

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.969080509024357
CV score for class severe_toxic is 0.9846888720891025
CV score for class obscene is 0.9852715139314425
CV score for class threat is 0.9820006895272719
CV score for class insult is 0.9764901132274323
CV score for class identity_hate is 0.9748836616250214
Total score is 0.978735893237438
CPU times: user 1min 36s, sys: 1.79 s, total: 1min 38s
Wall time: 1min 38s


In [19]:
# Write the ids and per-label predictions to disk without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)