In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from gensim.utils import tokenize
from nltk.stem import WordNetLemmatizer



In [2]:
# Label columns of the training frame, in the order they are scored below.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both splits from the working directory; every missing cell is replaced
# by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Raw comment column of each split, plus the two stacked (train first, then
# test) so the vectorizers can be fitted on the combined vocabulary.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat(objs=[train_text, test_text])

In [4]:
# Dump every comment (train + test) to 'comments.csv', one CSV record each.
# header=False is spelled out because the default for Series.to_csv differs
# between pandas releases; with a header row, the column name 'comment_text'
# would be read back as if it were a comment when the file is parsed with
# csv.reader. The encoding is pinned to match the utf-8 reader.
all_text.to_csv('comments.csv', index=False, header=False, encoding='utf-8')

In [5]:
# A single shared lemmatizer: tokenizer() is invoked once per document, so
# constructing a new WordNetLemmatizer inside every call is repeated work.
_LEMMATIZER = WordNetLemmatizer()


def tokenizer(text):
    """Split ``text`` into lowercase tokens and lemmatize each of them.

    Used as the ``tokenizer`` callable of the word-level TfidfVectorizer.

    Args:
        text: one document, as a string.

    Returns:
        A list with one lemma per token, in token order.
    """
    return [_LEMMATIZER.lemmatize(token) for token in tokenize(text, lowercase=True)]

In [6]:
# Character-level TF-IDF: n-grams of 2 to 5 characters, at most 50000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    max_features=50000,
    norm='l2',
    smooth_idf=False,
    sublinear_tf=True,
)

# Word-level TF-IDF: unigrams produced by the custom tokenizer, at most 10000
# features.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenizer,
    ngram_range=(1, 1),
    max_features=10000,
    norm='l2',
    smooth_idf=False,
    sublinear_tf=True,
)

In [7]:
# Fit both vectorizers on the combined train + test text ...
word_vectorizer.fit(all_text)
char_vectorizer.fit(all_text)

# ... then project each split separately: word features first, character
# features second, train before test in both cases.
train_word_features, test_word_features = (
    word_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)
train_char_features, test_char_features = (
    char_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

In [8]:
# Put word and character features side by side (word columns first).
# format='csr' is requested explicitly: hstack otherwise returns a COO matrix,
# which cannot be row-sliced and would have to be converted again by every
# cross-validation / fit / predict call that consumes these matrices.
train_features = hstack([train_word_features, train_char_features], format='csr')
test_features = hstack([test_word_features, test_char_features], format='csr')

In [9]:
# Cross-validated ROC AUC for each label with one shared estimator
# configuration, followed by the mean over all labels.
classifier = LogisticRegression(solver='lbfgs', n_jobs=-1)
scores = []
for class_name in class_names:
    train_target = train[class_name]
    fold_scores = cross_val_score(
        classifier,
        train_features,
        train_target,
        scoring='roc_auc',
    )
    cv_score = np.mean(fold_scores)
    print('CV score for class {} is {}.'.format(class_name, cv_score))
    scores += [cv_score]
print('Total score is {}.'.format(np.mean(scores)))

CV score for class toxic is 0.979017072468237.
CV score for class severe_toxic is 0.9883692825299794.
CV score for class obscene is 0.9903241420512799.
CV score for class threat is 0.9894512837997489.
CV score for class insult is 0.9831859430552653.
CV score for class identity_hate is 0.9830726254596222.
Total score is 0.9855700582273554.


In [10]:
# Start the submission frame with the test ids only; one prediction column per
# label is added afterwards.
submission = pd.DataFrame({'id': test['id']})

In [11]:
# Refit the shared classifier on the full training matrix for each label and
# keep the second column of predict_proba as that label's prediction
# (presumably the probability of label value 1 — label values are not shown
# in this notebook).
for class_name in class_names:
    label_column = train[class_name]
    submission[class_name] = (
        classifier.fit(train_features, label_column).predict_proba(test_features)[:, 1]
    )

In [12]:
# Write the id column plus one prediction column per label, without the
# DataFrame index.
submission.to_csv('submission_best.csv', index=False)

Most common words

In [13]:
import re
import csv
import string
from collections import Counter
from nltk.corpus import stopwords

In [14]:
# Read the dumped comments back: one list entry per CSV record, the record's
# fields joined by a space and terminated with a newline.
# newline='' is required by the csv module so that line breaks embedded in
# quoted fields (multi-line comments) are parsed by csv.reader itself rather
# than being rewritten by the text layer first.
with open('comments.csv', 'r', encoding='utf-8', newline='') as ifile:
    comments = [" ".join(row) + '\n' for row in csv.reader(ifile)]

In [15]:
# Collapse all comments into one string. Each entry already ends with '\n',
# so consecutive comments end up separated by "\n ".
s_com = ' '.join(comments)

In [16]:
# Translation table that deletes every character of string.punctuation.
# Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def transformer(text):
    """Lowercase ``text``, strip punctuation and split it into words.

    Punctuation characters are removed, not replaced by a space, so a
    contraction such as "don't" becomes the single token "dont".

    Splitting uses ``str.split()`` on runs of whitespace. Unlike splitting
    with a ``\\s+`` regex, it never yields empty-string tokens for leading or
    trailing whitespace, and it returns an empty list for empty or
    whitespace-only input.

    Args:
        text: the string to tokenize.

    Returns:
        A list of lowercase, punctuation-free, non-empty tokens in their
        order of appearance.
    """
    return text.lower().translate(_PUNCTUATION_TABLE).split()

In [17]:
# Tokenize the whole corpus string in one pass.
words = transformer(s_com)

In [18]:
# Occurrence count of every token, stop words still included.
with_stopwords = Counter(words)

In [19]:
[token for token, _count in with_stopwords.most_common(15)]  # 15 most frequent tokens, most frequent first

['the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on']

In [20]:
# English stop-word list from nltk.corpus.stopwords.
stopwords_ = stopwords.words('english')

In [21]:
# Filter against a set, not the stop-word list itself: the membership test
# runs once per token of the whole corpus, and a list lookup is linear in the
# number of stop words. The resulting list is unchanged.
# NOTE(review): tokens have had punctuation stripped, so stop words that
# contain an apostrophe can never match (e.g. "dont" survives the filter) —
# confirm whether the stop words should be normalised the same way.
_stopword_lookup = frozenset(stopwords_)
words_nostop = [word for word in words if word not in _stopword_lookup]

In [22]:
# Occurrence count of every token that survived the stop-word filter.
no_stopwords = Counter(words_nostop)

In [23]:
[token for token, _count in no_stopwords.most_common(15)]  # 15 most frequent tokens once stop words are excluded

['article',
 'page',
 'wikipedia',
 'would',
 'like',
 'one',
 'please',
 'dont',
 'talk',
 'see',
 'think',
 'im',
 'also',
 'know',
 'fuck']