In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# os.listdir returns entries in arbitrary order; sort them so the printed
# listing is the same on every run.
print(sorted(os.listdir("../input")))

# Any results you write to the current directory are saved as output.

In [None]:
# Show full cell contents (no truncation of long comment strings).
# `None` is the supported way to disable the width limit; passing -1 is
# deprecated and rejected by newer pandas releases. Very old releases only
# accept integers, so fall back to the legacy -1 sentinel there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Limit how many columns are rendered when a frame is printed.
pd.set_option('display.max_columns', 10)

# Import Data and EDA

In [None]:
# Read the training CSV, then print its (rows, columns) shape followed by
# its first rows as a quick sanity check.
train_data = pd.read_csv('../input/train.csv')
print(train_data.shape, train_data.head(), sep='\n')

In [None]:
def eda(x):
    """Print the total of column ``x`` in ``train_data`` and its share of all rows.

    The total is the column sum, i.e. the number of flagged comments if the
    column holds 0/1 indicators (assumed -- confirm against the data).
    The share is that total expressed as a percentage of ``len(train_data)``.
    """
    positives = train_data[x].sum()
    share = 100*positives/len(train_data)
    print('There are {} ({:.4f}%) {} comments'.format(positives, share, x))


# Report every label column in turn.
for label in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    eda(label)

In [None]:
# Read the test CSV, then print its (rows, columns) shape followed by its
# first rows as a quick sanity check.
test_data = pd.read_csv('../input/test.csv')
print(test_data.shape, test_data.head(), sep='\n')

# Preprocessing

In [None]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Patterns are compiled once because the helpers run on every comment.
_NON_LETTER = re.compile(r"[^A-Za-z']")
_WHITESPACE_RUN = re.compile(r"\s+")


def remove_punc(x):
    """Replace each character that is not an ASCII letter or apostrophe with a space.

    Despite the name this also blanks out digits and non-ASCII letters,
    not only punctuation. The length of the string is unchanged.
    """
    return _NON_LETTER.sub(' ', x)


def remove_extra_spaces(x):
    """Collapse every run of whitespace into a single space (no stripping)."""
    return _WHITESPACE_RUN.sub(' ', x)


def lower_case(x):
    """Return ``x`` converted to lower case."""
    return x.lower()

# NLTK's English stop-word list, kept as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(x):
    """Drop whitespace-separated words that appear in ``stop_words``.

    The comparison is case-sensitive, so the text presumably needs to be
    lower-cased first -- confirm against the casing of the stop-word list.
    """
    kept = [word for word in x.split() if word not in stop_words]
    return ' '.join(kept)


ps = PorterStemmer()


def ps_stem(x):
    """Apply the Porter stemmer to every whitespace-separated word of ``x``."""
    return ' '.join(map(ps.stem, x.split()))


wnl = WordNetLemmatizer()


def wnl_lemmatize(x):
    """Apply the WordNet lemmatizer to every whitespace-separated word of ``x``."""
    return ' '.join(map(wnl.lemmatize, x.split()))

def tag_pos(x):
    """Tokenize ``x`` with NLTK and annotate every token with its POS tag.

    Each token is rendered as ``token(TAG)`` followed by a single space, so
    a non-empty result always ends with a trailing space; an input with no
    tokens yields the empty string.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(x))
    return ''.join(
        token + '(' + tag + ')' + ' '
        for token, tag in tagged_tokens
    )

In [None]:
def _preprocess_comments(comments, label):
    """Run the text-cleaning pipeline over a Series of comments.

    Steps, in order: blank out non-letter characters, collapse whitespace,
    lower-case, then annotate every token with its POS tag. A progress line
    prefixed with ``label`` is printed after each step.

    Parameters
    ----------
    comments : pandas.Series
        Raw comment strings.
    label : str
        Name of the dataset being processed; used only in progress messages.

    Returns
    -------
    pandas.Series
        The transformed comments (the input Series is not modified).
    """
    steps = [
        (remove_punc, 'punctuation removed'),
        (remove_extra_spaces, 'extra spaces removed'),
        (lower_case, 'lower cased'),
        (tag_pos, 'tagged'),
    ]
    for step, message in steps:
        comments = comments.apply(step)
        print('{}: {}'.format(label, message))
    return comments


# NOTE: this overwrites 'comment_text' in place, so the cell is not
# idempotent -- reload the CSVs before running it a second time.
train_data['comment_text'] = _preprocess_comments(train_data['comment_text'], 'Train')
# The test-set progress messages are labelled 'Test' (they previously
# repeated 'Train', which made the log misleading).
test_data['comment_text'] = _preprocess_comments(test_data['comment_text'], 'Test')

# Combined corpus of train and test comments.
all_text = pd.concat([train_data['comment_text'], test_data['comment_text']])

# Vectorizing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights from the combined train + test
# corpus, then project each dataset separately with the fitted vectorizer.
# NOTE(review): with default tokenizer settings the 'word(TAG)' strings
# produced by tag_pos are presumably split into separate word and tag
# tokens -- confirm this is the intended feature representation.
vectorizer = TfidfVectorizer().fit(all_text)
print('Vectorizer: Fitted')

train_vector = vectorizer.transform(train_data['comment_text'])
print('Train: Vectorized')

test_vector = vectorizer.transform(test_data['comment_text'])
print('Test: Vectorized')

# Model Fitting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Fit one independent binary logistic-regression model per label, report
# its cross-validated score, and store its positive-class probabilities
# for the test set in the submission frame.
scores = []
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame.from_dict({'id': test_data['id']})
for class_name in class_names:
    train_target = train_data[class_name]
    classifier = LogisticRegression()

    # Score with ROC AUC instead of the estimator's default (accuracy).
    # The submission consists of probabilities, so a ranking metric reflects
    # what is actually submitted, whereas accuracy is uninformative whenever
    # the positive class is rare (see the label percentages in the EDA cell).
    cv_score = np.mean(
        cross_val_score(classifier, train_vector, train_target, cv=3, scoring='roc_auc')
    )
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set before predicting the test set.
    classifier.fit(train_vector, train_target)
    # Column 1 of predict_proba is the probability of the positive class.
    submission[class_name] = classifier.predict_proba(test_vector)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)