In [1]:
import os
import numpy as np
import pandas as pd
import scipy as sp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training and test CSV files (paths are relative to this notebook).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# The TF-IDF vocabulary should cover every word we will ever vectorize, so
# stack the comment text of the training and test frames into one Series.
all_comment_text = pd.concat(
    [frame['comment_text'] for frame in (train_data, test_data)]
)

In [4]:
# Word-level, unigram-only TF-IDF vectorizer limited to 50,000 features,
# fitted on the combined train + test comment text.
word_vec = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 1),
    analyzer='word',
).fit(all_comment_text)

# Vectorize each split with the fitted vectorizer.
x_train_w, x_test_w = (
    word_vec.transform(frame.comment_text)
    for frame in (train_data, test_data)
)

In [5]:
# fit a model for each column
# Target columns, in the order the per-label models are trained and reported.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def fit_label_model(features, labels):
    """Fit one binary logistic-regression classifier for a single label column.

    Parameters
    ----------
    features : feature matrix accepted by ``LogisticRegression.fit``
        (here the TF-IDF matrix of the training comments).
    labels : array-like
        Target values for one label column, one entry per row of ``features``.

    Returns
    -------
    LogisticRegression
        The fitted estimator (``fit`` returns the estimator itself).
    """
    # Hyperparameters live in exactly one place instead of six copies.
    classifier = LogisticRegression(C=1.2, class_weight='balanced')
    return classifier.fit(features, labels)


# One independent classifier per label, keyed by column name.
models = {
    label: fit_label_model(x_train_w, train_data[label].values)
    for label in LABEL_COLUMNS
}

# Keep the original per-label variable names so cells that reference them
# continue to work unchanged.
model_t, model_st, model_o, model_thr, model_i, model_ih = (
    models[label] for label in LABEL_COLUMNS
)

# Display the last fitted estimator, as the original cell did.
model_ih

LogisticRegression(C=1.2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [23]:
# test example: score a single hand-written comment with every per-label model
test_str = "can i help you with that."
str_vec = pd.Series(test_str)
trans = word_vec.transform(str_vec)

# For each model take row 0 (the only sample) and column 1 of predict_proba,
# i.e. the probability of the model's second class
# (presumably the positive label 1 -- confirm the label columns are 0/1).
lis = [
    clf.predict_proba(trans)[0][1]
    for clf in (model_t, model_st, model_o, model_thr, model_i, model_ih)
]
lis

[0.08283178054665061,
 0.0032088612014206834,
 0.02209843380769575,
 0.002153457348045817,
 0.04311302590212134,
 0.02558084294725086]