# Notebook for Kaggle Toxic Comments Prediction

This is a notebook for the kaggle toxic comments prediction competition:
    [lin kere](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

## Pre-processing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
train_df = pd.read_csv("train.csv", delimiter = ",")
test_df = pd.read_csv("test.csv", delimiter = ",")

In [None]:
train_df.head()

In [None]:
train_df.iloc[1, 1]

In [None]:
train_text.describe()

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
test_df.head()

## Visualizing Data

In [None]:
toxic = train_df['toxic']

In [None]:
plt.hist(toxic)

# Get the Text

In [None]:
train_text = train_df['comment_text']
test_text = test_df['comment_text']

In [None]:
train_text.isnull().sum()

In [None]:
test_text.isnull().sum()

# Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1,3), \
                            stop_words="english", dtype=np.float32)

In [None]:
train_vec = vectorizer.fit_transform(train_text)

In [None]:
test_vec = vectorizer.transform(test_text)

In [None]:
train_vec.shape

In [None]:
test_vec.shape

## Prediction for Toxic or Not

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
toxic_Y_train = train_df['toxic']

In [None]:
logreg = LogisticRegression()
logreg.fit(train_vec, toxic_Y_train)

In [None]:
acc_log = round(logreg.score(train_vec, toxic_Y_train) * 100, 2)
acc_log

In [None]:
toxic_pred = logreg.predict_proba(test_vec)

# Get All the Target Classes

In [None]:
train_df.columns

In [None]:
targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

In [None]:
# Train and test severe_toxic
severe_toxic_Y_train = train_df['severe_toxic']
logreg = LogisticRegression()
logreg.fit(train_vec, severe_toxic_Y_train)
acc_log = round(logreg.score(train_vec, severe_toxic_Y_train) * 100, 2)
acc_log

In [None]:
severe_toxic_pred = logreg.predict_proba(test_vec)

In [None]:
#targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
#       'insult', 'identity_hate']
# Train and test obscene
obscene_Y_train = train_df['obscene']
logreg = LogisticRegression()
logreg.fit(train_vec, obscene_Y_train)
obscene_pred = logreg.predict_proba(test_vec)
acc_log = round(logreg.score(train_vec, obscene_Y_train) * 100, 2)
acc_log

In [None]:
#targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
#       'insult', 'identity_hate']
# Train and test obscene
threat_Y_train = train_df['threat']
logreg = LogisticRegression()
logreg.fit(train_vec, threat_Y_train)
threat_pred = logreg.predict_proba(test_vec)
acc_log = round(logreg.score(train_vec, threat_Y_train) * 100, 2)
acc_log

In [None]:
#targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
#       'insult', 'identity_hate']
# Train and test obscene
insult_Y_train = train_df['insult']
logreg = LogisticRegression()
logreg.fit(train_vec, insult_Y_train)
insult_pred = logreg.predict_proba(test_vec)
acc_log = round(logreg.score(train_vec, insult_Y_train) * 100, 2)
acc_log

In [None]:
#targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
#       'insult', 'identity_hate']
# Train and test obscene
identity_hate_Y_train = train_df['identity_hate']
logreg = LogisticRegression()
logreg.fit(train_vec, identity_hate_Y_train)
identity_hate_pred = logreg.predict_proba(test_vec)
acc_log = round(logreg.score(train_vec, identity_hate_Y_train) * 100, 2)
acc_log

# Make Submission

In [None]:
submission_array = np.zeros((test_text.shape[0], len(targets)), dtype=np.float32)

In [None]:
submission_array.shape

In [None]:
toxic_pred[0:5]

In [None]:
submission_array[:,0] = toxic_pred[:,1]

In [None]:
submission_array[:,1] = severe_toxic_pred[:,1]
submission_array[:,2] = obscene_pred[:,1]
submission_array[:,3] = threat_pred[:,1]
submission_array[:,4] = insult_pred[:,1]
submission_array[:,5] = identity_hate_pred[:,1]

In [None]:
submission_df = pd.DataFrame(submission_array, columns=targets)

In [None]:
submission_df.head()

In [None]:
submission_df["id"] = test_df["id"]

In [None]:
submission_df.head()

In [None]:
new_cols = submission_df.columns.tolist()
new_cols

In [None]:
cols = new_cols[-1:] + new_cols[:-1] 
cols

In [None]:
submission_df = submission_df[cols]

In [None]:
submission_df.head()

In [None]:
output_file = "logisticregression.submission"

In [None]:
submission_df.to_csv(output_file, index=False)

# Make Predictions on Cleaned Data

In [4]:
train_df.shape

(159571, 8)

In [5]:
test_df.shape

(153164, 2)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 6007, analyzer="word", ngram_range=(1,3), \
                            stop_words="english", dtype=np.float32)

In [9]:
train_text = []
f_in = open("train_sentences.clean", "r")
while True:
    sent = f_in.readline()
    if sent == "":
        break
    train_text.append(sent)
len(train_text)

159571

In [10]:
test_text = []
f_in = open("test_sentences.clean", "r")
while True:
    sent = f_in.readline()
    if sent == "":
        break
    test_text.append(sent)
len(test_text)

153164

In [11]:
train_vec = vectorizer.fit_transform(train_text)
test_vec = vectorizer.transform(test_text)

In [12]:
train_vec.shape

(159571, 6007)

In [13]:
test_vec.shape

(153164, 6007)

In [14]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100)

In [15]:
Y_train = train_df['toxic']
random_forest.fit(train_vec, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
acc_log = round(random_forest.score(train_vec, Y_train) * 100, 2)
acc_log

99.74

In [19]:
Y_pred = random_forest.predict_proba(test_vec)

In [20]:
Y_pred[0:5]

array([[0.08, 0.92],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.99, 0.01],
       [1.  , 0.  ]])

In [21]:
Y_train[0:5]

0    0
1    0
2    0
3    0
4    0
Name: toxic, dtype: int64

In [22]:
train_text[0]

'explanation why the edits made under my username hardcore metallica fan were reverted   they were nt vandalisms   just closure on some gas after i voted at new york dolls fac   and please do nt remove the template from the talk page since i am retired now NUM\n'

In [23]:
targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']
preds = np.zeros((test_vec.shape[0], len(targets)), dtype = np.float32)

for idx, target in enumerate(targets):
    Y_train = train_df[target]
    random_forest.fit(train_vec, Y_train)
    Y_pred = random_forest.predict_proba(test_vec)
    preds[:, idx] = Y_pred[:, 1]   

In [24]:
preds[0:5]

array([[0.92 , 0.235, 0.874, 0.06 , 0.77 , 0.17 ],
       [0.02 , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.01 , 0.   , 0.   , 0.01 , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.01 , 0.   ]], dtype=float32)

In [25]:
rf_submissions = pd.DataFrame(preds, columns = targets)

In [26]:
rf_submissions.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.92,0.235,0.874,0.06,0.77,0.17
1,0.02,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.01,0.0,0.0,0.01,0.0,0.0
4,0.0,0.0,0.0,0.0,0.01,0.0


In [27]:
rf_submissions["id"] = test_df["id"]

In [28]:
rf_submissions.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,id
0,0.92,0.235,0.874,0.06,0.77,0.17,00001cee341fdb12
1,0.02,0.0,0.0,0.0,0.0,0.0,0000247867823ef7
2,0.0,0.0,0.0,0.0,0.0,0.0,00013b17ad220c46
3,0.01,0.0,0.0,0.01,0.0,0.0,00017563c3f7919a
4,0.0,0.0,0.0,0.0,0.01,0.0,00017695ad8997eb


In [29]:
reordered_cols = ["id"] + targets

In [31]:
rf_submissions = rf_submissions[reordered_cols]

In [32]:
rf_submissions.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.92,0.235,0.874,0.06,0.77,0.17
1,0000247867823ef7,0.02,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,0.01,0.0,0.0,0.01,0.0,0.0
4,00017695ad8997eb,0.0,0.0,0.0,0.0,0.01,0.0


In [33]:
output_file = "random_forests.submission"
rf_submissions.to_csv(output_file, index=False)

# Logistic Regression on Cleaned data

In [34]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']
logreg_preds = np.zeros((test_vec.shape[0], len(targets)), dtype = np.float32)

for idx, target in enumerate(targets):
    Y_train = train_df[target]
    logreg.fit(train_vec, Y_train)
    Y_pred = logreg.predict_proba(test_vec)
    logreg_preds[:, idx] = Y_pred[:, 1]   

In [35]:
logreg_submissions_2 = pd.DataFrame(logreg_preds, columns = targets)
logreg_submissions_2["id"] = test_df["id"]

In [36]:
output_file = "logreg_2.submission"
logreg_submissions_2.to_csv(output_file, index=False)