# Mounting Google Drive
For this code to work, you need to store train, test, and sample_submission zip files from [here](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data) into your google drive.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/Jigsaw_NLP/
!ls

/content/drive/MyDrive/Jigsaw_NLP
sample_submission.csv.zip  test.csv.zip  train.csv.zip


In [None]:
!unzip sample_submission.csv.zip
!unzip test.csv.zip
!unzip train.csv.zip

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv               


In [None]:
import shutil

In [None]:
shutil.move("sample_submission.csv", "/content/")
shutil.move("train.csv","/content/")
shutil.move("test.csv","/content/")

'/content/test.csv'

# ktrain Installation

In [None]:
!pip install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/bb/41/d36714e51bf4e1d304f2ba80eb3c30c7eed69d72310d7f34fab86ed10b58/ktrain-0.26.4.tar.gz (25.3MB)
[K     |████████████████████████████████| 25.3MB 118kB/s 
[?25hCollecting scikit-learn==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/f4/cb/64623369f348e9bfb29ff898a57ac7c91ed4921f228e9726546614d63ccb/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 32.7MB/s 
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 34.0MB/s 
Collecting cchardet
[?25l  Downloading https://files.pythonhosted.org/packages/80/72/a4fba7559978de00cf44081c548c5d294bf00ac7dcda2db405d2baa8c67a/cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263kB)
[K     |██████████████████████████

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline
import multiprocessing

In [None]:
%cd /
%cd content/
!ls

/
/content
drive  sample_data  sample_submission.csv  test.csv  train.csv


In [None]:
files=['test.csv',
       'train.csv',
       'sample_submission.csv'
      ]

def load_data(file):
    return pd.read_csv(file)
with multiprocessing.Pool() as pool:
    test,train,sub=pool.map(load_data,files)


# Preprocessing

In [None]:
train['target']=np.where(train['target']>0.7,1.0,0.0)

In [None]:
toxic_train=train[train['target']>0.7].iloc[0:45451,:]
neutral_train=train[train['target']<=0.7].iloc[0:150000,:]

balanced_train=pd.concat([toxic_train,neutral_train],axis=0)

In [None]:
X=balanced_train['comment_text']
Y=balanced_train['target']

In [None]:
balx_train,balx_test,baly_train,baly_test = train_test_split(X,Y,test_size=0.15)

In [None]:
df1=pd.DataFrame(balx_train)

In [None]:
df2=pd.DataFrame(baly_train)
df2

Unnamed: 0,target
13530,0.0
1331976,1.0
111773,0.0
146813,0.0
139815,0.0
...,...
146215,0.0
148340,0.0
79030,0.0
19361,0.0


In [None]:
df1['target']=df2

In [None]:
data_train=df1

In [None]:
dt1=pd.DataFrame(balx_test)
dt2=pd.DataFrame(baly_test)

dt1['target']=dt2
dt1

Unnamed: 0,comment_text,target
1644843,OMG with the amount of stuff coming at the Pre...,1.0
1358918,Throw his boney ass in jail and throw away the...,1.0
1429859,These are hilarious in their stupidity.,1.0
74957,Well if it's true that many of the illegal imm...,0.0
1688845,Nobody wants your sad sick religion. Fund it ...,1.0
...,...,...
7757,These are just the homeless who are entitled t...,0.0
147509,"When one door closes, another one opens. Cell4...",0.0
142582,reading your comment filled me w/joy OS. I co...,0.0
103979,The earnings reserve is not protected. Once th...,0.0


In [None]:
data_test=dt1

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import ktrain
from ktrain import text
import tensorflow as tf

# Splitting for Cross-Validation

In [None]:
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=data_train,
                                                                   text_column = 'comment_text',
                                                                   label_columns = 'target',
                                                                   val_df = data_test,
                                                                   maxlen = 250,
                                                                   preprocess_mode = 'bert')



['not_target', 'target']
         not_target  target
13530           1.0     0.0
1331976         0.0     1.0
111773          1.0     0.0
146813          1.0     0.0
139815          1.0     0.0
['not_target', 'target']
         not_target  target
1644843         0.0     1.0
1358918         0.0     1.0
1429859         0.0     1.0
74957           1.0     0.0
1688845         0.0     1.0
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


# BERT Training

In [None]:
model = text.text_classifier(name = 'bert',
                             train_data = (X_train, y_train),
                             preproc = preproc)

Is Multi-Label? False
maxlen is 250
done.


In [None]:
learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                   val_data = (X_test, y_test),
                   batch_size = 16)

In [None]:
del dt1, dt2, df1, df2, train, toxic_train, neutral_train, balanced_train
test.drop('id',inplace=True,axis=1)
test

Unnamed: 0,comment_text
0,[ Integrity means that you pay your debts.]\n\...
1,This is malfeasance by the Administrator and t...
2,@Rmiller101 - Spoken like a true elitist. But ...
3,"Paul: Thank you for your kind words. I do, in..."
4,Sorry you missed high school. Eisenhower sent ...
...,...
97315,He should lose his job for promoting mis-infor...
97316,"""Thinning project is meant to lower fire dange..."
97317,I hope you millennials are happy that you put ...
97318,I'm thinking Kellyanne Conway (a.k.a. The Trum...


In [25]:
learner.fit_onecycle(lr = 0.01, epochs = 2)
# learner.fit(lr = 0.01, epochs = 5)



begin training using onecycle policy with max lr of 0.01...
Epoch 1/5
Epoch 2/5
  154/10384 [..............................] - ETA: 3:05:08 - loss: 0.9724 - accuracy: 0.7045

KeyboardInterrupt: ignored

In [27]:
predictor = ktrain.get_predictor(learner.model, preproc)

# Generating Prediction

In [28]:
# prediction
ans=[]

for r in range(len(test)):
  ans.append(max(predictor.predict_proba(test['comment_text'][r])))
  
  # print(temtest['comment_text'][r])
sub['prediction']=pd.DataFrame(ans)

In [29]:
sub.to_csv('submission.csv',index=False)