In [2]:
#@title Importig library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action="ignore")

import time 

#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TreebankWordTokenizer
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# Tweet tokenizer does not split at apostophes which is what we want
from nltk.tokenize import TweetTokenizer   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union
from sklearn.linear_model import LogisticRegression


#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image
import matplotlib_venn as venn

nltk.download('stopwords')
eng_stopwords = set(stopwords.words("english"))

#settings
start_time=time.time()
color = sns.color_palette()
sns.set_style("dark")
warnings.filterwarnings("ignore")
lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()
from IPython.display import clear_output
%matplotlib inline

#@title Install kaggle
!pip install kaggle

#@title upload kaggle api from your profile
from google.colab import files
files.upload()

#@title Modification to use kaggle dataset
!mkdir /root/.kaggle
!cp /content/kaggle.json /root/.kaggle/
!chmod 600 /content/kaggle.json

#@title Downloading the dataset
%cd /content
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
clear_output()

In [3]:
#@title Processing the dataset for taining

%%time
# Reading the dataset
train = pd.read_csv('train.csv.zip')
test = pd.read_csv('test.csv.zip')

# Cleaning the dataset
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

train['comment_text'] = train['comment_text'].apply(lambda x: clean_text(x))
test['comment_text']=test['comment_text'].apply(lambda x: clean_text(x))

# Preprocessing the features
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
targets = train[cols].values

train_df = train['comment_text']
test_df = test['comment_text']

# Word embedding 
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=30000)
vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=-1)

all_text = pd.concat([train,test])

vectorizer.fit(all_text)

train_features = vectorizer.transform(train_df)
test_features = vectorizer.transform(test_df)

CPU times: user 28.5 s, sys: 1.03 s, total: 29.5 s
Wall time: 3min 34s


In [4]:
%%time
# Model training and prediction
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

for class_name in cols:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

CV score for class toxic is 0.8109710739009107
CV score for class severe_toxic is 0.907149498202974
CV score for class obscene is 0.8419683829677558
CV score for class threat is 0.8815992288557775
CV score for class insult is 0.8435638807392113
CV score for class identity_hate is 0.8307344891447396
CPU times: user 2min 16s, sys: 164 ms, total: 2min 16s
Wall time: 2min 16s


In [5]:
submission.head(2)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.183996,0.013963,0.13457,0.003096,0.095997,0.009027
1,0000247867823ef7,0.026833,0.001886,0.014006,0.000149,0.008276,0.004725


In [6]:
submission.to_csv('2_Tfidf_LogisticRegression.csv', index=False)

Score: 0.84644