In [14]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [15]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [16]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text


from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [17]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,max_features = 5000)

In [19]:
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [clean_text],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='roc_auc',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [20]:
submission_hyper_parm_logi = pd.read_csv("../input/sample_submission.csv")
target = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [21]:
for label in target:
    print('')
    print(".............. Started model fitting for "+ label)
    gs_lr_tfidf.fit(train_df.comment_text.values, train_df[label].values)
    print(".............. Completed model fitting for "+ label)
    print('')
    print("Best paramters for "+label)
    gs_lr_tfidf.best_params_
    print('')
    test_y_prob = gs_lr_tfidf.predict_proba(test_df.comment_text)[:,1]
    submission_hyper_parm_logi[label] = test_y_prob
    print(".............. Completed model Prediction for "+ label)


.............. Started model fitting for toxic
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 14.1min remaining:   29.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 14.2min finished


.............. Completed model fitting for toxic

Best paramters for toxic

.............. Completed model Prediction for toxic

.............. Started model fitting for severe_toxic
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 16.6min remaining:   34.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 16.6min finished


.............. Completed model fitting for severe_toxic

Best paramters for severe_toxic

.............. Completed model Prediction for severe_toxic

.............. Started model fitting for obscene
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 16.9min remaining:   35.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 17.0min finished


.............. Completed model fitting for obscene

Best paramters for obscene

.............. Completed model Prediction for obscene

.............. Started model fitting for threat
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 20.3min remaining:   41.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 20.4min finished


.............. Completed model fitting for threat

Best paramters for threat

.............. Completed model Prediction for threat

.............. Started model fitting for insult
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 15.1min remaining:   31.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 15.1min finished


.............. Completed model fitting for insult

Best paramters for insult

.............. Completed model Prediction for insult

.............. Started model fitting for identity_hate
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 15.8min remaining:   32.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 15.8min finished


.............. Completed model fitting for identity_hate

Best paramters for identity_hate

.............. Completed model Prediction for identity_hate


In [1]:
submission_hyper_parm_logi.to_csv("submission_hyper_parm_logi.csv",index=False)

NameError: name 'submission_hyper_parm_logi' is not defined