In [None]:
import pandas as pd
from tqdm import tqdm
# Register tqdm's pandas integration; later cells call `.progress_apply(...)` on Series.
tqdm.pandas()
# Widen displayed text columns to 240 characters. The fully-qualified option name is
# used because pandas resolves abbreviated names by pattern matching, which can stop
# working if a future option with a similar name is added.
pd.set_option('display.max_colwidth', 240)


In [None]:
import os
# Show which files are present in the input directory before reading from it.
print(os.listdir("../input"))
# Training data; later cells read its "question_text" and "target" columns.
train=pd.read_csv("../input/train.csv")

In [None]:
def build_vocab(sentences, verbose =  True):
    """Count how many times each token occurs across ``sentences``.

    :param sentences: iterable of tokenised sentences, each itself an iterable of
        hashable tokens (the notebook passes lists of words).
    :param verbose: when true, iteration is wrapped in a ``tqdm`` progress bar.
    :return: plain ``dict`` mapping token -> occurrence count, with keys in
        first-seen order.
    """
    from collections import Counter
    from itertools import chain

    # A disabled tqdm bar only forwards the items, so skip the wrapper when quiet.
    stream = tqdm(sentences) if verbose else sentences
    # Flatten sentences into one lazy token stream and let Counter do the tallying;
    # convert back to a plain dict so callers get the same type as before.
    return dict(Counter(chain.from_iterable(stream)))

In [None]:
# Split every raw question on whitespace (with a progress bar) and keep the
# underlying array of token lists.
sentences = (
    train["question_text"]
    .progress_apply(lambda question: question.split())
    .values
)
# Token frequencies of the text before any cleaning.
vocab = build_vocab(sentences)

In [None]:
# Single-pass translation table used by clean_text. Because every input character is
# mapped exactly once, inserted replacement text is never re-processed.
_CLEAN_TEXT_TABLE = str.maketrans({
    # Punctuation that is deleted outright.
    **dict.fromkeys('?!.,\'#$&>()*+:;<=@[\\]^_`{|}~' + '’', None),
    # Separators that become a space so the joined words stay separate tokens.
    **dict.fromkeys('/-', ' '),
    # Straight and curly double quotes become a placeholder token.
    **dict.fromkeys('"“”', ' _quote_ '),
})


def clean_text(x):
    """Normalise punctuation in a piece of text.

    ``/`` and ``-`` become spaces, double quotes (straight or curly) become the
    token `` _quote_ ``, and the remaining listed punctuation is removed.

    Previously the quote placeholder was inserted first and the punctuation-removal
    pass then stripped its underscores, turning it into the ordinary word ``quote``.
    Translating in a single pass keeps the placeholder intact.

    :param x: value to clean; it is converted with ``str()`` first.
    :return: the cleaned string.
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

In [None]:
# Overwrite the question column with its punctuation-cleaned version, then
# re-tokenise and recount so the vocabulary reflects the cleaned text.
train["question_text"] = train["question_text"].progress_apply(clean_text)
sentences = train["question_text"].apply(lambda question: question.split())
vocab = build_vocab(sentences)

In [None]:
import re

def clean_numbers(x):
    """Mask runs of ASCII digits with ``#`` characters.

    Runs of five or more digits become ``#####``; runs of four, three and two digits
    become the same number of ``#``. Single digits are left untouched.

    :param x: input string.
    :return: string with digit runs masked.
    """
    # Order matters: longer runs must be masked before the shorter patterns run,
    # otherwise a short pattern would chop a long run into pieces.
    masking_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_pattern, mask in masking_rules:
        x = re.sub(digit_pattern, mask, x)
    return x

In [None]:
# Mask digit runs in the question column, then re-tokenise and recount.
train["question_text"] = train["question_text"].progress_apply(clean_numbers)
sentences = train["question_text"].progress_apply(lambda question: question.split())
vocab = build_vocab(sentences)

In [None]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Text -> replacement table consumed by replace_typical_misspell. The keys are joined
# into one regex by _get_mispell, so they are matched anywhere in the text, including
# inside longer words.
# The duplicated 'wouldnt' entry was removed: a repeated key in a dict literal is
# silently overwritten and only hides copy-paste mistakes.
mispell_dict = {
    'colour': 'color',
    'centre': 'center',
    'didnt': 'did not',
    'doesnt': 'does not',
    'wouldnt': 'would not',
    'isnt': 'is not',
    'shouldnt': 'should not',
    'favourite': 'favorite',
    'travelling': 'traveling',
    'counselling': 'counseling',
    'theatre': 'theater',
    'cancelled': 'canceled',
    'labour': 'labor',
    'organisation': 'organization',
    'wwii': 'world war 2',
    # NOTE(review): 'citicise' may be a typo for 'criticise' — confirm before changing.
    'citicise': 'criticize',
    'instagram': 'social_media',
    'whatsapp': 'social_media',
    'snapchat': 'social_media',
}
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Replace every occurrence of a known misspelling in ``text``.

    Each match of the module-level ``mispellings_re`` pattern is swapped for the
    value stored under the matched text in ``mispellings``.

    :param text: input string.
    :return: string with the replacements applied.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [None]:
# Apply the misspelling table to the question column and re-tokenise.
train["question_text"] = train["question_text"].progress_apply(replace_typical_misspell)
sentences = train["question_text"].progress_apply(lambda question: question.split())

# Drop a few very frequent function words from the token lists before counting.
to_remove = ['a','to','of','and']
sentences = [
    [word for word in sentence if word not in to_remove]
    for sentence in tqdm(sentences)
]
vocab = build_vocab(sentences)

In [None]:
from sklearn import model_selection

# Features are the cleaned question strings; labels come from the "target" column.
X = train['question_text']
y = train['target']

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and every
# number derived from it (scores, tuned threshold, submission), is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary presence features over word 1- to 3-grams; the vocabulary is learned from
# the training split only.
Countv = CountVectorizer(binary=True, ngram_range=(1, 3))
Countv.fit(X_train)

# Encode both splits with the fitted vocabulary.
Count_train_binary = Countv.transform(X_train)
Count_test_binary = Countv.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a logistic regression with default settings on the binary n-gram features and
# take the probability of the second class for the held-out rows.
CountLR = LogisticRegression()
CountLR.fit(Count_train_binary, y_train)
predCountLR = CountLR.predict_proba(Count_test_binary)[:, 1]

# ROC curve and its area on the held-out split.
fprCountLR, tprCountLR, thresholdCountLR = metrics.roc_curve(y_test, predCountLR)
roc_aucCountLR = metrics.auc(fprCountLR, tprCountLR)

# F1 score of the model's own hard predictions.
CountLRpredictions = CountLR.predict(Count_test_binary)
F1CountLR = metrics.f1_score(y_test, CountLRpredictions)

In [None]:
import numpy as np
from sklearn import metrics

# The recall array is named `rc`, not `re`: binding `re` here would replace the regex
# module that `clean_numbers` looks up at call time, breaking any later use of it.
pr, rc, th = metrics.precision_recall_curve(y_test, predCountLR)
# Drop the final precision/recall point, which has no corresponding threshold, so
# that `pr`, `rc` and `th` all have the same length.
pr, rc = pr[:-1], rc[:-1]
# F1 for every threshold. Where precision + recall is 0 the ratio is 0/0; define F1
# as 0 there instead of NaN, because np.argmax would otherwise pick the NaN entry.
denom = pr + rc
fs = np.divide(2 * pr * rc, denom, out=np.zeros_like(denom, dtype=float), where=denom > 0)
f = F1CountLR

In [None]:
# Threshold with the highest F1. nanargmax skips any NaN entries in `fs`; plain
# argmax returns the index of the first NaN, which would select a meaningless threshold.
opt_thr = th[np.nanargmax(fs)]
opt_thr

In [None]:
# `clean_numbers` resolves the module-level name `re` when it is called, so make sure
# that name is bound to the regex module before the cleaning functions are reused here.
import re

testdf = pd.read_csv('../input/test.csv',index_col='qid')
# Apply the same cleaning steps, in the same order, that were applied to the training
# text before the vectorizer was fitted; otherwise the test tokens do not line up
# with the learned vocabulary.
Xt = (
    testdf['question_text']
    .apply(clean_text)
    .apply(clean_numbers)
    .apply(replace_typical_misspell)
)
Test_binary = Countv.transform(Xt)
PredictionP = CountLR.predict_proba(Test_binary)[:,1]
# `>=` matches precision_recall_curve, where a score equal to the threshold counts as
# a positive prediction; `>` would classify that boundary case differently from the
# curve the threshold was tuned on.
Predictions = (PredictionP >= opt_thr).astype(int)

In [None]:
# Store the thresholded 0/1 predictions as a new column of the test frame.
testdf['prediction']=Predictions

In [None]:
# Remove the raw text column before writing the submission. Rebinding the result
# (instead of `inplace=True`) together with `errors='ignore'` keeps this cell safe to
# re-run: a second run no longer raises KeyError for the already-dropped column.
testdf = testdf.drop(columns=['question_text'], errors='ignore')

In [None]:
# Write the frame to submission.csv; the index (`qid`, set when the test file was
# read) is written as the first column.
testdf.to_csv('submission.csv')