In [1]:
import os, math
import numpy as np 
import pandas as pd 
import itertools


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, CuDNNLSTM, Dense
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
#nltk model 
from nltk.tokenize import RegexpTokenizer

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report


import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt


Using TensorFlow backend.


In [2]:
# Let wide DataFrames render up to 500 columns before pandas truncates the view.
pd.options.display.max_columns = 500

## 1. Explore dataframe features

In [3]:
# 1. Load the raw train/test splits and preview the first rows of each.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print(train_df.head())
print(test_df.head())

# 2. Are there overlaps between train and test? No (both lists print empty).
# Plain set intersection is used instead of the private
# pandas.core.common.intersection helper, which current pandas no longer ships.
print(list(set(train_df['question_text']) & set(test_df['question_text'])))
print(list(set(train_df['qid']) & set(test_df['qid'])))

# 3. Optional structural checks (disabled):
# print('train data', train_df.info())
# print('test data', test_df.info())
# Are there replicated rows? No
# print(train_df.nunique())



                    qid                                      question_text  \
0  00002165364db923c7e6  How did Quebec nationalists see their province...   
1  000032939017120e6e44  Do you have an adopted dog, how would you enco...   
2  0000412ca6e4628ce2cf  Why does velocity affect time? Does velocity a...   
3  000042bf85aa498cd78e  How did Otto von Guericke used the Magdeburg h...   
4  0000455dfa3e01eae3af  Can I convert montra helicon D to a mountain b...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
                    qid                                      question_text
0  00014894849d00ba98a9  My voice range is A2-C5. My chest voice goes u...
1  000156468431f09b3cae           How much does a tutor earn in Bangalore?
2  000227734433360e1aae  What are the best made pocket knives under $20...
3  0005e06fbe3045bd2a92  Why would they add a hypothetical scenario tha...
4  00068a0f7f41f50fc399   What is the dresscode for Techmahindra freshers?
[]
[]


## 2. Feature extraction from text

In [4]:
# 1. Preprocessing: strip URLs and @mentions, blank out unsupported characters, lowercase
def standardize_text(df, question_field):
    """Normalize the text column ``question_field`` of ``df`` in place.

    Steps, applied in order:
      1. remove URL-like tokens (``http`` followed by non-whitespace),
      2. remove any remaining literal ``http``,
      3. remove ``@`` followed by non-whitespace (mentions),
      4. replace every character outside ``A-Za-z0-9(),!?@'`"_`` and newline
         with a single space,
      5. replace a remaining ``@`` with ``at``,
      6. lowercase.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place; pass a copy
        if the original must be preserved.
    question_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``question_field`` cleaned.
    """
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    ]
    cleaned = df[question_field]
    for pattern, replacement in substitutions:
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would silently turn
        # every substitution above into a no-op.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[question_field] = cleaned.str.lower()
    return df
# 2. Clean deep copies so the in-place edits of standardize_text leave the raw frames untouched.
train_clean = train_df.copy(deep=True)
test_clean = test_df.copy(deep=True)
train_clean = standardize_text(train_clean, 'question_text')
test_clean = standardize_text(test_clean, 'question_text')
# 3. Are there overlaps between train and test question_text after preprocessing? Yes
# Plain set intersection is used instead of the private
# pandas.core.common.intersection helper, which current pandas no longer ships.
print(list(set(train_clean['question_text']) & set(test_clean['question_text'])))


['what does   mean?', 'what is the difference between   and  ?', 'what are the ways to avoid unwanted sexual attractions?', 'how we can find happiness?', 'who is present health minister of india?', 'what does a woman do when she loves her boyfriend but he doesn t want to have sex? that s right! he doesn t want to have sex ']


In [5]:
# Embedding setup: map each GloVe token to its float32 vector.
# Source https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
with open('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on a plain space only, as the original did; only the trailing
        # newline is trimmed before parsing the numbers.
        values = line.rstrip("\n").split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196017it [03:24, 10737.21it/s]

Found 2196016 word vectors.





In [6]:
# Hold out 10% of the cleaned training data for validation. The fixed seed
# makes the split (and therefore the reported validation metrics) reproducible.
# Note: this rebinds train_df from the raw frame to the cleaned training split.
train_df, val_df = train_test_split(train_clean, test_size=0.1, random_state=42)

In [7]:
# Convert a question into a fixed-size matrix of word embeddings
def text_to_array(text, max_len=30, emb_dim=300, embeddings=None):
    """Embed the first ``max_len`` whitespace-separated tokens of ``text``.

    Parameters
    ----------
    text : str
        The question to embed.
    max_len : int, default 30
        Number of rows in the result; longer texts are truncated and shorter
        ones are padded with zero rows at the end.
    emb_dim : int, default 300
        Width of each embedding vector.
    embeddings : mapping of str -> array, optional
        Token lookup table. Defaults to the module-level ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, emb_dim)`` and dtype float32. Tokens that
        are missing from the lookup table are left as zero rows.
    """
    if embeddings is None:
        embeddings = embeddings_index
    # Drop only a trailing question mark (and trailing whitespace). The
    # previous text[:-1] removed the last character unconditionally, which
    # truncated the final word whenever the text did not end in "?".
    tokens = text.rstrip().rstrip("?").split()[:max_len]
    # Pre-allocating gives one consistent dtype; rows past the last token stay
    # zero and act as padding.
    out = np.zeros((max_len, emb_dim), dtype="float32")
    for row, token in enumerate(tokens):
        vector = embeddings.get(token)
        if vector is not None:
            out[row] = vector
    return out

# Fixed validation set: only the first 3000 held-out questions are embedded
# up front; training data is embedded lazily, batch by batch.
val_subset = val_df.iloc[:3000]
val_vects = np.array([text_to_array(question) for question in tqdm(val_subset["question_text"])])
val_y = np.array(val_subset["target"])

100%|██████████| 3000/3000 [00:00<00:00, 13259.73it/s]


In [8]:
# Data providers
batch_size = 64

def batch_gen(train_df, batch_size=batch_size):
    """Yield shuffled ``(features, labels)`` training batches forever.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with a ``question_text`` column and a ``target`` column.
    batch_size : int
        Rows per batch. The default is bound when this function is defined,
        so a later reassignment of the global ``batch_size`` cannot change
        the size of batches produced by a generator that is already in use.

    Yields
    ------
    tuple of numpy.ndarray
        The stacked ``text_to_array`` outputs for the batch and the matching
        ``target`` values. The last batch of a pass may be smaller.
    """
    n_batches = math.ceil(len(train_df) / batch_size)
    while True:
        shuffled = train_df.sample(frac=1.)  # Reshuffle before every pass.
        for i in range(n_batches):
            batch = shuffled.iloc[i * batch_size:(i + 1) * batch_size]
            # Select the text by column name, consistent with the labels,
            # instead of by the positional index 1.
            text_arr = np.array([text_to_array(text) for text in batch["question_text"]])
            yield text_arr, np.array(batch["target"])

In [9]:
from keras.models import Sequential
from keras.layers import CuDNNLSTM, Dense, Bidirectional

In [10]:
# Two stacked bidirectional CuDNN LSTMs over (30, 300) inputs, followed by a
# single sigmoid unit for the binary target.
model = Sequential([
    Bidirectional(CuDNNLSTM(64, return_sequences=True),
                  input_shape=(30, 300)),
    Bidirectional(CuDNNLSTM(64)),
    Dense(1, activation="sigmoid"),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [11]:
# Train from the endless batch generator: 20 epochs of 1000 generator steps
# each, validating against the pre-embedded validation arrays.
mg = batch_gen(train_df)
fit_options = dict(
    epochs=20,
    steps_per_epoch=1000,
    validation_data=(val_vects, val_y),
    verbose=True,
)
model.fit_generator(mg, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1221ef5c50>

In [12]:
# Prediction part
# NOTE(review): this rebinds both batch_size and batch_gen, shadowing the
# training versions defined in an earlier cell.
batch_size = 256
def batch_gen(test_df, batch_size=batch_size):
    """Yield feature batches for ``test_df`` once, in row order.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with a ``question_text`` column.
    batch_size : int
        Rows per batch; the default is bound when this function is defined.

    Yields
    ------
    numpy.ndarray
        The stacked ``text_to_array`` outputs for each batch. The last batch
        may be smaller.
    """
    n_batches = math.ceil(len(test_df) / batch_size)
    for i in range(n_batches):
        # Select the text by column name instead of the positional index 1.
        texts = test_df["question_text"].iloc[i * batch_size:(i + 1) * batch_size]
        text_arr = np.array([text_to_array(text) for text in texts])
        yield text_arr

# The raw test frame is re-read as before; its qid column is used for the submission.
test_df = pd.read_csv("../input/test.csv")

# Predict on the standardized test text. The model was fitted on a split of
# train_clean, so feeding it the raw (uncleaned, mixed-case) test questions
# would apply different preprocessing at inference time than at training time.
# test_clean keeps the row order of the test file, so predictions still line
# up with test_df["qid"].
all_preds = []
for x in tqdm(batch_gen(test_clean)):
    all_preds.extend(model.predict(x).flatten())

221it [00:23,  9.39it/s]


In [13]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
# The builtin int replaces np.int, an alias that was removed from NumPy.
y_te = (np.array(all_preds) > 0.5).astype(int)

# Write one row per test question: its qid and the predicted label.
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": y_te})
submit_df.to_csv("submission.csv", index=False)

In [14]:
# Sanity check: show the first lines of the submission file written above.
!head submission.csv

qid,prediction
00014894849d00ba98a9,0
000156468431f09b3cae,0
000227734433360e1aae,0
0005e06fbe3045bd2a92,0
00068a0f7f41f50fc399,0
000a2d30e3ffd70c070d,0
000b67672ec9622ff761,0
000b7fb1146d712c1105,0
000d665a8ddc426a1907,0
