In [1]:
import numpy as np
np.random.seed(31)
import pandas as pd
from sklearn.cross_validation import train_test_split
from subprocess import check_output



In [2]:
#LOAD DATA

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
sample = pd.read_csv("../input/sample_submission.csv")
sample.head(2)

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy


In [3]:
#CREATE TARGET VARIABLE
train["happy"] = (train.Is_Response=="happy")*1
train["not_happy"] = (train.Is_Response=="not happy")*1
train.drop("Is_Response", 1, inplace=True)
target_vars = ["happy", "not_happy"]
train.head(2)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,happy,not_happy
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0,1
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0,1


In [4]:
#STEMMING WORDS
import nltk.stem as stm
import re
stemmer = stm.SnowballStemmer("english")
train["stem_text"] = train.Description.apply(lambda x: (" ").join([stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]"," ", x).split(" ")]))
test["stem_text"] = test.Description.apply(lambda x: (" ").join([stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]"," ", x).split(" ")]))
train.head(3)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,happy,not_happy,stem_text
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0,1,the room was kind of clean but had a veri stro...
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0,1,i stay at the crown plaza april april ...
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,0,1,i book this hotel through hotwir at the lowest...


In [5]:
#PROCESS TEXT: RAW
from keras.preprocessing.text import Tokenizer
tok_raw = Tokenizer()
tok_raw.fit_on_texts(train.Description.str.lower())
tok_stem = Tokenizer()
tok_stem.fit_on_texts(train.stem_text)
train["seq_text_stem"] = tok_stem.texts_to_sequences(train.stem_text)
test["seq_text_stem"] = tok_stem.texts_to_sequences(test.stem_text)
train.head(3)

Using TensorFlow backend.


Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,happy,not_happy,stem_text,seq_text_stem
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0,1,the room was kind of clean but had a veri stro...,"[1, 10, 5, 406, 9, 50, 19, 26, 3, 24, 1158, 40..."
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0,1,i stay at the crown plaza april april ...,"[6, 16, 15, 1, 1254, 811, 1349, 1349, 1, 38, 5..."
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,0,1,i book this hotel through hotwir at the lowest...,"[6, 117, 21, 11, 262, 1069, 15, 1, 2480, 104, ..."


In [6]:
#EXTRACT DATA FOR KERAS MODEL
from keras.preprocessing.sequence import pad_sequences
def get_keras_data(dataset, maxlen=20):
    X = {
        "stem_input": pad_sequences(dataset.seq_text_stem, maxlen=maxlen)
    }
    return X


maxlen = 60
dtrain, dvalid = train_test_split(train, random_state=123, train_size=0.85)
X_train = get_keras_data(dtrain, maxlen)
y_train = np.array(dtrain[target_vars])
X_valid = get_keras_data(dvalid, maxlen)
y_valid = np.array(dvalid[target_vars])
X_test = get_keras_data(test, maxlen)

n_stem_seq = np.max( [np.max(X_valid["stem_input"]), np.max(X_train["stem_input"])])+1

In [9]:
#KERAS MODEL DEFINITION
from keras.layers import Dense, Dropout, Embedding
from keras.layers import Flatten, Input, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam 

def get_model():
    embed_dim = 50
    dropout_rate = 0.9
    emb_dropout_rate = 0.9
   
    input_text = Input(shape=[maxlen], name="stem_input")
    
    emb_lstm = SpatialDropout1D(emb_dropout_rate) (Embedding(n_stem_seq, embed_dim
                                                ,input_length = maxlen
                                                               ) (input_text))
    dense = Dropout(dropout_rate) (Dense(512) (Flatten() (emb_lstm)))
    
    output = Dense(2, activation="softmax")(dense)

    model = Model([input_text], output)

    model.compile(loss="categorical_crossentropy", optimizer='nadam', metrics=["accuracy"])
    return model

model = get_model()
model.summary()
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
stem_input (InputLayer)      (None, 60)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 60, 50)            1587250   
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 60, 50)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 3000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               1536512   
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 1026      
Total para

In [10]:
#TRAIN KERAS MODEL
model = get_model()
model.fit(X_train, y_train, epochs=27
          , validation_data=[X_valid, y_valid]
         , batch_size=1024)

Train on 33092 samples, validate on 5840 samples
Epoch 1/27
Epoch 2/27
Epoch 3/27
Epoch 4/27
Epoch 5/27
Epoch 6/27
Epoch 7/27
Epoch 8/27
Epoch 9/27
Epoch 10/27
Epoch 11/27
Epoch 12/27
Epoch 13/27
Epoch 14/27
Epoch 15/27
Epoch 16/27
Epoch 17/27
Epoch 18/27
Epoch 19/27
Epoch 20/27
Epoch 21/27
Epoch 22/27
Epoch 23/27
Epoch 24/27
Epoch 25/27
Epoch 26/27
Epoch 27/27


<keras.callbacks.History at 0x7efe06dd57f0>

In [11]:
#MODEL EVALUATION
from sklearn.metrics import accuracy_score

preds_train = model.predict(X_train)
preds_valid = model.predict(X_valid)

print(accuracy_score(preds_train.argmax(axis=1), y_train[0:,1]))
print(accuracy_score(preds_valid.argmax(axis=1), y_valid[0:,1]))

# 0.924573915146
# 0.83647260274

0.93512027076
0.831678082192


In [14]:
#PREDICTION
preds = pd.DataFrame(model.predict(get_keras_data(train, maxlen)), columns=['cnn_target_1', 'cnn_target_2'])
submission = pd.concat([train["User_ID"],preds], 1)
submission.to_csv("./cnn_train.csv", index=False)
submission.head()

Unnamed: 0,User_ID,cnn_target_1,cnn_target_2
0,id10326,0.206745,0.793256
1,id10327,0.691456,0.308544
2,id10328,0.031447,0.968553
3,id10329,0.999429,0.000571
4,id10330,0.675902,0.324098


In [15]:
#PREDICTION
preds = pd.DataFrame(model.predict(X_test), columns=['cnn_target_1', 'cnn_target_2'])
submission = pd.concat([test["User_ID"],preds], 1)
submission.to_csv("./cnn_test.csv", index=False)
submission.head()

Unnamed: 0,User_ID,cnn_target_1,cnn_target_2
0,id80132,0.044359,0.955641
1,id80133,0.990811,0.00919
2,id80134,0.088781,0.911219
3,id80135,0.040637,0.959363
4,id80136,0.999337,0.000663
