### Introduction
1. The original code is from https://github.com/aerdem4/kaggle-quora-dup

2. The code was for kaggle competition, [Quora](https://www.kaggle.com/c/quora-question-pairs/overview)

3. Here I haven't use **NLP feature** and **non NLP feature** from the code

### To DO List
1. Check Balance of True and False
2. Punctuation remove
3. simplize words
> For exmpale
>* was, is ,are -> be
>* ketty -> cat

4. Feature Engineering Generating




**Note** `#@@@@@@@@@@@@@` means the block is for testing

In [3]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers.core import Lambda
from keras.layers.merge import concatenate, add, multiply
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.noise import GaussianNoise
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

Using TensorFlow backend.


## 1. Basic Parameter Setting

In [27]:
simple_pro =True
np.random.seed(0)
WNL = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))

#Define the max lenght of the question
MAX_SEQUENCE_LENGTH = 500
MIN_WORD_OCCURRENCE = 100
REPLACE_WORD = "memento"
EMBEDDING_DIM = 300
NUM_FOLDS = 2
BATCH_SIZE = 1025
EMBEDDING_FILE = "glove.840B.300d.txt"

## 2. Supported Function

In [20]:
def cutter(word):
    if len(word) < 4:
        return word
    return WNL.lemmatize(WNL.lemmatize(word, "n"), "v")


def preprocess(string):
    #*****************************************************************************
    #how they know what catractore has to be replace? 
    string = string.lower().replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'") \
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not") \
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is") \
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are") \
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own") \
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ") \
        .replace("€", " euro ").replace("'ll", " will").replace("=", " equal ").replace("+", " plus ")
    string = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', string)
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)
    string = ' '.join([cutter(w) for w in string.split()])
    return string


def get_embedding():
    embeddings_index = {}
    f = open(EMBEDDING_FILE)
    for line in f:
        values = line.split()
        word = values[0]
        if len(values) == EMBEDDING_DIM + 1 and word in top_words:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    f.close()
    return embeddings_index


def is_numeric(s):
    return any(i.isdigit() for i in s)


def prepare(q):
    #return new_q, et(surplus_q), set(numbers_q)
    # new_q: the new question inlcude top word and non-top word replaced by 'memento'
    # set(surplus_q): a word bag that is not in STOP_WORDS and not in Top word
    # set(numbers_q): a word bag of numbers. 
    q=str(q)
    q=q.strip()

    new_q = []
    surplus_q = []
    numbers_q = []
    new_memento = True
    
    
    # spilt each wrod in a article
    #[::-1] means oppoite sort, last element becomes first one
    for w in q.split()[::-1]:
        if w in top_words:
            new_q = [w] + new_q
            new_memento = True
        elif w not in STOP_WORDS:
            if new_memento:
                new_q = ["memento"] + new_q
                new_memento = False
            if is_numeric(w):
                numbers_q = [w] + numbers_q
            else:
                surplus_q = [w] + surplus_q
        else:
            new_memento = True
        if len(new_q) == MAX_SEQUENCE_LENGTH:
            break
    new_q = " ".join(new_q)
    

    return new_q, set(surplus_q), set(numbers_q)


#punchation problem
def extract_features(df):
    articles = np.array([""] * len(df), dtype=object)
    titles = np.array([""] * len(df), dtype=object)
    features = np.zeros((len(df), 4))

    for i, (article, title) in enumerate(list(zip(df["article"], df["title"]))):
        articles[i], surplus1, numbers1 = prepare(article)
        titles[i], surplus2, numbers2 = prepare(title)
        
        features[i,0] = len(surplus1)
        features[i,1]=len(numbers1)
        features[i,2]=len(surplus2)
        features[i,3]=len(numbers2)
#         features[i, 0] = len(surplus1.intersection(surplus2))
#         features[i, 1] = len(surplus1.union(surplus2))
#         features[i, 2] = len(numbers1.intersection(numbers2))
#         features[i, 3] = len(numbers1.union(numbers2))

    return articles, titles, features

In [15]:
#@@@@@@@@@@@@@@@@@
train.title

0    After DeVos Announced Plans To Reexamine Title...
1    University To Award Trayvon Martin With Posthu...
2    Texas State University suspends Greek life aft...
3    Red Sox waste Rodriguez outing in 1-0 loss to ...
4                            Eve and the New Jerusalem
Name: title, dtype: object

## 3. Data prepare
---


### 3.1 Train, Test data 
* Reading data
* Replace the words and clean the NA cells
* Find the unique questions (Clean the duplicate data)
* Convert all_questions to vector 
* Find Top Words, meaning find the words appear more than 100 times in all data(train+test)


In [8]:
train = pd.read_csv("../data/train.csv")[0:200]
test = pd.read_csv("../data/test.csv")[0:200]
# if simple_pro ==True:
#     train=train[0:200].copy()
#     test=test[0:200].copy()



### 3.2 Find top_words

In [9]:
all_article=pd.Series(train['article'].tolist()+test['article'].tolist())
vectorizer = CountVectorizer(lowercase=False, token_pattern="\S+", min_df=MIN_WORD_OCCURRENCE,
                             stop_words=STOP_WORDS)
vectorizer.fit(all_article.tolist())

#`top_words` will be use in function `get_embedding`
top_words = set(vectorizer.vocabulary_.keys())
top_words.add(REPLACE_WORD)

`max_df` is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

> * max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
* max_df = 25 means "ignore terms that appear in more than 25 documents".
* The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

`min_df` is used for removing terms that appear too infrequently. For example:

>* min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
* min_df = 5 means "ignore terms that appear in less than 5 documents".
* The default min_df is 1, which means "ignore terms that appear in less than 1 document". * Thus, the default setting does not ignore any terms.


[reference](https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer)

In [None]:
#@@@@@@@@@@@@@
#1. 找轉換工具 for example:  is, are -> be
#2. 清除沒意義的字
#3. lower case ever words
# print( len(top_words), len(vectorizer.vocabulary_.keys()))

#show the contain of vectorizer
for dic in list(vectorizer.vocabulary_.items())[:100]:
    print(dic)


### 3.2 Pre-Train vector: glove.840B.300d
* Find the words from `top_words` in **glove.840B.300d** and extract words as vector
> Retriving the vector from pre-trained model
> 1. get the `top_words` 
> 2. make sure the `top_words` is in the pre-train model
> 3. load the words which are in top_word and pre-train model **ONLY**.  

In [10]:
#get the embedding word and coefficient from pre-train model.
embeddings_index = get_embedding()

# we can see some words that are not in the pre-train model,glove.840B.300d. 
print("Words are not found in the embedding:", top_words - embeddings_index.keys())

#reset top_words 
top_words = embeddings_index.keys()


Words are not found in the embedding: set()


## 4. Feature Engineering
---
### Build up features
The purpose of this section is to 
* Shape the question 1 and question 2 to the training format, so that we can feed them into the NN model. 
* Wrap the features together, **nlp_features**, **non_nlp featues**, **train_q_features**

**The training data is the follwing:**
> - article: 
- train_nlp_features: nlp features of traning data
- train_non_nlp_features: non-nlp features of tranining data
- train_q_features: 


### Function Explaination
* `S.pad_sequences([[1,2,3]], maxlen=10, padding='post')`
 
    return  `[[1, 2, 3, 0, 0, 0, 0, 0, 0, 0]]`


* `tokenizer.texts_to_sequences` like str.split()

    text= 'soem thing to eat'
    
    T = tokenizer.fit_on_texts(text)
    
    print( T.text_to_word_sequence(text1))  # ['some', 'thing', 'to', 'eat']

    print( tokenizer.word_index)   #{'some': 1, 'thing': 2,'to': 3 ','eat': 4, drink': 5}

# extract_features

* feature, 把suplus 轉成vector 去預測target
* mix title and article as a new feature 

In [66]:
# Train
articles_train, titles_train, features_train = extract_features(train)
#建立tokenizer
# extract word from all articles in Train
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(articles_train)
word_index = tokenizer.word_index

# creat a mextrix: (num of article, 500)
data_articles_train = pad_sequences(tokenizer.texts_to_sequences(articles_train), maxlen=MAX_SEQUENCE_LENGTH)
data_titles_train = pad_sequences(tokenizer.texts_to_sequences(titles_train), maxlen=MAX_SEQUENCE_LENGTH)
labels_train = np.array(train['hyperpartisan'])

# features_train = np.hstack((features_train))

In [67]:
# Test
articles_test, title_test, features_test =extract_features(test)
data_articles_test = pad_sequences(tokenizer.texts_to_sequences(articles_test), maxlen=MAX_SEQUENCE_LENGTH)
data_titles_test = pad_sequences(tokenizer.texts_to_sequences(title_test), maxlen=MAX_SEQUENCE_LENGTH)
labels_test = np.array(test['hyperpartisan'])
# features_test = np.hstack((features_test))

In [69]:
# Convert the vector of words in pre-train model 
# that words has used in training set to embedding_matrix 
nb_words = len(word_index) + 1  # 3867
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## 5. Model

<img src="image/model.png">

In [71]:

def LSTM_model(): 
    embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
    lstm_layer = LSTM(75, recurrent_dropout=0.2)

    # artile and title input formating
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = lstm_layer(embedded_sequences_2)

    # feature input formating
    features_input = Input(shape=(f_train.shape[1],), dtype="float32")
    features_dense = BatchNormalization()(features_input)
    features_dense = Dense(200, activation="relu")(features_dense)
    features_dense = Dropout(0.2)(features_dense)

    #blend article and title input by Square Differece
    addition = add([x1, y1])
    minus_y1 = Lambda(lambda x: -x)(y1)
    merged = add([x1, minus_y1])
    merged = multiply([merged, merged])
    merged = concatenate([merged, addition])
    merged = Dropout(0.4)(merged)

    
    merged = concatenate([merged, features_dense])
    merged = BatchNormalization()(merged)
    merged = GaussianNoise(0.1)(merged)

    merged = Dense(150, activation="relu")(merged)
    merged = Dropout(0.2)(merged)
    merged = BatchNormalization()(merged)

    out = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, features_input], outputs=out)
    model.compile(loss="binary_crossentropy",
                  optimizer="nadam")
    model.summary

    return (model)
    

In [73]:
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True)
model_count = 0

for idx_train, idx_val in skf.split(train, train['hyperpartisan']):
    print("MODEL:", model_count)
    data_articles_intrain = data_articles_train[idx_train]
    data_title_intrain = data_titles_train[idx_train]
    labels_intrain = labels_train[idx_train]
    f_intrain = features_train[idx_train]

    data_articles_val = data_articles_train[idx_val]
    data_title_val = data_titles_train[idx_val]
    labels_val = labels_train[idx_val]
    f_val = features_train[idx_val]
    
    model = LSTM_model()
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    best_model_path = "best_model" + str(model_count) + ".h5"
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)
    
    
    hist = model.fit([data_articles_intrain, data_title_intrain, f_intrain], labels_intrain,
                     validation_data=([data_articles_val, data_title_val, f_val], labels_val),
                     epochs=1, batch_size=BATCH_SIZE, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint], verbose=1)

    model.load_weights(best_model_path)
    print(model_count, "validation loss:", min(hist.history["val_loss"]))

    preds = model.predict([data_articles_test, data_titles_test, features_test], batch_size=BATCH_SIZE, verbose=1)

    model_count += 1


MODEL: 0
Train on 99 samples, validate on 101 samples
Epoch 1/1
0 validation loss: 1.0251847505569458
MODEL: 1
Train on 101 samples, validate on 99 samples
Epoch 1/1
1 validation loss: 0.8869453072547913
