In [1]:
import pandas as pd
import pickle
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz
import gensim
from nltk.corpus import stopwords
from rouge import Rouge
import os
# NOTE(review): presumably a workaround that lets the process continue when more
# than one OpenMP runtime gets loaded by the numeric libraries - confirm it is
# still required in the current environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# English stop-word list from the NLTK corpus.
# NOTE(review): not referenced by any other cell visible in this notebook.
stop_words = stopwords.words('english')

In [3]:
# Shared ROUGE scorer; the get_rouge_* helpers read this module-level object.
rouge = Rouge()

In [4]:
# wmd_model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True)
# wmd_norm_model = norm_model = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True)
# wmd_norm_model.init_sims(replace=True)

In [26]:
# Load the train / validation / test splits from CSV files under ../data.
train_data = pd.read_csv("../data/train_quora_features.csv")
valid_data = pd.read_csv("../data/valid_quora_features.csv")
test_data = pd.read_csv("../data/test_quora_features.csv")
# Drop, in place, every row that contains at least one missing value.
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [5]:
def tokenize_lower_question(q):
    """Tokenize a question and keep only its alphabetic tokens, lower-cased.

    The text is split with ``word_tokenize``; tokens for which ``isalpha()``
    is false (digits, punctuation, ...) are discarded.

    Args:
        q: Question text.

    Returns:
        List of lower-cased alphabetic tokens, in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(q):
        if not token.isalpha():
            continue
        kept_tokens.append(token.lower())
    return kept_tokens

In [6]:
def get_bleu(q1, q2):
    """Smoothed sentence-level BLEU between two questions.

    Both questions are tokenized with ``tokenize_lower_question``. The tokens
    of ``q1`` form the single reference and the tokens of ``q2`` the
    hypothesis; ``SmoothingFunction().method4`` is used for smoothing.

    Args:
        q1: Reference question text.
        q2: Hypothesis question text.

    Returns:
        Whatever ``sentence_bleu`` returns for the pair (the BLEU score).
    """
    reference_tokens = tokenize_lower_question(q1)
    hypothesis_tokens = tokenize_lower_question(q2)
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

In [62]:
def get_rouge_1(q1, q2):
    """Return the ROUGE-1 'f' value for a question pair.

    Passes ``(q1, q2)`` to the shared ``rouge`` scorer and reads the
    ``"rouge-1"`` entry of the first result.
    """
    first_result = rouge.get_scores(q1, q2)[0]
    unigram_scores = first_result["rouge-1"]
    return unigram_scores['f']

def get_rouge_2(q1, q2):
    """Return the ROUGE-2 'f' value for a question pair.

    Passes ``(q1, q2)`` to the shared ``rouge`` scorer and reads the
    ``"rouge-2"`` entry of the first result.
    """
    first_result = rouge.get_scores(q1, q2)[0]
    bigram_scores = first_result["rouge-2"]
    return bigram_scores['f']

def get_rouge_l(q1, q2):
    """Return the ROUGE-L 'f' value for a question pair.

    Passes ``(q1, q2)`` to the shared ``rouge`` scorer and reads the
    ``"rouge-l"`` entry of the first result.
    """
    first_result = rouge.get_scores(q1, q2)[0]
    lcs_scores = first_result["rouge-l"]
    return lcs_scores['f']

In [8]:
def get_levenshteinDistance(s1, s2):
    """Token-level edit distance between two questions.

    Both inputs are tokenized with ``tokenize_lower_question``; the distance
    counts token insertions, deletions and substitutions (cost 1 each).
    Only two rows of the dynamic-programming table are kept at a time, and
    the shorter token list is used for the row width.

    Args:
        s1: First question text.
        s2: Second question text.

    Returns:
        The edit distance as an integer.
    """
    shorter = tokenize_lower_question(s1)
    longer = tokenize_lower_question(s2)
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter

    # Row 0: distance from the empty prefix of `longer` to each prefix of `shorter`.
    previous_row = list(range(len(shorter) + 1))
    for row_number, long_token in enumerate(longer, start=1):
        current_row = [row_number]
        for col, short_token in enumerate(shorter):
            if short_token == long_token:
                # Tokens match: carry the diagonal value over unchanged.
                cell = previous_row[col]
            else:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cell = 1 + min(previous_row[col], previous_row[col + 1], current_row[-1])
            current_row.append(cell)
        previous_row = current_row
    return previous_row[-1]

In [63]:
def get_num_of_capital(q1):
    """Count the characters of ``q1`` for which ``isupper()`` is true.

    Args:
        q1: Question text.

    Returns:
        Number of upper-case characters as an int.
    """
    return len([ch for ch in q1 if ch.isupper()])

def get_num_of_question_mark(q1):
    """Count the '?' characters in ``q1``.

    Args:
        q1: Question text.

    Returns:
        Number of question marks as an int.
    """
    question_marks = 0
    for ch in q1:
        if ch == '?':
            question_marks += 1
    return question_marks

In [64]:
def feature_engineer(df):
    """Add ROUGE and surface-form features for every question pair in ``df``.

    The frame is modified in place and also returned. Columns written
    (existing columns with the same names are overwritten):

    * ``rouge_1``, ``rouge_2``, ``rouge_l`` - the 'f' values reported by the
      shared ``rouge`` scorer for ``(question1, question2)``.
    * ``q1_capital``, ``q2_capital``, ``capital_diff`` - upper-case character
      counts of each question and their absolute difference.
    * ``q1_question_mark``, ``q2_question_mark``, ``question_mark_diff`` -
      '?' counts of each question and their absolute difference.

    The ``edit_distance`` (``get_levenshteinDistance``) and ``bleu``
    (``get_bleu``) features are disabled in this notebook and are not
    computed here.

    Args:
        df: DataFrame with text columns ``question1`` and ``question2``.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Score each pair once and read all three metrics from that one result;
    # calling the scorer separately per metric tripled the most expensive step.
    pair_scores = [
        rouge.get_scores(q1, q2)[0]
        for q1, q2 in zip(df["question1"], df["question2"])
    ]
    for column, metric in (("rouge_1", "rouge-1"),
                           ("rouge_2", "rouge-2"),
                           ("rouge_l", "rouge-l")):
        df[column] = [scores[metric]["f"] for scores in pair_scores]

    # Single-column features: map over the column rather than building a row
    # object for every record with apply(axis=1).
    df["q1_capital"] = df["question1"].map(get_num_of_capital)
    df["q2_capital"] = df["question2"].map(get_num_of_capital)
    df["capital_diff"] = (df["q1_capital"] - df["q2_capital"]).abs()

    df["q1_question_mark"] = df["question1"].map(get_num_of_question_mark)
    df["q2_question_mark"] = df["question2"].map(get_num_of_question_mark)
    df["question_mark_diff"] = (df["q1_question_mark"] - df["q2_question_mark"]).abs()

    return df

In [79]:
# Add the engineered feature columns to the training split.
train_data = feature_engineer(train_data)
# Disabled: write the enriched frame back to the CSV it was loaded from.
# train_data.to_csv('../data/train_quora_features.csv', index=False)

In [80]:
# Add the engineered feature columns to the validation split.
valid_data = feature_engineer(valid_data)
# Disabled: write the enriched frame back to the CSV it was loaded from.
# valid_data.to_csv('../data/valid_quora_features.csv', index=False)

In [67]:
# Training target: the duplicate label.
y_train = train_data["is_duplicate"]
# Training inputs: every remaining column after removing the saved index column,
# the id columns, the raw question text and the label itself.
X_train = train_data.drop(["Unnamed: 0", "id","qid1","qid2","question1","question2","is_duplicate"], axis=1)

In [68]:
# Validation target: the duplicate label.
y_valid = valid_data["is_duplicate"]
# Validation inputs: same column selection as for the training split.
X_valid = valid_data.drop(["Unnamed: 0", "id","qid1","qid2","question1","question2","is_duplicate"], axis=1)

In [87]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import log_loss
# from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [69]:
# Replace +inf feature values with the constant 7, in place (-inf is left untouched).
# NOTE(review): which feature produces the infinities and why 7 was chosen is
# not explained in this notebook - confirm before reusing.
X_train.replace(np.inf, 7, inplace=True)
X_valid.replace(np.inf, 7, inplace=True)

In [70]:
# param = {'num_leaves':31, 'num_trees':100, 'objective':'binary'}
# param['metric'] = ['binary_logloss']
# num_round = 10

In [71]:
# lgb_train = lgb.Dataset(X_train, label=y_train)
# lgb_valid = lgb.Dataset(X_valid, label=y_valid)

In [91]:
# Soft-voting ensemble of a random forest, gradient boosting and an
# elastic-net-penalised SGD classifier with logistic loss.
# NOTE(review): no random_state is set on any estimator, so the scores below
# are not reproducible run to run.
# NOTE(review): loss='log' is the older spelling; recent scikit-learn releases
# expect loss='log_loss' - confirm against the installed version.
rf = RandomForestClassifier(n_estimators=100)
gb = GradientBoostingClassifier()
sgd = SGDClassifier(penalty="elasticnet", loss='log')
clf = VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('sgd',sgd)], voting='soft')

In [92]:
# Fit the voting ensemble on the training features and labels.
clf.fit(X_train, y_train)



VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...cnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [93]:
# Predicted class probabilities for the validation split.
predict = clf.predict_proba(X_valid)

In [94]:
# Validation log loss of the ensemble's unmodified probabilities.
log_loss(y_valid, predict)

0.49190131835102485

In [48]:
def build_graph(data):
    """Build an undirected adjacency map of question ids from duplicate pairs.

    Rows whose ``is_duplicate`` value converts to integer 0 are skipped. For
    every other row, ``qid2`` is appended to the neighbour list of ``qid1``
    and ``qid1`` to the neighbour list of ``qid2``. A pair that occurs more
    than once produces repeated entries.

    Args:
        data: DataFrame with ``qid1``, ``qid2`` and ``is_duplicate`` columns.

    Returns:
        Dict mapping each question id to the list of ids it was paired with.
    """
    adjacency = {}
    for position in range(data.shape[0]):
        row = data.iloc[position]
        first_id, second_id, label = row.qid1, row.qid2, row.is_duplicate
        if int(label) != 0:
            # Record the edge in both directions.
            adjacency.setdefault(first_id, []).append(second_id)
            adjacency.setdefault(second_id, []).append(first_id)
    return adjacency

In [59]:
# Load a previously pickled graph object from ../data.
# NOTE(review): presumably the output of build_graph on the training split -
# confirm. pickle.load can execute arbitrary code, so only load files that
# were produced by a trusted run of this project.
with open("../data/train_graph.pickle",'rb') as f:
    train_graph = pickle.load(f)


In [50]:
def is_neighbor(graph, q1_id, q2_id):
    """Return 1 if ``q2_id`` can be reached from ``q1_id`` in ``graph``, else 0.

    Despite the name this is a connectivity test, not a direct-neighbour
    test: edges are followed transitively with a breadth-first search.

    Args:
        graph: Mapping from a node id to an iterable of neighbouring node
            ids (for example the dict returned by ``build_graph``).
        q1_id: Start node.
        q2_id: Target node.

    Returns:
        1 when the two ids are equal or connected by a path, otherwise 0.
    """
    # Local import keeps the function usable without touching the import cell.
    from collections import deque

    if q1_id == q2_id:
        return 1

    # Mark nodes when they are queued (not when they are dequeued) so each
    # node is expanded at most once, even in dense or cyclic components.
    seen = {q1_id}
    queue = deque([q1_id])
    while queue:
        cur = queue.popleft()
        if cur not in graph:
            # A node without an adjacency list is only a dead end; other
            # queued nodes may still lead to the target.
            continue
        for new in graph[cur]:
            if new == q2_id:
                return 1
            if new not in seen:
                seen.add(new)
                queue.append(new)
    return 0

In [76]:
def modify_predict(graph, test_data, predict):
    """Overwrite predictions for pairs that ``is_neighbor`` reports as connected.

    Row ``i`` of ``predict`` is matched positionally with row ``i`` of
    ``test_data``. When ``is_neighbor(graph, qid1, qid2)`` equals 1, column 0
    of that row is set to 0 and column 1 to 1. The input array is not
    modified; a copy is returned.

    Args:
        graph: Adjacency mapping passed through to ``is_neighbor``.
        test_data: DataFrame with ``qid1`` and ``qid2`` columns.
        predict: Array of per-row predictions with at least two columns.

    Returns:
        A new array with the overrides applied.
    """
    adjusted = np.copy(predict)
    for row_idx in range(test_data.shape[0]):
        pair = test_data.iloc[row_idx]
        first_id, second_id = pair.qid1, pair.qid2
        if is_neighbor(graph, first_id, second_id) != 1:
            continue
        adjusted[row_idx, 0] = 0
        adjusted[row_idx, 1] = 1
    return adjusted

In [95]:
# Set the prediction row to [0, 1] for validation pairs that are connected in
# the graph loaded from the training pickle.
modified_pred = modify_predict(train_graph, valid_data, predict)

In [96]:
# Validation log loss after the graph-based override.
log_loss(y_valid, modified_pred)

0.30264549627254494