In [46]:
import spacy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [47]:
def bow2vec(txt,nlp):
    """
    Converts txt to a list of word vectors,"bag of words" encoding,
    and then averages the vectors to produce a single vector encoding
    """
    txt = txt.strip()
    words = nlp(txt)
    
    #already proved words.vector is average of words
    #vecs = [word.vector for word in words ]

    return words.vector


In [52]:
df = pd.read_csv("quora_duplicate_questions.tsv", delimiter="\t",nrows=1000)
#list(df)
#['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
df = df[['question1','question2','is_duplicate']]
X =df[['question1','question2']].values
Y = df[['is_duplicate']].values
print(type(X),type(Y),X.shape,Y.shape)
X_train,X_dev,Y_train,Y_dev = train_test_split(X,Y,test_size=.20,random_state=4)
print(X_train.shape, X_dev.shape,Y_train.shape,Y_dev.shape)
nlp = spacy.load('en_core_web_lg',disable=['parser', 'tagger', 'ner'])
q1_vec = [bow2vec(x,nlp) for x in df['question1'].values]
q2_vec = [bow2vec(x,nlp) for x in df['question2'].values]
X=np.hstack((q1_vec,q2_vec))
print(X.shape)
X_train,X_dev,Y_train,Y_dev = train_test_split(X,Y,test_size=.20,random_state=4)
print(X_train.shape, X_dev.shape,Y_train.shape,Y_dev.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (1000, 2) (1000, 1)
(800, 2) (200, 2) (800, 1) (200, 1)
(1000, 600)
(800, 600) (200, 600) (800, 1) (200, 1)


In [68]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import *
df = load_df("quora_duplicate_questions.tsv")  
df = df[0:1000]
    
# build data features
X = make_features2(df,method='spacy')
y = df["is_duplicate"]
    
# split data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state = 4)
print("Train shape:",X_train.shape)
print("Test shape:",X_test.shape)     

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, 
                                                        random_state = 4)
print("Train shape:",X_train.shape)
print("Test shape:",X_test.shape)     
    
# train model
print("Training model...")
model = LogisticRegression()
model.fit(X_train, y_train)   

# make predictions on test data
y_pred = model.predict(X_test)

# print results
print("\nTrain Accuracy : %0.2f%%"%(100*accuracy_score(y_train, model.predict(X_train))))
print("Test Accuracy  : %0.2f%%"%(100*accuracy_score(y_test,y_pred)))
print("Test logloss   : %0.2f"%log_loss(y_test,model.predict_proba(X_test)[:,1]))
print()
print(classification_report(y_test, y_pred, target_names=['not duplicate','is duplicate']))


Train shape: (800, 902)
Test shape: (200, 902)
Training model...

Train Accuracy : 87.75%
Test Accuracy  : 68.00%
Test logloss   : 0.62

               precision    recall  f1-score   support

not duplicate       0.71      0.80      0.75       121
 is duplicate       0.62      0.49      0.55        79

    micro avg       0.68      0.68      0.68       200
    macro avg       0.66      0.65      0.65       200
 weighted avg       0.67      0.68      0.67       200





In [130]:
from scipy.spatial.distance import cosine
def load_df(filename):
    """
    Load quora duplicate question data from tsv file
    """
    df = pd.read_csv(filename, sep='\t',  encoding = 'utf8',keep_default_na=False)
    print("length of orignal data set: ", len(df))
    
    df = df.drop(['id', 'qid1', 'qid2'], axis=1)

    # drop rows wo questions
    df = df[df['question1']!='']
    df = df[df['question2']!='']

    print("length of processed data set: ",len(df))
    return df

def make_features(df,nlp=nlp,method='spacy'):
    """
    Build similarity features from a pandas dataframe of question pairs.
    """
    fasttext = None
    
    if method=='spacy':
        print("Loading spacy 'en_core_web_lg'...")
        nlp = spacy.load('en_core_web_lg',disable=['parser', 'tagger', 'ner'])
        
    elif method=='fasttext':
        import fastText as ft
        print("Loading fasttext embeddings...")        
        fasttext = ft.load_model('/data/demo_quora_data/crawl-300d-2M-subword.bin')   
         
    print("Vectorizing question1...")
    q1_vec = [vec(q,nlp=nlp,fasttext=fasttext,method=method) for q in  df['question1'].values]
    print("...Finished vectorizing question 1")

    print("Vectorizing question2...")
    q2_vec = [vec(q,nlp=nlp,fasttext=fasttext,method=method) for q in  df['question2'].values]
    print("...Finished vectorizing question2")
    
    # BoW difference vector
    #bow_diff=np.array([abs(q2 - q1) for (q1,q2) in zip(q1_vec,q2_vec)])
    
    # BoW cosine feature
    #print("Building BoW cosine similarity...")
    cos_sim = np.array([cosine_similarity(q1,q2) for (q1,q2) in zip(q1_vec,q2_vec)])

    # BoW distance feature
    #print("Building BoW euclidean similarity...")    
    #euclidean_sim = np.array([np.linalg.norm(q2 - q1) for (q1,q2) in zip(q1_vec,q2_vec)])

    # BoW sum vector
    #bow_sum=np.array([ q1 + q2 for (q1,q2) in zip(q1_vec,q2_vec)])
    #df['cos_sim'] = cos_sim
    #df['euclidean_sim'] = euclidean_sim
    
    #X = np.hstack((q1_vec, 
    #               q2_vec, 
    #               bow_diff, 
    #               cos_sim.reshape(-1,1),
    #               euclidean_sim.reshape(-1,1)))
    X = np.hstack((q1_vec, q2_vec))
    
    return X

def cosine_similarity(v1,v2):
    return 1. - cosine(v1,v2)

def vec(txt,nlp=nlp,fasttext=None,method='spacy'):
    
    txt = txt.strip()
    txt = txt.lower()
    
    words = nlp(txt)
    words = [w for w in words]
    
    if len(words)==0:
        words=nlp(u"empty")
    
    if method == 'spacy':
        vecs = [word.vector  for word in words ]

    elif method=='fasttext':
        vecs= [fasttext.get_word_vector(word.text) for word in words]
  
    else:
        print("Error: unknown method!")
    #dont need this for spacy
    return np.mean(vecs,0)
    #return [word.vector for word in words ]
    
    
def make_features2(df,nlp=nlp,method='spacy',stopwords=True,add_to_df=True):
    """
    Build similarity features from a pandas dataframe of question pairs.
    """
    fasttext = None
    
    if method=='spacy':
        print("Loading spacy 'en_core_web_lg'...")
        nlp = spacy.load('en_core_web_lg',disable=['parser', 'tagger', 'ner'])
        
    elif method=='fasttext':
        import fastText as ft
        print("Loading fasttext embeddings...")        
        fasttext = ft.load_model('/data/demo_quora_data/crawl-300d-2M-subword.bin')   
         
    print("Vectorizing question1...")
    q1_vec = [vec(q,nlp=nlp,fasttext=fasttext,method=method) 
                for q in  df['question1'].values]
    print("...Finished vectorizing question 1")

    print("Vectorizing question2...")
    q2_vec = [vec(q,nlp=nlp,fasttext=fasttext,method=method) 
                for q in  df['question2'].values]
    print("...Finished vectorizing question2")
    
    # BoW difference vector
    bow_diff=np.array([abs(q2 - q1) for (q1,q2) in zip(q1_vec,q2_vec)])
    
    # BoW cosine feature
    print("Building BoW cosine similarity...")
    cos_sim = np.array([cosine_similarity(q1,q2) for (q1,q2) in zip(q1_vec,q2_vec)])

    # BoW distance feature
    print("Building BoW euclidean similarity...")    
    euclidean_sim = np.array([np.linalg.norm(q2 - q1) for (q1,q2) in zip(q1_vec,q2_vec)])

    # BoW sum vector
    #bow_sum=np.array([ q1 + q2 for (q1,q2) in zip(q1_vec,q2_vec)])
    
    X = np.hstack((q1_vec, 
                   q2_vec, 
                   bow_diff, 
                   cos_sim.reshape(-1,1),
                   euclidean_sim.reshape(-1,1)))

    if add_to_df:
        df['cos_sim'] = cos_sim
        df['euclidean_sim'] = euclidean_sim
        
    return X




In [131]:
df = load_df("quora_duplicate_questions.tsv")  
df = df[0:1000]
    
# build data features
X = make_features(df,method='spacy')
y = df["is_duplicate"]
    
# split data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state = 4)
print("Train shape:",X_train.shape)
print("Test shape:",X_test.shape)     

length of orignal data set:  404290
length of processed data set:  404288
Loading spacy 'en_core_web_lg'...
Vectorizing question1...
...Finished vectorizing question 1
Vectorizing question2...
...Finished vectorizing question2
Building BoW cosine similarity...
Building BoW euclidean similarity...
Train shape: (800, 600)
Test shape: (200, 600)


In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, 
                                                        random_state = 4)
print("Train shape:",X_train.shape)
print("Test shape:",X_test.shape)     
    
# train model
print("Training model...")
model = LogisticRegression()
model.fit(X_train, y_train)   

# make predictions on test data
y_pred = model.predict(X_test)

# print results
print("\nTrain Accuracy : %0.2f%%"%(100*accuracy_score(y_train, model.predict(X_train))))
print("Test Accuracy  : %0.2f%%"%(100*accuracy_score(y_test,y_pred)))
print("Test logloss   : %0.2f"%log_loss(y_test,model.predict_proba(X_test)[:,1]))
print()
print(classification_report(y_test, y_pred, target_names=['not duplicate','is duplicate']))

Train shape: (800, 600)
Test shape: (200, 600)
Training model...

Train Accuracy : 82.00%
Test Accuracy  : 64.00%
Test logloss   : 0.64

               precision    recall  f1-score   support

not duplicate       0.66      0.83      0.74       121
 is duplicate       0.57      0.35      0.44        79

    micro avg       0.64      0.64      0.64       200
    macro avg       0.62      0.59      0.59       200
 weighted avg       0.63      0.64      0.62       200





In [6]:
import numpy as np
import pandas as pd
import os
import math
import pickle
import argparse

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *

#from utils_infersent import load_infersent, build_features

def load_quora_df(filename):
    """Load quora duplicate question data from tsv file
    """
    df = pd.read_csv(quora_filename, sep='\t',  encoding = 'utf8',keep_default_na=False)
    print("length of orignal data set: ", len(df))
    
    df = df.drop(['id', 'qid1', 'qid2'], axis=1)

    # drop rows wo questions
    df = df[df['question1']!='']
    df = df[df['question2']!='']

    print("length of processed data set: ",len(df))
    return df

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument('--quora_filename', default='./quora_duplicate_questions.tsv',type=str,
                        help='file containing quora questions in tsv format')  
    parser.add_argument('--nrows', default=None, type=int,
                        help='number of rows of quora data to use')                        
    parser.add_argument('--model_filename', default='./model_infersent/saved_model_infersent.pkl',type=str,
                        help='pickle file containing stored model')
                          
    parser.add_argument('--use_cuda',  default=True, type=lambda x: (str(x).lower() == 'true'),
                        help='Use cuda on GPU')    
    parser.add_argument('--test_size', default=0.25,type=float,
                        help='test size fraction')     
                        
    parser.add_argument('--random_state', default=42,type=int,
                        help='random state seed')                          

    parser.add_argument('--infersent_encoder_path', 
                        default='./model_infersent/infersent2.pkl',type=str,
                        help='path to infersent enocoder model')                          

    parser.add_argument('--word_vec_path', 
                        default='./model_infersent/crawl-300d-2M.vec',type=str,
                        help='path to word vector embedding')        
                                       
    args = parser.parse_args()
    quora_filename = args.quora_filename
    nrows = args.nrows     
    model_filename = args.model_filename
    use_cuda = args.use_cuda     
    test_size = args.test_size
    random_state = args.random_state
    encoder_path = args.infersent_encoder_path
    word_vec_path = args.word_vec_path   
            
    # read quora tsv
    print("Loading quora tsv file...") 
    df = load_quora_df(quora_filename)
    if nrows: df = df[0:nrows]
    
    # get infersent encoder
    encoder = load_infersent(encoder_path,word_vec_path,use_cuda)

    # build data features
    X = build_features(df,encoder)
    y = df["is_duplicate"]
        
    # split data into training and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=test_size, 
                                                        random_state = random_state)
                                                        
    del X, encoder, df                                          
                                                        
                                                        
    print("Train shape:",X_train.shape)
    print("Test shape:",X_test.shape)     
    
    # train model
    print("Training model...")
    #model = LogisticRegression()
    model = SGDClassifier(loss='modified_huber',tol=1e-4,verbose=1,random_state=random_state)
    model.fit(X_train, y_train)   

    # make predictions on test data
    y_pred = model.predict(X_test)

    # print results
    print("\nTrain Accuracy : %0.2f%%"%(100*accuracy_score(y_train, model.predict(X_train))))
    print("Test Accuracy  : %0.2f%%"%(100*accuracy_score(y_test,y_pred)))
    print("Test logloss   : %0.2f"%log_loss(y_test,model.predict_proba(X_test)[:,1]))
    print()
    print(classification_report(y_test, y_pred, target_names=['not duplicate','is duplicate']))


    #save model
    with open(model_filename, 'wb') as file:  
        pickle.dump(model, file)
        
    print("Finished!")


usage: ipykernel_launcher.py [-h] [--quora_filename QUORA_FILENAME]
                             [--nrows NROWS] [--model_filename MODEL_FILENAME]
                             [--use_cuda USE_CUDA] [--test_size TEST_SIZE]
                             [--random_state RANDOM_STATE]
                             [--infersent_encoder_path INFERSENT_ENCODER_PATH]
                             [--word_vec_path WORD_VEC_PATH]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-e991e4d7-c78b-4492-bffe-b2cb81222786.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
