# NLP Task 1: Artur Xarles & Enric Azuara

This notebook contains all the code needed to train and store the models on the disk. All the needed functions to compile are contained into a auxiliar file named *utils.py*.

### Import some libraries

In [1]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import os
from tqdm import tqdm
from scipy.sparse import csr_matrix
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Enric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Split between train, validation and test

In [2]:
train_df = pd.read_csv("quora_train_data.csv")
A_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.05, random_state=123)
train_df, val_df = sklearn.model_selection.train_test_split(A_df, test_size=0.05, random_state=123)

### Feature engineering

In [3]:
#from utils.py import *

In [4]:
def try_divide(x, y, val=0.0):
    if y != 0.0:
        val = float(x) / y
    return val

def get_jaccard(q1, q2):
    '''0 means equal and 1 totally different
    '''
    set1, set2 = set(q1), set(q2)
    return 1 -  try_divide(len(set1 & set2), float(len(set1 | set2)))

def get_dice(q1,q2):
    
    q1, q2 = set(q1), set(q2)
    intersect = len(q1 & q2)
    union = float(len(q1) + len(q2))
    d = try_divide(2 * intersect, union)
    return d

def get_sorensen(q1, q2): 
    '''0 means equal and 1 totally different
    '''
    set1, set2 = set(q1), set(q2)
    return 1-  try_divide(2 * len(set1 & set2),float(len(set1) + len(set2)))

def get_count_words_in_both(q1, q2):
    set1, set2 = set(q1), set(q2)
    return len(set1 & set2)

def get_ratio_words_in_both(q1, q2):
    set1, set2 = set(q1), set(q2)
    try:
        return len(set1 & set2)/float(len(set1))
    except:
        return 0.0

def get_num_of_words(q1):
    return len(q1)

def get_num_of_unique_words(q1):
    set1 = set(q1)
    return len(set1)

def get_count_of_digit(q1):
    return sum([1. for k in seq1 if k.isdigit()])

def get_ratio_of_digit(q1):
    try:
        return sum([1. for k in seq1 if k.isdigit()])/float(len(seq1))
    except:
        return 0.0

def get_sim_feature(q1, q2):

    X_jaccard = np.array([ get_jaccard(x1, x2) for x1,x2 in zip(q1, q2)]).reshape(-1,1)
    X_dice = np.array([ get_dice(x1, x2)  for x1,x2 in zip(q1, q2)]).reshape(-1,1)
    X_count = np.array([ get_count_words_in_both(x1, x2)  for x1, x2 in zip(q1, q2)]).reshape(-1,1)
    X_ratio = np.array([ get_ratio_words_in_both(x1, x2)  for x1, x2 in zip(q1, q2)]).reshape(-1,1)
    X_len1 = np.array([ get_num_of_words(x1)  for x1 in  q1]).reshape(-1,1)
    X_len2 = np.array([ get_num_of_words(x2)  for x2 in  q2]).reshape(-1,1)

    X_len1_unique = np.array([ get_num_of_unique_words(x1)  for x1 in  q1]).reshape(-1,1)
    X_len2_unique = np.array([ get_num_of_unique_words(x2)  for x2 in  q2]).reshape(-1,1)

    X_len_diff = np.abs(X_len2-X_len1)


    X_sim = np.hstack([X_jaccard,X_dice,X_count,X_ratio,X_len1,X_len2,X_len1_unique,X_len2_unique,X_len_diff])
    

    return X_sim

In [5]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    #assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    mylist_of_strings = []
    for x in mylist:
        mylist_of_strings.append(str(x))

    return mylist_of_strings

'''
FUNCTION FOR CLASS COUNT_VECTORIZER
'''
class count_vectorizer:
    
    def __init__(self, sentences, tokken_pattern = r'(?u)\b\w\w+\b', lower_case = True, stop_words = False, stemming = False, lemmatization = False):
        
        
        self.lemmatization = lemmatization
        if self.lemmatization:
            self.lemmatizer = WordNetLemmatizer()
        
        self.stemming = stemming
        if self.stemming:
            self.st = SnowballStemmer('english')
        
        
        if stop_words:
            self.stop = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "doing", "a", "an", "the", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "again", "further", "then", "once", "here", "there", "own", "same", "so", "than", "s", "t", "can", "will", "just", "don", "should", "now"]
        else:
            self.stop = []
        
        self.tokken_pattern = tokken_pattern
        self.lower = lower_case
        self.documents = sentences
        self.N = len(self.documents)
        #self.sentences = ' '.join(sentences)
        #self.all_words = re.findall(r'(?u)\b\w\w+\b', self.sentences) #Get all the words that satisfy the regular expression
        self.words = [] #List of all different words
        self.word2index = {} #Dictionary from word to his index
        
        
        return
    
    def fit(self):
        for document in tqdm(self.documents):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, document)
            for word in all_words:
                if self.stemming:
                    word = self.st.stem(word)
                if self.lemmatization:
                    word = self.lemmatizer.lemmatize(word)
                if word not in self.stop:
                    if word in self.word2index.keys():
                        pass
                    else:
                        self.words.append(word)
                        self.word2index[word] = len(self.words) - 1                  
        return
    
    def transform(self, sentences):
        row = []
        col = []
        data = []
        i = 0 #Defines number of document (row in sparse matrix)
        for document in tqdm(sentences):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, str(document))
            for word in set(all_words):
                if self.stemming:
                    word = self.st.stem(word)
                if self.lemmatization:
                    word = self.lemmatizer.lemmatize(word)
                if word not in self.stop:
                    if word in self.words:
                        row.append(i) #index representing number of the word
                        col.append(self.word2index[word]) #Column of the word (index)
                        data.append(all_words.count(word)) #Number of times word appears in sentence
            i += 1
        return csr_matrix((data, (row, col)), shape=(len(sentences), len(self.words)))
    
'''
FUNCTION FOR CLASS TF_IDF
'''
class tf_idf:

    def __init__(self, sentences, tokken_pattern = r'(?u)\b\w\w+\b', lower_case = True, stop_words = False, stemming = False, lemmatization = False):
        
        
        self.lemmatization = lemmatization
        if self.lemmatization:
            self.lemmatizer = WordNetLemmatizer()
        
        self.stemming = stemming
        if self.stemming:
            self.st = SnowballStemmer('english')
        
        if stop_words:
            self.stop = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "doing", "a", "an", "the", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "again", "further", "then", "once", "here", "there", "own", "same", "so", "than", "s", "t", "can", "will", "just", "don", "should", "now"]
        else:
            self.stop = []
        self.tokken_pattern = tokken_pattern
        self.lower = lower_case
        self.documents = sentences
        self.N = len(self.documents)
        #self.sentences = ' '.join(sentences)
        #self.all_words = re.findall(r'(?u)\b\w\w+\b', self.sentences) #Get all the words that satisfy the regular expression
        self.words = [] #List of all different words
        self.word2index = {} #Dictionary from word to his index
        self.count_word = []
        

        return


    def fit(self):
        for document in tqdm(self.documents):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, document)
            for word in all_words:
                if self.stemming:
                    word = self.st.stem(word)
                if self.lemmatization:
                    word = self.lemmatizer.lemmatize(word)
                if word not in self.stop:
                    if word in self.word2index.keys():
                        pass
                    else:
                        self.words.append(word)
                        self.word2index[word] = len(self.words) - 1
                        self.count_word.append(0)
            for word in set(all_words):
                if self.stemming:
                    word = self.st.stem(word)
                if self.lemmatization:
                    word = self.lemmatizer.lemmatize(word)
                if word not in self.stop:
                    self.count_word[self.word2index[word]] += 1
        self.idf = np.log(self.N / (1 + np.array(self.count_word)))
                    
        return
    
    def transform(self, sentences):
        row = []
        col = []
        data = []
        i = 0 #Defines number of document (row in sparse matrix)
        for document in tqdm(sentences):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, str(document))
            for word in set(all_words):
                if self.stemming:
                    word = self.st.stem(word)
                if self.lemmatization:
                    word = self.lemmatizer.lemmatize(word)
                if word not in self.stop:
                    if word in self.words:
                        row.append(i) #index representing number of the word
                        col.append(self.word2index[word]) #Column of the word (index)
                        data.append(all_words.count(word)) #Number of times word appears in sentence
            i += 1
        return csr_matrix((data, (row, col)), shape=(len(sentences), len(self.words))).multiply(csr_matrix(self.idf))


In [6]:
#Get train data as list of strings 
q1_list_train = cast_list_as_strings(list(train_df.question1))
q2_list_train = cast_list_as_strings(list(train_df.question2))
full_list_train = q1_list_train + q2_list_train

#Get validation data as list of strings
q1_list_val = cast_list_as_strings(list(val_df.question1))
q2_list_val = cast_list_as_strings(list(val_df.question2))

#Get test data as list of strings
q1_list_test = cast_list_as_strings(list(test_df.question1))
q2_list_test = cast_list_as_strings(list(test_df.question2))

#### Compute count vectorizer and TF-IDF vectors for questions

Count Vectorizer

In [7]:
CountVectorizer = count_vectorizer(full_list_train)
CountVectorizer.fit()
q1_train_count = CountVectorizer.transform(q1_list_train)
q2_train_count = CountVectorizer.transform(q2_list_train)
q1_val_count = CountVectorizer.transform(q1_list_val)
q2_val_count = CountVectorizer.transform(q2_list_val)
q1_test_count = CountVectorizer.transform(q1_list_test)
q2_test_count = CountVectorizer.transform(q2_list_test)

100%|██████████████████████████████████████████████████████████████████████| 583794/583794 [00:03<00:00, 161356.98it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:21<00:00, 3592.72it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:25<00:00, 3398.31it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3141.72it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3220.73it/s]
100%|██████████████████████████████████████████████████████████████████████████| 16172/16172 [00:05<00:00, 3067.53it/s]
100%|██████████████████████████████████████████████████████████████████████████| 16172/16172 [00:05<00:00, 3231.17it/s]


TF-IDF

In [8]:
TFIDF = tf_idf(full_list_train)
TFIDF.fit()
q1_train_tfidf = TFIDF.transform(q1_list_train)
q2_train_tfidf = TFIDF.transform(q2_list_train)
q1_val_tfidf = TFIDF.transform(q1_list_val)
q2_val_tfidf = TFIDF.transform(q2_list_val)
q1_test_tfidf = TFIDF.transform(q1_list_test)
q2_test_tfidf = TFIDF.transform(q2_list_test)

100%|██████████████████████████████████████████████████████████████████████| 583794/583794 [00:05<00:00, 108430.63it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:20<00:00, 3603.72it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:25<00:00, 3400.11it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3115.24it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3226.47it/s]
100%|██████████████████████████████████████████████████████████████████████████| 16172/16172 [00:05<00:00, 3064.88it/s]
100%|██████████████████████████████████████████████████████████████████████████| 16172/16172 [00:04<00:00, 3239.22it/s]


### Different operations merging vectors of questions 1 and 2 before introducing to the model

In [9]:
def stack_features(feat1, feat2):
    return scipy.sparse.hstack((feat1,feat2))

def difference(feat1, feat2):
    return feat1 - feat2

def similarity(feat1, feat2, dist = 'cosine'):
    if dist == 'cosine':
        dif = (feat1.multiply(feat2).sum(axis = 1).squeeze(-1))
        q1 = np.sqrt(feat1.multiply(feat1).sum(axis = 1)).squeeze(-1)
        q2 = np.sqrt(feat2.multiply(feat2).sum(axis = 1)).squeeze(-1)
        distances = np.multiply(dif, 1/(np.multiply(q1, q2) + 0.0001))
        return np.reshape(np.asarray(distances), (feat1.shape[0], 1))

def different_product(feat1, feat2):
    return -(feat1 != feat2).astype(int) + feat1.multiply(feat2)

### Train different methods using both feature vectors and using different merging operations and validate with validation data. We use logistic regression as a simple model to compare between options.

In [10]:
'''
Count Vectorizer
'''
print('---------------------------------\nMethods using Count Vectorizer features:\n---------------------------------')

print('**** \nStack features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = stack_features(q1_train_count, q2_train_count)
features_val = stack_features(q1_val_count, q2_val_count)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

print('**** \nDifference of features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = difference(q1_train_count, q2_train_count)
features_val = difference(q1_val_count, q2_val_count)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

print('**** \nSimilarity of features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = similarity(q1_train_count, q2_train_count)
features_val = similarity(q1_val_count, q2_val_count)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

print('**** \nDifferent/product features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = different_product(q1_train_count, q2_train_count)
features_val = different_product(q1_val_count, q2_val_count)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))


'''
TF-IDF features
'''
print('---------------------------------\nMethods using TF-IDF features:\n---------------------------------')

print('**** \nStack features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = stack_features(q1_train_tfidf, q2_train_tfidf)
features_val = stack_features(q1_val_tfidf, q2_val_tfidf)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

print('**** \nDifference of features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = difference(q1_train_tfidf, q2_train_tfidf)
features_val = difference(q1_val_tfidf, q2_val_tfidf)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

print('**** \nSimilarity of features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = similarity(q1_train_tfidf, q2_train_tfidf)
features_val = similarity(q1_val_tfidf, q2_val_tfidf)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

print('**** \nDifferent/product features \n****')
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
features_train = different_product(q1_train_tfidf, q2_train_tfidf)
features_val = different_product(q1_val_tfidf, q2_val_tfidf)
logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))

---------------------------------
Methods using Count Vectorizer features:
---------------------------------
**** 
Stack features 
****
logloss train: 0.408969131655398
Accuracy train: 0.8139754776513634
logloss validation: 0.5176233053892005
Accuracy validation: 0.7489422638807525
**** 
Difference of features 
****
logloss train: 0.5869796722329774
Accuracy train: 0.6573346077554754
logloss validation: 0.6631064091985044
Accuracy validation: 0.5822430514873397
**** 
Similarity of features 
****
logloss train: 0.5909700864104208
Accuracy train: 0.6460292500436798
logloss validation: 0.5940441277862787
Accuracy validation: 0.6416064570721864
**** 
Different/product features 
****
logloss train: 0.3946830067355933
Accuracy train: 0.8159076660602884
logloss validation: 0.45767637962131996
Accuracy validation: 0.7715289982425307
---------------------------------
Methods using TF-IDF features:
---------------------------------
**** 
Stack features 
****




logloss train: 0.3246501498449055
Accuracy train: 0.8450378044310151
logloss validation: 0.682246978317021
Accuracy validation: 0.724598060274686
**** 
Difference of features 
****
logloss train: 0.5196533113418434
Accuracy train: 0.6654676135760217
logloss validation: 0.8576855995179238
Accuracy validation: 0.5389572349150556
**** 
Similarity of features 
****
logloss train: 0.5753137822574135
Accuracy train: 0.6580437620119426
logloss validation: 0.5815729735924393
Accuracy validation: 0.6501334374796589
**** 
Different/product features 
****
logloss train: 0.42034648870590574
Accuracy train: 0.7965960595689575
logloss validation: 0.7818021595869908
Accuracy validation: 0.7461433313805897




Model using 2 features: different_product function and adding cosine similarity between vectors

In [11]:
features_train = scipy.sparse.hstack((different_product(q1_train_count, q2_train_count),
                                      different_product(q1_train_tfidf, q2_train_tfidf),
                                      similarity(q1_train_count, q2_train_count),
                                      similarity(q1_train_tfidf, q2_train_tfidf)))
features_val = scipy.sparse.hstack((different_product(q1_val_count, q2_val_count),
                                    different_product(q1_val_tfidf, q2_val_tfidf),
                                    similarity(q1_val_count, q2_val_count),
                                    similarity(q1_val_tfidf, q2_val_tfidf)))

logistic.fit(features_train, train_df['is_duplicate'].values)

print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))



logloss train: 0.33373635479721747
Accuracy train: 0.8442190224634032
logloss validation: 0.5755475316897847
Accuracy validation: 0.7869556727201719


### Adding lemmatization

In [12]:
CountVectorizer = count_vectorizer(full_list_train, lemmatization = True)
CountVectorizer.fit()
q1_train_count = CountVectorizer.transform(q1_list_train)
q2_train_count = CountVectorizer.transform(q2_list_train)
q1_val_count = CountVectorizer.transform(q1_list_val)
q2_val_count = CountVectorizer.transform(q2_list_val)

100%|███████████████████████████████████████████████████████████████████████| 583794/583794 [00:22<00:00, 25553.43it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:18<00:00, 3708.72it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:22<00:00, 3533.66it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3299.51it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3357.28it/s]


In [13]:
TFIDF = tf_idf(full_list_train, lemmatization = True)
TFIDF.fit()
q1_train_tfidf = TFIDF.transform(q1_list_train)
q2_train_tfidf = TFIDF.transform(q2_list_train)
q1_val_tfidf = TFIDF.transform(q1_list_val)
q2_val_tfidf = TFIDF.transform(q2_list_val)

100%|███████████████████████████████████████████████████████████████████████| 583794/583794 [00:39<00:00, 14598.54it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:19<00:00, 3694.74it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:23<00:00, 3481.98it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:05<00:00, 3003.68it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3175.39it/s]


In [14]:
features_train = scipy.sparse.hstack((different_product(q1_train_count, q2_train_count),
                                      different_product(q1_train_tfidf, q2_train_tfidf),
                                      similarity(q1_train_count, q2_train_count),
                                      similarity(q1_train_tfidf, q2_train_tfidf)))
features_val = scipy.sparse.hstack((different_product(q1_val_count, q2_val_count),
                                    different_product(q1_val_tfidf, q2_val_tfidf),
                                    similarity(q1_val_count, q2_val_count),
                                    similarity(q1_val_tfidf, q2_val_tfidf)))

logistic.fit(features_train, train_df['is_duplicate'].values)

print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))



logloss train: 0.35576466955308245
Accuracy train: 0.8324888573709219
logloss validation: 0.5578075325907522
Accuracy validation: 0.7814879906268307


### Adding stemming

In [15]:
CountVectorizer = count_vectorizer(full_list_train, stemming = True)
CountVectorizer.fit()
q1_train_count = CountVectorizer.transform(q1_list_train)
q2_train_count = CountVectorizer.transform(q2_list_train)
q1_val_count = CountVectorizer.transform(q1_list_val)
q2_val_count = CountVectorizer.transform(q2_list_val)

100%|███████████████████████████████████████████████████████████████████████| 583794/583794 [00:56<00:00, 10357.42it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:24<00:00, 3465.64it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:24<00:00, 3463.96it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3265.95it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3400.76it/s]


In [16]:
TFIDF = tf_idf(full_list_train, stemming = True)
TFIDF.fit()
q1_train_tfidf = TFIDF.transform(q1_list_train)
q2_train_tfidf = TFIDF.transform(q2_list_train)
q1_val_tfidf = TFIDF.transform(q1_list_val)
q2_val_tfidf = TFIDF.transform(q2_list_val)

100%|████████████████████████████████████████████████████████████████████████| 583794/583794 [01:49<00:00, 5347.31it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:21<00:00, 3563.58it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:25<00:00, 3430.34it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3262.47it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3273.95it/s]


In [17]:
features_train = scipy.sparse.hstack((different_product(q1_train_count, q2_train_count),
                                      different_product(q1_train_tfidf, q2_train_tfidf),
                                      similarity(q1_train_count, q2_train_count),
                                      similarity(q1_train_tfidf, q2_train_tfidf)))
features_val = scipy.sparse.hstack((different_product(q1_val_count, q2_val_count),
                                    different_product(q1_val_tfidf, q2_val_tfidf),
                                    similarity(q1_val_count, q2_val_count),
                                    similarity(q1_val_tfidf, q2_val_tfidf)))

logistic.fit(features_train, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       logistic.predict_proba(features_train))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            logistic.predict_proba(features_val))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val)).mean()))



logloss train: 0.42634853384725335
Accuracy train: 0.7907412546206367
logloss validation: 0.5530994462151946
Accuracy validation: 0.7580550673696543


No improvements with stemming or lemmatization. We will try adding extra features now

### Extra features added

In [18]:
CountVectorizer = count_vectorizer(full_list_train)
CountVectorizer.fit()
q1_train_count = CountVectorizer.transform(q1_list_train)
q2_train_count = CountVectorizer.transform(q2_list_train)
q1_val_count = CountVectorizer.transform(q1_list_val)
q2_val_count = CountVectorizer.transform(q2_list_val)

100%|██████████████████████████████████████████████████████████████████████| 583794/583794 [00:03<00:00, 157981.84it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:30<00:00, 3233.61it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:38<00:00, 2968.55it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:05<00:00, 2804.49it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:05<00:00, 3014.13it/s]


In [19]:
TFIDF = tf_idf(full_list_train)
TFIDF.fit()
q1_train_tfidf = TFIDF.transform(q1_list_train)
q2_train_tfidf = TFIDF.transform(q2_list_train)
q1_val_tfidf = TFIDF.transform(q1_list_val)
q2_val_tfidf = TFIDF.transform(q2_list_val)

100%|██████████████████████████████████████████████████████████████████████| 583794/583794 [00:05<00:00, 102489.64it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:26<00:00, 3373.55it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:30<00:00, 3234.60it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:05<00:00, 2996.77it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:05<00:00, 3072.26it/s]


In [20]:
X_train = get_sim_feature(train_df['question1'].apply(lambda x: str(x).split(' ')),
                    train_df['question2'].apply(lambda x: str(x).split(' ')))
X_val = get_sim_feature(val_df['question1'].apply(lambda x: str(x).split(' ')),
                         val_df['question2'].apply(lambda x: str(x).split(' ')))

Stack with only product feature

In [23]:
features_train_2 = scipy.sparse.hstack((different_product(q1_train_count, q1_train_count),
                                      different_product(q1_train_tfidf, q1_train_tfidf),
                                      X_train))
features_val_2 = scipy.sparse.hstack((different_product(q1_val_count, q1_val_count),
                                    different_product(q1_val_tfidf, q1_val_tfidf),
                                    X_val))

logistic.fit(features_train_2, train_df['is_duplicate'].values)
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       logistic.predict_proba(features_train_2))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train_2)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            logistic.predict_proba(features_val_2))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val_2)).mean()))



logloss train: 0.3597107786575165
Accuracy train: 0.8298920509631822
logloss validation: 0.5058738592332671
Accuracy validation: 0.7696413460912582


Stack with product and cosine feature

In [24]:
features_train_3 = scipy.sparse.hstack((different_product(q1_train_count, q2_train_count),
                                      different_product(q1_train_tfidf, q2_train_tfidf),
                                      similarity(q1_train_count, q2_train_count),
                                      similarity(q1_train_tfidf, q2_train_tfidf),
                                      X_train))
features_val_3 = scipy.sparse.hstack((different_product(q1_val_count, q2_val_count),
                                    different_product(q1_val_tfidf, q2_val_tfidf),
                                    similarity(q1_val_count, q2_val_count),
                                    similarity(q1_val_tfidf, q2_val_tfidf),
                                    X_val))
logistic.fit(features_train_3, train_df['is_duplicate'].values)

print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       logistic.predict_proba(features_train_3))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == logistic.predict(features_train_3)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            logistic.predict_proba(features_val_3))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == logistic.predict(features_val_3)).mean()))



logloss train: 0.3650059595931236
Accuracy train: 0.8272267272359771
logloss validation: 0.4659019264362046
Accuracy validation: 0.7844171060339777


## Gradient Boosting

In [25]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
xgb = XGBClassifier()






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
xgb.fit(features_train_2, train_df['is_duplicate'].values)
print('------------------------------------------------------------------\nMethods with Gradient Boosting + IDF/Count + Extra Features \n------------------------------------------------------------------')
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       xgb.predict_proba(features_train_2))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == xgb.predict(features_train_2)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            xgb.predict_proba(features_val_2))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == xgb.predict(features_val_2)).mean()))

------------------------------------------------------------------
Methods with Gradient Boosting + IDF/Count + Extra Features 
------------------------------------------------------------------
logloss train: 0.4308414600972943
Accuracy train: 0.7812207730809155
logloss validation: 0.4458644383511
Accuracy validation: 0.770812992254117


In [27]:
xgb.fit(features_train_3, train_df['is_duplicate'].values)
print('------------------------------------------------------------------\nMethods with Gradient Boosting + IDF/Count + Extra Features + Cosine\n------------------------------------------------------------------')

print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       xgb.predict_proba(features_train_3))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == xgb.predict(features_train_3)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            xgb.predict_proba(features_val_3))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == xgb.predict(features_val_3)).mean()))



------------------------------------------------------------------
Methods with Gradient Boosting + IDF/Count + Extra Features + Cosine
------------------------------------------------------------------
logloss train: 0.38396660680926903
Accuracy train: 0.8148764804023337
logloss validation: 0.4030727431468267
Accuracy validation: 0.7985419514417756


In [33]:
from sklearn.model_selection import GridSearchCV

estimator = XGBClassifier()

parameters = {
    'max_depth': range (2, 8, 4),
    'n_estimators': range(100, 180, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

grid_search = GridSearchCV(
    estimator = estimator,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 5,
    verbose=True
)

grid_search.fit(features_train_3, train_df['is_duplicate'].values)

print(grid_search.best_estimator_)

Best = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=8, min_child_weight=1, 
              monotone_constraints='()', n_estimators=140, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Best.fit(features_train_3, train_df['is_duplicate'].values)

Fitting 5 folds for each of 36 candidates, totalling 180 fits






















































































































































































































































































































































































































































































































































































































































































































































XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=8, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=140, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [40]:
aaa = XGBClassifier(grid_search.best_estimator_)



In [36]:
Best = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=8, min_child_weight=1, 
              monotone_constraints='()', n_estimators=140, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Best.fit(features_train_3, train_df['is_duplicate'].values)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=8, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=140, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
print('------------------------------------------------------------------\nMethods with Gradient Boosting + IDF/Count + Extra Features \n------------------------------------------------------------------')
print('logloss train: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values,
                                                       Best.predict_proba(features_train_3))))
print('Accuracy train: ' + str((train_df['is_duplicate'].values == Best.predict(features_train_3)).mean()))
print('logloss validation: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values,
                                                            Best.predict_proba(features_val_3))))
print('Accuracy validation: ' + str((val_df['is_duplicate'].values == Best.predict(features_val_3)).mean()))

------------------------------------------------------------------
Methods with Gradient Boosting + IDF/Count + Extra Features 
------------------------------------------------------------------
logloss train: 0.39159291854801986
Accuracy train: 0.8113992264394633
logloss validation: 0.4117537289317768
Accuracy validation: 0.7934648180693875


BETTER RESULTS WITH different_product, cosine + extra features

## Grid Search for GB + Logistic?

## AUC-ROC

## Final model

# A PARTIR D'AQUÍ MERDA

### Simple method with count vectorizer (should be implemented from scratch?)

In [18]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(full_list_train)

CountVectorizer()

In [19]:
def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))
    
    ############### Begin exercise ###################
    # what is kaggle                  q1
    # What is the kaggle platform     q2
    X_q1 = count_vectorizer.transform(q1_casted)
    X_q2 = count_vectorizer.transform(q2_casted)    
    X_q1q2 = scipy.sparse.hstack((X_q1,X_q2))
    ############### End exercise ###################

    return X_q1q2

In [21]:
X_tr_q1q2 = get_features_from_df(train_df,count_vectorizer)
X_val_q1q2  = get_features_from_df(val_df, count_vectorizer)

In [22]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
logistic.fit(X_tr_q1q2, train_df['is_duplicate'].values)

LogisticRegression(random_state=123, solver='liblinear')

In [59]:
#Accuracy of the first simple model in validation set
print((logistic.predict(X_tr_q1q2) == train_df.is_duplicate.values).mean())
print((logistic.predict(X_val_q1q2) == val_df.is_duplicate.values).mean())

0.81397205178539
0.7490073553342446


### TF-IDF

In [32]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit_transform(full_list_train)

<583794x74825 sparse matrix of type '<class 'numpy.int64'>'
	with 5861249 stored elements in Compressed Sparse Row format>

In [233]:


class count_vectorizer:
    
    def __init__(self, sentences, tokken_pattern = r'(?u)\b\w\w+\b', lower_case = True, stop_words = False):
        
        
        if stop_words:
            self.stop = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "doing", "a", "an", "the", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "again", "further", "then", "once", "here", "there", "own", "same", "so", "than", "s", "t", "can", "will", "just", "don", "should", "now"]
        else:
            self.stop = []
        self.tokken_pattern = tokken_pattern
        self.lower = lower_case
        self.documents = sentences
        self.N = len(self.documents)
        #self.sentences = ' '.join(sentences)
        #self.all_words = re.findall(r'(?u)\b\w\w+\b', self.sentences) #Get all the words that satisfy the regular expression
        self.words = [] #List of all different words
        self.word2index = {} #Dictionary from word to his index
        
        return
    
    def fit(self):
        for document in tqdm(self.documents):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, document)
            for word in all_words:
                if word not in self.stop:
                    if word in self.word2index.keys():
                        pass
                    else:
                        self.words.append(word)
                        self.word2index[word] = len(self.words) - 1
                        self.count_word.append(0)                    
        return
    
    def transform(self, sentences):
        row = []
        col = []
        data = []
        i = 0 #Defines number of document (row in sparse matrix)
        for document in tqdm(sentences):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, str(document))
            for word in set(all_words):
                if word not in self.stop:
                    if word in self.words:
                        row.append(i) #index representing number of the word
                        col.append(self.word2index[word]) #Column of the word (index)
                        data.append(all_words.count(word)) #Number of times word appears in sentence
            i += 1
        return csr_matrix((data, (row, col)), shape=(len(sentences), len(self.words)))
    

class tf_idf:

    def __init__(self, sentences, tokken_pattern = r'(?u)\b\w\w+\b', lower_case = True, stop_words = False):
        
        
        if stop_words:
            self.stop = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "doing", "a", "an", "the", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "again", "further", "then", "once", "here", "there", "own", "same", "so", "than", "s", "t", "can", "will", "just", "don", "should", "now"]
        else:
            self.stop = []
        self.tokken_pattern = tokken_pattern
        self.lower = lower_case
        self.documents = sentences
        self.N = len(self.documents)
        #self.sentences = ' '.join(sentences)
        #self.all_words = re.findall(r'(?u)\b\w\w+\b', self.sentences) #Get all the words that satisfy the regular expression
        self.words = [] #List of all different words
        self.word2index = {} #Dictionary from word to his index
        self.count_word = []
        

        return


    def fit(self):
        for document in tqdm(self.documents):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, document)
            for word in all_words:
                if word not in self.stop:
                    if word in self.word2index.keys():
                        pass
                    else:
                        self.words.append(word)
                        self.word2index[word] = len(self.words) - 1
                        self.count_word.append(0)
            for word in set(all_words):
                if word not in self.stop:
                    self.count_word[self.word2index[word]] += 1
        self.idf = np.log(self.N / (1 + np.array(self.count_word)))
                    
        return
    
    def transform(self, sentences):
        row = []
        col = []
        data = []
        i = 0 #Defines number of document (row in sparse matrix)
        for document in tqdm(sentences):
            if self.lower:
                document = str(document).lower()
            all_words = re.findall(self.tokken_pattern, str(document))
            for word in set(all_words):
                if word not in self.stop:
                    if word in self.words:
                        row.append(i) #index representing number of the word
                        col.append(self.word2index[word]) #Column of the word (index)
                        data.append(all_words.count(word)) #Number of times word appears in sentence
            i += 1
        return csr_matrix((data, (row, col)), shape=(len(sentences), len(self.words))).multiply(csr_matrix(self.idf))
        
                
                    
                    
        

In [234]:
sentences_train1 = list(train_df.question1)
sentences_train2 = list(train_df.question2)

In [235]:
tf_idf_object = tf_idf(sentences, stop_words = True)
tf_idf_object.fit()
feat1 = tf_idf_object.transform(sentences_train1)
feat2 = tf_idf_object.transform(sentences_train2)
X_q1q2 = scipy.sparse.hstack((feat1,feat2))

100%|███████████████████████████████████████████████████████████████████████| 583794/583794 [00:12<00:00, 48124.18it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:16<00:00, 3807.52it/s]
100%|████████████████████████████████████████████████████████████████████████| 291897/291897 [01:24<00:00, 3470.40it/s]


In [239]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
logistic.fit(X_q1q2, train_df['is_duplicate'].values)
(logistic.predict(X_q1q2) == train_df['is_duplicate'].values).mean()
print('logloss: ' + str(sklearn.metrics.log_loss(train_df['is_duplicate'].values, logistic.predict_proba(X_q1q2))))
print('Accuracy: ' + str((train_df['is_duplicate'].values == logistic.predict(X_q1q2)).mean()))




logloss: 0.32466481096936267
Accuracy: 0.8444519813495857


In [240]:
feat1_val = tf_idf_object.transform(list(val_df.question1))
feat2_val = tf_idf_object.transform(list(val_df.question2))
feat_val = scipy.sparse.hstack((feat1_val, feat2_val))
print('logloss: ' + str(sklearn.metrics.log_loss(val_df['is_duplicate'].values, logistic.predict_proba(feat_val))))
print('Accuracy: ' + str((val_df['is_duplicate'].values == logistic.predict(feat_val)).mean()))

100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3123.06it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15363/15363 [00:04<00:00, 3261.04it/s]


logloss: 0.6980789653106279
Accuracy: 0.7219293106815076


In [171]:
feat1.multiply(csr_matrix(tf_idf_object.idf))

<291897x94740 sparse matrix of type '<class 'numpy.float64'>'
	with 73245 stored elements in Compressed Sparse Row format>

In [179]:
X_q1q2

<291897x189480 sparse matrix of type '<class 'numpy.float64'>'
	with 140880 stored elements in COOrdinate format>

In [46]:
b = re.search('(?u)\b\w\w+\b', a)

In [66]:
b = re.findall(r'(?u)\b\w\w+\b', a[0:1000])

In [70]:
a[0:105]

'Is Java or C++ or C the most popular language amongst startups for backend development? How do you conver'

In [71]:
len(b)

178

In [74]:
s = {'patata': 1}

In [75]:
s

{'patata': 1}

In [78]:
s['pata'] = 2

In [79]:
s

{'patata': 1, 'pata': 2}

In [91]:
j = ['as', 'asdf', 'as']

In [92]:
set(j)

{'as', 'asdf'}

In [142]:
for element in set(j):
    print(j.count(element))

2
1


In [223]:
X_q1q2

<291897x149650 sparse matrix of type '<class 'numpy.float64'>'
	with 5861249 stored elements in COOrdinate format>

In [224]:
X_tr_q1q2

<291897x149650 sparse matrix of type '<class 'numpy.int64'>'
	with 5861249 stored elements in COOrdinate format>

In [199]:
document = sentences[1]
print(document)
document = str(document).lower()
print(document)
all_words = re.findall(r'(?u)\b\w\w+\b', str(document))
print(all_words)
print(set(all_words))

How do you convert direct speech into reported speech and vice versa including all cases?
how do you convert direct speech into reported speech and vice versa including all cases?
['how', 'do', 'you', 'convert', 'direct', 'speech', 'into', 'reported', 'speech', 'and', 'vice', 'versa', 'including', 'all', 'cases']
{'how', 'speech', 'cases', 'reported', 'you', 'all', 'direct', 'convert', 'versa', 'vice', 'and', 'including', 'into', 'do'}


In [201]:
all_words.count('how')

1