# Code for SwDS dissertation 2
#### This file contains code for the baseline models (Section 2)

#### Yilun Dong (s1994256), August/2020

In [1]:
import re
import string
import nltk
import scipy
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# original data frames from the csv file
train_original = pd.read_csv('quora_train.csv')
test_original = pd.read_csv('quora_test.csv')

In [3]:
# drop nan rows
train = train_original.drop([train_original.index[182601] , train_original.index[219751]])
test = test_original.drop([test_original.index[1495]])

print('The length of the training data is', len(train))
print('The length of the test data is', len(test))

The length of the training data is 323478
The length of the test data is 80870


In [4]:
# drop question ID and pairing ID
train = train.drop(train_original.columns[[0,1,2,3]], axis=1)
test = test.drop(test_original.columns[[0,1,2,3]], axis=1)

# Text Pre-processing

In [5]:
# define punctuation, stemmer and stop words
punctuation = string.punctuation
stemmer_sb = nltk.stem.SnowballStemmer('english')
stop_words = set(nltk.corpus.stopwords.words('english'))

def replaceNumbers(text):
    '''
    This function replaces numbers with words with space. For example, `100` would be replaced by `one zero zero`.
    '''
    scheme = {
        '0': 'zero',
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        '8': 'eight',
        '9': 'nine'
    }
    for character, replacement in scheme.items():
        text = text.replace(character, ' ' + replacement + ' ')
    return text

def textCleaner(text):
    '''
    This function:
    1. substitutes punctuation defined above with blank space;
    2. strips the text and convert the letters to lower case;
    3. extracts the stems of the words in the text using stemmer defined above if `stemming = True`;
    4. removes the stop words defined above.
    '''
    text = re.sub(r'[{}]+'.format(punctuation), ' ' , text)
    text = text.strip().lower()
    text = replaceNumbers(text)
    word_list = nltk.word_tokenize(text)
    word_list = list(map(stemmer_sb.stem, word_list))
    word_list = [word for word in word_list if not word in stop_words]
    new_text = ' '.join(word_list)
    return new_text

In [6]:
train['question1'] = train['question1'].apply(textCleaner)
train['question2'] = train['question2'].apply(textCleaner)

train

Unnamed: 0,question1,question2,is_duplicate
0,doe ban five zero zero one zero zero zero rupe...,ban five zero zero one zero zero zero rupe not...,1
1,hardest thing rais children georgia,hardest thing rais children mexico,0
2,utopia top competitor,axi four one top competitor,0
3,want improv read skill read english news everi...,read newspap help improv english,0
4,gain weight natur way,gain weight,1
...,...,...,...
323475,win trump clinton,indict first trump clinton,0
323476,best earphon one zero zero zero rs,best earphon one zero zero zero rs,1
323477,song make cri whi,song make cri ever,1
323478,tast sperm,tast sperm,1


In [7]:
test['question1'] = test['question1'].apply(textCleaner)
test['question2'] = test['question2'].apply(textCleaner)

test

Unnamed: 0,question1,question2,is_duplicate
0,pros con legalzoom generat,read review legalzoom,0
1,whi doe readi eat poha absorb water instant,ultim teen patti hacker,0
2,app like paytm earn profit give mani cash back...,doe paytm earn give extra cash back alreadi di...,1
3,daili habit great upgrad life,daili habit improv product creativ,1
4,video game world would want live,could live ani video game set would live whi,1
...,...,...,...
80866,favourit beatl song whi like,favorit beatl song whi like,1
80867,legal author rule china claim eight five south...,prove everi bound monoton increas sequenc conv...,0
80868,best simul game android,best simul game android,1
80869,peopl still believ world flat,whi peopl current believ earth flat,1


In [8]:
labels_train = np.asarray(train['is_duplicate'].to_list())
labels_test = np.asarray(test['is_duplicate'].to_list())

In [9]:
# sample of the comparison between processed and original questions
print(train_original.loc[[100]])
print(train.loc[[100]])

     Unnamed: 0     id   qid1   qid2  \
100       16973  16973  33875  33876   

                                        question1  \
100  How many submarines should Indian Navy have?   

                                             question2  is_duplicate  
100  What should i join? Indian Navy or Merchant Navy?             0  
                     question1                       question2  is_duplicate
100  mani submarin indian navi  join indian navi merchant navi             0


# Basic bag-of-words

In [17]:
print('Preparing data...')

bow = CountVectorizer(analyzer = 'word', token_pattern = r'\w{1,}')
bow.fit(train['question1'].append(train['question2']))

x1_train_bow = bow.transform(train['question1'].values)
x2_train_bow = bow.transform(train['question2'].values)
x_train_bow = scipy.sparse.hstack((x1_train_bow, x2_train_bow))
y_train_bow = labels_train

x1_test_bow = bow.transform(test['question1'].values)
x2_test_bow = bow.transform(test['question2'].values)
x_test_bow = scipy.sparse.hstack((x1_test_bow, x2_test_bow))
y_test_bow = labels_test

print('Fitting the model...')

np.random.seed(1)
model_bow = xgb.XGBClassifier(max_depth = 50, n_estimators = 100, random_state = 1).fit(x_train_bow, y_train_bow) 

pred_bow = model_bow.predict(x_test_bow)
print(classification_report(y_test_bow, pred_bow))
print(confusion_matrix(y_test_bow, pred_bow))

Preparing data...
Fitting the model...
              precision    recall  f1-score   support

           0       0.77      0.93      0.84     51191
           1       0.82      0.53      0.64     29679

    accuracy                           0.78     80870
   macro avg       0.80      0.73      0.74     80870
weighted avg       0.79      0.78      0.77     80870

[[47705  3486]
 [14021 15658]]


# TF-IDF with bag-of-2-grams

In [20]:
print('Preparing data...')

ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), max_features = 10000)
ngram.fit(train['question1'].append(train['question2']))

x1_train_ngram = ngram.transform(train['question1'].values)
x2_train_ngram = ngram.transform(train['question2'].values)
x_train_ngram = scipy.sparse.hstack((x1_train_ngram, x2_train_ngram))
y_train_ngram = labels_train

x1_test_ngram = ngram.transform(test['question1'].values)
x2_test_ngram = ngram.transform(test['question2'].values)
x_test_ngram = scipy.sparse.hstack((x1_test_ngram, x2_test_ngram))
y_test_ngram = labels_test
   
print('Fitting the model...')

np.random.seed(1)
model_ngram = xgb.XGBClassifier(max_depth=50, n_estimators=100, random_state = 1).fit(x_train_ngram, y_train_ngram) 

pred_ngram = model_ngram.predict(x_test_ngram)
print(classification_report(y_test_ngram, pred_ngram))
print(confusion_matrix(y_test_ngram, pred_ngram))

Preparing data...
Fitting the model...
              precision    recall  f1-score   support

           0       0.78      0.93      0.85     51191
           1       0.83      0.53      0.65     29679

    accuracy                           0.79     80870
   macro avg       0.80      0.73      0.75     80870
weighted avg       0.79      0.79      0.77     80870

[[47861  3330]
 [13865 15814]]
