In [12]:
#import packages/libraries
import nltk
import pandas as pd
import numpy as np

#Used in POS tagging
import re

#Cleaning data packages
import string
from nltk.stem import WordNetLemmatizer
from string import digits

#Train/test split
from sklearn.model_selection import train_test_split

#Document-Term Matrix/TFIDF packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

#For sentiment analysis
from textblob import TextBlob

#Classifier Packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#Performance Analysis Packages
from sklearn.metrics import classification_report, confusion_matrix
import json

In [13]:
#clean data by eliminating punctuation, numbers, stop words, etc

# Fetch every NLTK resource the cleaning helpers below rely on. The stopword
# corpus has to be downloaded as well: without it the stopwords.words() lookup
# raises LookupError on a machine where it was never installed.
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

# Translation table that deletes every ASCII punctuation character and digit.
# Built once here instead of on every call.
_PUNCT_AND_DIGITS = str.maketrans('', '', string.punctuation + digits)


def txt_clean(txt):
    """Strip punctuation and digits from ``txt`` and lemmatize what is left.

    Stopwords are not handled here; clean_data() removes them before calling
    this function.

    Takes:
        txt : string to clean
    Returns:
        string made of the lemmatized tokens joined by single spaces
    """
    txt = txt.translate(_PUNCT_AND_DIGITS)

    # Raw-string pattern: in a normal literal '\W' is an invalid escape sequence.
    # Empty tokens (produced when the text starts or ends with a separator) are
    # dropped so the result carries no stray leading/trailing spaces.
    tokens = [token for token in re.split(r'\W+', txt) if token]

    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

def clean_data(df_column):
    """Remove English stopwords from every entry of ``df_column``, then clean it.

    Each entry is split on whitespace and words found in the module-level
    ``stopwords`` list are discarded; the remaining text is then passed
    through txt_clean().

    Takes:
        df_column : column of strings exposing .apply() (a dataframe column)
    Returns:
        the result of the two .apply() passes, i.e. the processed column
    """
    # A set gives constant-time membership tests; scanning the list for every
    # single word is needlessly slow on a large corpus.
    stopword_set = set(stopwords)

    def drop_stopwords(text):
        return ' '.join(word for word in text.split() if word not in stopword_set)

    return df_column.apply(drop_stopwords).apply(txt_clean)


[nltk_data] Downloading package wordnet to /home/userina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# preprocessing
# especially for TFIDF matrix

def make_corpus(df):
    '''Build a corpus in which every document appears once, keyed by its
    document ID, so documents are not double counted in tfidf.
    Takes:
        df : dataframe of training data; document IDs are read from
            columns 1 and 2 and the matching texts from columns 3 and 4
            (all positional)
    Returns:
        single-column dataframe (column 0) holding the unique documents
        in order of first appearance'''
    # A set keeps the "already seen" test O(1); with a list the loop was
    # quadratic in the number of distinct IDs.
    seen_ids = set()
    new_corpus = []

    # Pull the four columns out once instead of calling df.iloc[i, j] per cell.
    rows = zip(df.iloc[:, 1].tolist(), df.iloc[:, 2].tolist(),
               df.iloc[:, 3].tolist(), df.iloc[:, 4].tolist())
    for id1, id2, text1, text2 in rows:
        if id1 not in seen_ids:
            seen_ids.add(id1)
            new_corpus.append(text1)
        # Checked after id1 was recorded, so a row whose two IDs are equal
        # contributes only its first text.
        if id2 not in seen_ids:
            seen_ids.add(id2)
            new_corpus.append(text2)

    return pd.DataFrame(new_corpus)


#Create TFIDF vectors for further analysis

def tfidf_matrix(vectorizer, df):
    '''Run the title1_en and title2_en columns of a dataframe through a vectorizer
    Takes:
        vectorizer: object with a transform() method; intended to be a
            TfidfVectorizer() already fit on the training corpus
        df: dataframe with title1_en and title2_en columns
    Returns:
        ( t1, t2 ): tuple with the transform() result for title1_en and
            for title2_en, in that order'''
    title_columns = (df.title1_en, df.title2_en)
    first, second = [vectorizer.transform(list(column)) for column in title_columns]
    return (first, second)

In [15]:
#POS tagging (have not removed upper letter, punctuation, etc.)

def POS_tagging(to_tag):
    """Tokenize the text in the fourth column of ``to_tag`` and POS-tag it.

    Takes:
        to_tag : dataframe whose column at position 3 holds the text to tag
    Returns:
        list with one entry per row, each being the nltk.pos_tag() output
        for that row's tokens
    """
    tagged_vect = []
    # .iloc[:, 3] selects the column by position explicitly; indexing a
    # label-indexed row with [3] relies on pandas' deprecated fallback from
    # integer keys to positions.
    for text in to_tag.iloc[:, 3]:
        # Raw-string pattern avoids the invalid '\W' escape. Empty tokens,
        # produced when the text starts or ends with a separator, are dropped
        # before tagging.
        tokens = [token for token in re.split(r'\W+', text) if token]
        tagged_vect.append(nltk.pos_tag(tokens))
    return tagged_vect

In [16]:
#Sentiment analysis to determine polarity of titles
def getPolarity(text):
    """Return the TextBlob sentiment polarity score of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def getSubjectivity(text):
    """Return the TextBlob sentiment subjectivity score of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

In [17]:
#Cosine similarity function

def cosine_similarity(vector1, vector2):
    """Cosine similarity between the first rows of two sparse matrices.

    Takes:
        vector1, vector2 : objects exposing .toarray() (for example one row
            sliced from a sparse tfidf matrix); only row 0 of each is used
    Returns:
        the cosine of the angle between the two rows, or 0 when it is
        undefined (one of the rows is all zeros, or the result is NaN)
    """
    vec1 = np.asarray(vector1.toarray()[0])
    vec2 = np.asarray(vector2.toarray()[0])
    denominator = np.sqrt(np.sum(vec1**2)) * np.sqrt(np.sum(vec2**2))

    # Guard before dividing: an all-zero vector (e.g. a title made only of
    # out-of-vocabulary words) would otherwise trigger a 0/0 RuntimeWarning.
    if denominator == 0:
        return 0

    r = np.dot(vec1, vec2) / denominator
    # NaN can still arise from NaN inputs; keep reporting it as "no similarity".
    return 0 if pd.isna(r) else r

def cosine_similarity_vectors(t1,t2):
    """Row-by-row cosine similarity between two sparse matrices.

    Takes:
        t1, t2 : scipy sparse matrices of identical shape (as returned by
            tfidf_matrix()); row i of t1 is compared with row i of t2
    Returns:
        list with one similarity per row; 0.0 where the similarity is
        undefined because one of the two rows is all zeros
    Raises:
        ValueError: if the two matrices do not have the same shape
    """
    if t1.shape != t2.shape:
        raise ValueError(
            "t1 and t2 must have the same shape, got %s and %s" % (t1.shape, t2.shape)
        )

    # Work on the sparse matrices as a whole instead of converting every row
    # to a dense array inside a Python loop.
    dot_products = np.asarray(t1.multiply(t2).sum(axis=1), dtype=float).ravel()
    norms1 = np.sqrt(np.asarray(t1.multiply(t1).sum(axis=1), dtype=float).ravel())
    norms2 = np.sqrt(np.asarray(t2.multiply(t2).sum(axis=1), dtype=float).ravel())
    denominators = norms1 * norms2

    # Rows with a zero denominator keep the initial 0.0 (no division happens).
    similarities = np.zeros_like(dot_products)
    np.divide(dot_products, denominators, out=similarities, where=denominators != 0)
    similarities[np.isnan(similarities)] = 0.0
    return similarities.tolist()

In [18]:
#Validation functions
def validation(vectorizer,xvalidate,clf):
    """Predict labels for ``xvalidate`` from the title cosine similarity alone.

    Takes:
        vectorizer : fitted vectorizer handed to tfidf_matrix()
        xvalidate : dataframe with title1_en and title2_en columns
        clf : classifier exposing predict()
    Returns:
        whatever clf.predict() returns for the single-column feature frame
    """
    title1_vectors, title2_vectors = tfidf_matrix(vectorizer, xvalidate)
    features = pd.DataFrame(cosine_similarity_vectors(title1_vectors, title2_vectors))
    return clf.predict(features)

def validationNewModel(vectorizer,xvalidate,clf):
    """Predict labels for ``xvalidate`` from cosine similarity plus sentiment.

    The feature frame has five columns, in this order: the cosine similarity
    of the two titles (column 0), then polarity and subjectivity of title1_en
    and polarity and subjectivity of title2_en.

    Takes:
        vectorizer : fitted vectorizer handed to tfidf_matrix()
        xvalidate : dataframe with title1_en and title2_en columns
        clf : classifier exposing predict(), trained on the same five features
    Returns:
        whatever clf.predict() returns for the feature frame
    """
    t1, t2 = tfidf_matrix(vectorizer, xvalidate)
    x = pd.DataFrame(cosine_similarity_vectors(t1, t2))

    # x carries a fresh 0..n-1 index while xvalidate may not (for instance
    # after a shuffled train/validate split). Assigning a Series would align on
    # index labels and silently turn every mismatch into NaN, so the values
    # are attached by position instead.
    x['t1_polarity'] = xvalidate.title1_en.apply(getPolarity).to_numpy()
    x['t1_subjectivity'] = xvalidate.title1_en.apply(getSubjectivity).to_numpy()

    x['t2_polarity'] = xvalidate.title2_en.apply(getPolarity).to_numpy()
    x['t2_subjectivity'] = xvalidate.title2_en.apply(getSubjectivity).to_numpy()

    # Kept as a safety net so the classifier never receives a missing value.
    sentiment_columns = ['t1_polarity', 't1_subjectivity', 't2_polarity', 't2_subjectivity']
    x[sentiment_columns] = x[sentiment_columns].fillna(0)
    return clf.predict(x)

In [19]:
##########################################################################################
##                            D A T A   I M P O R T   
########################################################################################## 

#import data and remove upper case letters as part of data cleaning
#every column is read as text, cast to str and lower-cased

train_original = pd.read_csv("train.csv", dtype=str).apply(lambda x: x.astype(str).str.lower())
test_original = pd.read_csv("test.csv", dtype=str).apply(lambda x: x.astype(str).str.lower())

#create second copy of data to clean so it can be compared to original copy to ensure proper cleaning
#.copy() is required: plain assignment only binds a second name to the same
#dataframe, so cleaning would overwrite the "original" as well
train_clean = train_original.copy()
test_clean = test_original.copy()

In [31]:
# Preview the first five rows of the test frame.
test_clean.head(5)

Unnamed: 0,id,tid1,tid2,title1_en,title2_en
0,256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...
1,256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...
2,256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...
3,256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...
4,256446,15068,15085,yearold busbus blow yearold child rumor rumorm...,joe johnson disgruntled timing order myth


In [20]:
##########################################################################################
##                         D A T A   C L E A N I N G
########################################################################################## 
# Update train_clean and test_clean dataframe by removing stopwords and cleaning text 
# for columns 3 and 4 by using clean_data() which uses txt_clean()

# Run clean_data() over both title columns, training frame first, then test frame.
for frame in (train_clean, test_clean):
    for title_column in ("title1_en", "title2_en"):
        frame[title_column] = clean_data(frame[title_column])


In [21]:
#Drop the names bound to the raw frames; the memory itself is released only
#once no other name refers to the same dataframe
del train_original, test_original



In [22]:
##########################################################################################
##                         P R E P R O C E S S I N G
########################################################################################## 

#Features are every column except the last one; the last column is the target
X_train, y_train = train_clean.iloc[:, :-1], train_clean.iloc[:, -1]

#A random 70% train / 30% validate split was used previously:
#X_train,X_validate,y_train,y_validate = train_test_split(X, y, test_size=0.3, train_size=0.7)
#This time all of the provided training data is used for training


In [24]:
# Build the de-duplicated corpus for TFIDF
# (this cell was reported to take about 9.5 minutes to run)
new_corpus = make_corpus(train_clean)

# Create the TfidfVectorizer and fit it on the corpus text (column 0);
# fit() returns the vectorizer itself
vectorizer = TfidfVectorizer().fit(new_corpus[0])

# TFIDF vectors for the title1 / title2 columns, see tfidf_matrix() above
t1, t2 = tfidf_matrix(vectorizer, X_train)

In [25]:
##########################################################################################
##                         M O D E L    T R A I N I N G
########################################################################################## 

#Compute cosine similarity
#Takes about 1.75 minutes to run
newX = cosine_similarity_vectors(t1,t2)

x = pd.DataFrame(newX)


#Sentiment analysis to determine polarity of titles
#Titles are read from X_train, the frame t1/t2 were computed from, and the
#values are attached by position so every sentiment row stays in step with
#its cosine-similarity row even if X_train's index is not 0..n-1
x['t1_polarity'] = X_train.title1_en.apply(getPolarity).to_numpy()
x['t1_subjectivity'] = X_train.title1_en.apply(getSubjectivity).to_numpy()

x['t2_polarity'] = X_train.title2_en.apply(getPolarity).to_numpy()
x['t2_subjectivity'] = X_train.title2_en.apply(getSubjectivity).to_numpy()

#NOTE(review): the similarity column is labelled with the integer 0 while the
#sentiment columns have string labels; recent scikit-learn releases appear to
#reject mixed-type column names - verify, and rename here and in
#validationNewModel() together if needed

  


In [26]:
##### C R E A T E   M O D E L S #####

# Balanced multinomial logistic regression, created and fit in one expression
# (fit() returns the estimator itself)
logistic_regressionMN_Balanced_clf = LogisticRegression(
    class_weight='balanced',
    multi_class="multinomial",
).fit(x, y_train)



LogisticRegression(class_weight='balanced', multi_class='multinomial')

In [27]:
####### P R E D I C T I O N S    F O R    T E S T    D A T A ######

# Build the same features for the cleaned test set and predict its labels
y_pred_logRegMN_Balanced = validationNewModel(
    vectorizer=vectorizer,
    xvalidate=test_clean,
    clf=logistic_regressionMN_Balanced_clf,
)


  


In [39]:
# Pair each test id with its predicted label
predictions = test_clean.id.to_frame()
predictions['label'] = y_pred_logRegMN_Balanced

In [40]:
# Preview the first five predictions
predictions.head(5)

Unnamed: 0,id,label
0,256442,unrelated
1,256443,disagreed
2,256444,unrelated
3,256445,unrelated
4,256446,unrelated


In [41]:
# Write the id/label pairs without the dataframe index
predictions.to_csv(path_or_buf="submission.csv", index=False)