# CSI 5386 - NATURAL LANGUAGE PROCESSING
## Final Project
Name: Sandeep Kumar Reddy Kadapa <br>
Student ID: 300154284 <br>
Email ID: skada089@uottawa.ca

Name: Geetika Sharma <br>
Student ID: 100993465 <br>
Email: gshar013@uottawa.ca

# Importing necessary libraries

In [1]:
from nltk.corpus import reuters
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings("ignore")

# Reuters-21578 dataset

In [2]:
print(reuters.readme())


      The Reuters-21578 benchmark corpus, ApteMod version

This is a publically available version of the well-known Reuters-21578
"ApteMod" corpus for text categorization.  It has been used in
publications like these:

 * Yiming Yang and X. Liu. "A re-examination of text categorization
   methods".  1999.  Proceedings of 22nd Annual International SIGIR.
   http://citeseer.nj.nec.com/yang99reexamination.html

 * Thorsten Joachims. "Text categorization with support vector
   machines: learning with many relevant features".  1998. Proceedings
   of ECML-98, 10th European Conference on Machine Learning.
   http://citeseer.nj.nec.com/joachims98text.html

ApteMod is a collection of 10,788 documents from the Reuters financial
newswire service, partitioned into a training set with 7769 documents
and a test set with 3019 documents.  The total size of the corpus is
about 43 MB.  It is also available for download from
http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html ,
which includ

In [3]:
#Getting training and testing data from nltk
train_documents, train_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [4]:
#number of documents of training and testing
print("Number of training documents:", len(train_documents))
print("Number of testing documents:", len(test_documents))

Number of training documents: 7769
Number of testing documents: 3019


## Preprocessing the Reuters dataset
1. we removed spaces, digits, punctuation and special characters and considering only text data
2. Calculated the top categories and considered them for training and testing data
3. Preprocess categories since some of the categories are multilabel
4. Remove stop words from the data
5. Finally creating and saving word vectors for our new data.

In [5]:
#removing spaces, digits, punctuation and special characters, considering only text data
train_documents = [re.sub('\\/|\s+', ' ', re.sub('!|\\"|\\#|\\$|\\%|\\&|\\(|\\)|\\*|\\+|\\,|\\-|\\.|\\:|\\;|\\<|\\=|\\>|\\?|\\@|\\[|\\\\|\\]|\\^|\\_|\\`|\\{|\\||\\}|\\~|\\\t|\\\n|\'|\,|[0-9]', '', doc.lower())).strip() for doc in train_documents]
test_documents = [re.sub('\\/|\s+', ' ', re.sub('!|\\"|\\#|\\$|\\%|\\&|\\(|\\)|\\*|\\+|\\,|\\-|\\.|\\:|\\;|\\<|\\=|\\>|\\?|\\@|\\[|\\\\|\\]|\\^|\\_|\\`|\\{|\\||\\}|\\~|\\\t|\\\n|\'|\,|[0-9]', '', doc.lower())).strip() for doc in test_documents]

In [6]:
#getting the count of top categories containing highest samples in the dataset
from itertools import chain
pd.Series(list(chain.from_iterable(train_categories))+list(chain.from_iterable(test_categories))).value_counts().head(10)

earn        3964
acq         2369
money-fx     717
grain        582
crude        578
trade        485
interest     478
ship         286
wheat        283
corn         237
dtype: int64

In [7]:
# We are only considering top 10 categories containing high samples
categories = ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'ship', 'wheat', 'corn']

In [8]:
# Function to preprocess the categories, because some multi categories in those situations we are considering only the first category
def preprocess_categories(categories, data_categories):
    #creating empty lists to store the categories and their index
    categories_new = []
    categories_index = []
    
    #looping thorugh the categories
    for cat in data_categories:
        #initially setting the index and category
        index=False
        category=None
        #iterating through the sub categories
        for cat2 in cat[::-1]:
            #if the category is in the list of categories store the category and make the index=True
            if cat2 in categories:
                category=cat2
                index=True
            if not category:
                category=None
        #finally append the category and whether its index for considering
        categories_index.append(index)
        categories_new.append(category)
    return categories_index, categories_new

In [9]:
#from above we get the index to consider and categories
train_categories_index, train_categories_new = preprocess_categories(categories, train_categories)
test_categories_index, test_categories_new = preprocess_categories(categories, test_categories)

In [10]:
#take the documents and categories where index is True by filters like below
train_documents = np.array(train_documents, dtype='object')[train_categories_index]
train_categories = np.array(train_categories_new)[train_categories_index]
test_documents = np.array(test_documents, dtype='object')[test_categories_index]
test_categories = np.array(test_categories_new)[test_categories_index]

In [11]:
#Now print the number of documents in the new dataset for top 10 categories
print("Number of training documents:", len(train_documents))
print("Number of testing documents:", len(test_documents))

Number of training documents: 6489
Number of testing documents: 2545


In [12]:
# tokenize and remove stopwords
stop_words_list = stopwords.words('english')
def tokenize_remove_stopwords(data):
    # create a empty list
    data_new = []
    for doc in data:
        # iterate through the documents
        doc_new = []
        # tokenize the docuemnt
        tokenized_words = word_tokenize(doc)
        for word in tokenized_words:
            #check whether the word is in stop words if not consider it
            if not word in stop_words_list:
                doc_new.append(word)
        data_new.append(doc_new)
    #finally return the data that contains no stop words
    return data_new

In [13]:
#removing the stopwords from the training and testing data
train_documents = tokenize_remove_stopwords(train_documents)
test_documents = tokenize_remove_stopwords(test_documents)

## Creating word Vectors
We considered the following embeddings for our data
1. Glove_6B_300d
2. Glove_840B_300d
3. Google_News
4. FastText_SkipGram
5. FastText_CBOW
6. LexVec_CC_300d
7. LexVec_WN_300d
8. PDC
9. HDC
10. ConceptNet_Numberbatch 

In [14]:
#importing the genism functions for creating word vectors 
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

Glove word embeddings cannot be used directly so we need to first save them in word to vector format so we are using below function to do that

In [15]:
#glove_input_file = './Glove 840B 300d/glove.840B.300d.txt'
#word2vec_output_file = './Glove 840B 300d/glove.840B.300d.txt.word2vec'
#glove2word2vec(glove_input_file, word2vec_output_file)

In [16]:
#glove_input_file = './Glove 6B 300d/glove.6B.300d.txt'
#word2vec_output_file = './Glove 6B 300d/glove.6B.300d.txt.word2vec'
#glove2word2vec(glove_input_file, word2vec_output_file)

In [17]:
#embeddings path and thier names for further analysis and accessing
embedding_list_path = ['./Glove 6B 300d/glove.6B.300d.txt.word2vec', './Glove 840B 300d/glove.840B.300d.txt.word2vec', 
                       './Google News 300d/GoogleNews-vectors-negative300.bin', 
                       './FastText SkipGram//wiki.en.vec', './FastText CBOW/cc.en.300.vec', 
                       './LexVec CC 300d/lexvec.commoncrawl.300d.W+C.pos.neg3.vectors', './LexVec WN 300d/lexvec.enwiki+newscrawl.300d.W+C.pos.vectors', 
                       './PDC/wikicorp.201004-pdc-iter-20-alpha-0.05-window-10-dim-300-neg-10-subsample-0.0001.txt', 
                       './HDC/wikicorp.201004-hdc-iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt',
                       './ConceptNet Numberbatch/numberbatch-en.txt']

embedding_names = ['Glove_6B_300d', 'Glove_840B_300d', 'Google_News', 'FastText_SkipGram', 'FastText_CBOW', 'LexVec_CC_300d', 'LexVec_WN_300d', 'PDC', 'HDC', 'ConceptNet_Numberbatch']

In [28]:
# create_word2vec_embeddings will save the embeddings that are transformed in given path
def create_word2vec_embeddings(embeddings, data, index, name):
    if name=='test':
        print('Create word vectors for testing data')
    else:
        print('Creating word vectors for training data')
    data_list_mean = []
    data_list_sum = []
    for doc in data:
        doc_list = []
        for word in doc:
            try:
                vector = embeddings[word]
                doc_list.append(vector)
            except:
                pass
        mean_vec = np.stack(doc_list, axis=1).mean(axis=1)
        sum_vec = np.stack(doc_list, axis=1).sum(axis=1)
        data_list_mean.append(mean_vec)
        data_list_sum.append(sum_vec)
    print(np.array(data_list_mean).shape)
    print(np.array(data_list_sum).shape)
    np.save(f'./embeddings/{name}_mean_'+embedding_names[index],np.array(data_list_mean))
    np.save(f'./embeddings/{name}_sum_'+embedding_names[index],np.array(data_list_sum))

In [29]:
#get_embeddings function will convert the tokenized words into their corresponding word vectors of given embedding data and also we will do both sum and mean of the entire document data
#since concating will give us 3 dimensional array and our machine learning algorithms won't work with 3 dimensional data.

def get_embeddings(train_docs, test_docs):
    #iterate through each embedding
    for index, path in enumerate(embedding_list_path):
        print(f'Loading embedding from {path}')
        
        if not 'Google' in path:
            embeddings = KeyedVectors.load_word2vec_format(path)
        else:
            embeddings = KeyedVectors.load_word2vec_format(path, binary=True)
        print('Loading complete')
        
        create_word2vec_embeddings(embeddings, train_docs, index, 'train')
        create_word2vec_embeddings(embeddings, test_docs, index, 'test')
        
        print(f'Completed Word vectors for {path} embedding','\n')

In [30]:
#get_embeddings(train_documents, test_documents)

Loading embedding from ./Glove 6B 300d/glove.6B.300d.txt.word2vec
Loading complete
Creating word vectors for training data
(6489, 300)
(6489, 300)
Create word vectors for testing data
(2545, 300)
(2545, 300)
Completed Word vectors for ./Glove 6B 300d/glove.6B.300d.txt.word2vec embedding 

Loading embedding from ./Glove 840B 300d/glove.840B.300d.txt.word2vec
Loading complete
Creating word vectors for training data
(6489, 300)
(6489, 300)
Create word vectors for testing data
(2545, 300)
(2545, 300)
Completed Word vectors for ./Glove 840B 300d/glove.840B.300d.txt.word2vec embedding 

Loading embedding from ./Google News 300d/GoogleNews-vectors-negative300.bin
Loading complete
Creating word vectors for training data
(6489, 300)
(6489, 300)
Create word vectors for testing data
(2545, 300)
(2545, 300)
Completed Word vectors for ./Google News 300d/GoogleNews-vectors-negative300.bin embedding 

Loading embedding from ./FastText SkipGram//wiki.en.vec
Loading complete
Creating word vectors for t

## Modelling and Results

In [31]:
#importing all necessary functions from s
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

#inherently multiclass
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier

# One-Vs-One
from sklearn.svm import SVC

#One-Vs-Rest
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score
from sklearn.preprocessing import LabelEncoder

In [32]:
#creating dictionary of algorithms we test
def create_model_dict():

    models = {'GaussianNB':GaussianNB(), 
              'BernoulliNB':BernoulliNB(), 
              'DecisionTree':DecisionTreeClassifier(), 
              'KNeighbors':KNeighborsClassifier(), 
              'MLPClassifier':MLPClassifier(hidden_layer_sizes=(100, 50, 10)), 
              'RandomForest':RandomForestClassifier(), 
              'ExtraTrees':ExtraTreesClassifier(), 
              'SVC':SVC(),
              'LogisticRegression':LogisticRegression()}
    
    return models

In [33]:
#converting the categorical values into numerical values using LabelEncoder from sklearn
LE = LabelEncoder()
y_train = LE.fit_transform(train_categories)
y_test = LE.transform(test_categories)

In [34]:
#get_data function will read and return the training and testing data of sum and mean of vectors we saved above
def get_data(name):
    #creating names of the files
    train_sum_file_name = './embeddings/train_sum_'+name+'.npy'
    train_mean_file_name = './embeddings/train_mean_'+name+'.npy'
    test_sum_file_name = './embeddings/test_sum_'+name+'.npy'
    test_mean_file_name = './embeddings/test_mean_'+name+'.npy'
    #loading saved numpy vectors
    X_train_sum = np.load(train_sum_file_name)
    X_train_mean = np.load(train_mean_file_name)
    X_test_sum = np.load(test_sum_file_name)
    X_test_mean = np.load(test_mean_file_name)
    return X_train_sum, X_train_mean, X_test_sum, X_test_mean

In [35]:
#get_model_scores will calculate the performance metrics of the model for each algorithm
def get_model_scores(X_train, y_train, X_test, y_test):
    #creating empty list to store the metric values
    all_metrics=[]
    #creating dictionary of algorithms for analysis
    models = create_model_dict()
    #calculating the performance for each model
    for model_name, model in models.items():
        #since not all algorithms cannot be run directly using same method so created appropriate wrapper function
        print(f'Runnning {model_name} algorithm')
        metrics = []
        if not model_name in ['SVC', 'GradientBoosting', 'LogisticRegression']:
            classifier = model
        elif model_name == 'SVC':
            classifier = OneVsOneClassifier(model)
        else:
            classifier = OneVsRestClassifier(model)
            
        classifier.fit(X_train, y_train)
        #calculating the accuracy, precision, f1_score, recall
        metrics.append(accuracy_score(classifier.predict(X_test), y_test))
        metrics.append(precision_score(classifier.predict(X_test), y_test, average='macro'))
        metrics.append(f1_score(classifier.predict(X_test), y_test, average='macro'))
        metrics.append(recall_score(classifier.predict(X_test), y_test, average='macro'))
        all_metrics.append(metrics)
        
    return all_metrics

In [36]:
#calculating the metrics for all combinations of Algorithms as well as embeddings
final_sum_models = []
final_mean_models = []
for embedding_name in embedding_names:
    print(f'Runnning {embedding_name} embedding')
    X_train_sum, X_train_mean, X_test_sum, X_test_mean = get_data(embedding_name)
    final_sum_models.append(get_model_scores(X_train_sum, y_train, X_test_sum, y_test))
    final_mean_models.append(get_model_scores(X_train_mean, y_train, X_test_mean, y_test))
    print(f'{embedding_name} completed', '\n')

Runnning Glove_6B_300d embedding
Runnning GaussianNB algorithm
Runnning BernoulliNB algorithm
Runnning DecisionTree algorithm
Runnning KNeighbors algorithm
Runnning MLPClassifier algorithm
Runnning RandomForest algorithm
Runnning ExtraTrees algorithm
Runnning SVC algorithm
Runnning LogisticRegression algorithm
Runnning GaussianNB algorithm
Runnning BernoulliNB algorithm
Runnning DecisionTree algorithm
Runnning KNeighbors algorithm
Runnning MLPClassifier algorithm
Runnning RandomForest algorithm
Runnning ExtraTrees algorithm
Runnning SVC algorithm
Runnning LogisticRegression algorithm
Glove_6B_300d completed 

Runnning Glove_840B_300d embedding
Runnning GaussianNB algorithm
Runnning BernoulliNB algorithm
Runnning DecisionTree algorithm
Runnning KNeighbors algorithm
Runnning MLPClassifier algorithm
Runnning RandomForest algorithm
Runnning ExtraTrees algorithm
Runnning SVC algorithm
Runnning LogisticRegression algorithm
Runnning GaussianNB algorithm
Runnning BernoulliNB algorithm
Runnning

In [37]:
df_sum_models = pd.concat([pd.DataFrame(np.array(final_sum_models)[i, :, :], columns=['Accuracy', 'Precision', 'F1_score', 'Recall'],
                                        index=list(create_model_dict().keys()))\
                           .reset_index().rename(columns={'index':'Algorithm'})\
                           .merge(pd.Series([embedding_names[i]]*9, name='Embedding'), 
                                  left_index=True, right_index=True) for i in range(10)], 
                          ignore_index=True)

In [38]:
df_sum_models.head(9)

Unnamed: 0,Algorithm,Accuracy,Precision,F1_score,Recall,Embedding
0,GaussianNB,0.640472,0.425184,0.422551,0.51891,Glove_6B_300d
1,BernoulliNB,0.866012,0.682701,0.648979,0.651998,Glove_6B_300d
2,DecisionTree,0.79057,0.526497,0.528317,0.537312,Glove_6B_300d
3,KNeighbors,0.899411,0.751334,0.759949,0.774177,Glove_6B_300d
4,MLPClassifier,0.9222,0.799785,0.805708,0.832047,Glove_6B_300d
5,RandomForest,0.910806,0.760101,0.778731,0.826032,Glove_6B_300d
6,ExtraTrees,0.905305,0.741856,0.767746,0.833739,Glove_6B_300d
7,SVC,0.920629,0.767278,0.781157,0.819783,Glove_6B_300d
8,LogisticRegression,0.921022,0.822466,0.821033,0.823424,Glove_6B_300d


In [39]:
df_sum_models['Mean_score'] = df_sum_models.mean(axis=1)
df_sum_models['Rank'] = df_sum_models['Mean_score'].rank(ascending=False)
df_sum_models = df_sum_models.sort_values(by='Rank').reset_index(drop=True)
df_sum_models = df_sum_models.loc[:, ['Embedding', 'Algorithm','Accuracy', 'Precision', 'F1_score', 'Recall', 'Mean_score', 'Rank']]

In [40]:
df_sum_models.to_csv('models_sum.csv', index=False)

In [41]:
df_mean_models = pd.concat([pd.DataFrame(np.array(final_mean_models)[i, :, :], columns=['Accuracy', 'Precision', 'F1_score', 'Recall'], 
                                         index=list(create_model_dict().keys()))\
                            .reset_index().rename(columns={'index':'Algorithm'})\
                            .merge(pd.Series([embedding_names[i]]*9, name='Embedding'), 
                                   left_index=True, right_index=True) for i in range(10)], 
                           ignore_index=True)

In [42]:
df_mean_models.head(9)

Unnamed: 0,Algorithm,Accuracy,Precision,F1_score,Recall,Embedding
0,GaussianNB,0.862083,0.758189,0.706449,0.705963,Glove_6B_300d
1,BernoulliNB,0.866012,0.682701,0.648979,0.651998,Glove_6B_300d
2,DecisionTree,0.808251,0.609814,0.606901,0.607175,Glove_6B_300d
3,KNeighbors,0.915128,0.770038,0.77984,0.807331,Glove_6B_300d
4,MLPClassifier,0.926523,0.819634,0.82572,0.846252,Glove_6B_300d
5,RandomForest,0.917878,0.777735,0.796195,0.838551,Glove_6B_300d
6,ExtraTrees,0.916306,0.77492,0.794646,0.835383,Glove_6B_300d
7,SVC,0.939489,0.842915,0.854771,0.878358,Glove_6B_300d
8,LogisticRegression,0.936346,0.839541,0.851763,0.877172,Glove_6B_300d


In [43]:
df_mean_models['Mean_score'] = df_mean_models.mean(axis=1)
df_mean_models['Rank'] = df_mean_models['Mean_score'].rank(ascending=False)
df_mean_models = df_mean_models.sort_values(by='Rank').reset_index(drop=True)
df_mean_models = df_mean_models.loc[:, ['Embedding', 'Algorithm','Accuracy', 'Precision', 'F1_score', 'Recall', 'Mean_score', 'Rank']]

In [44]:
df_mean_models.to_csv('models_mean.csv', index=False)