# Assignment 3 - Vectorizers
## Apply of Glove & Word2Vec Embeddings on dataset

### Import the necessary libraries

In [18]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors

### Download the data
Dataset: 
https://raw.githubusercontent.com/subashgandyer/datasets/main/deepnlp_classification_data.zip

### Load the train and test data

In [19]:
! wget https://raw.githubusercontent.com/subashgandyer/datasets/main/deepnlp_classification_data.zip

--2023-05-18 23:43:01--  https://raw.githubusercontent.com/subashgandyer/datasets/main/deepnlp_classification_data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1373119 (1.3M) [application/zip]
Saving to: 'deepnlp_classification_data.zip.2'

     0K .......... .......... .......... .......... ..........  3% 1.66M 1s
    50K .......... .......... .......... .......... ..........  7% 32.0M 0s
   100K .......... .......... .......... .......... .......... 11% 1.82M 0s
   150K .......... .......... .......... .......... .......... 14% 5.44M 0s
   200K .......... .......... .......... .......... .......... 18% 2.71M 0s
   250K .......... .......... .......... .......... .......... 22% 34.8M 0s
   300K .......... .......... .......... .......

In [20]:
import zipfile

zip_file_path = "deepnlp_classification_data.zip"

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the contents of the zip file
    zip_ref.extractall()

In [21]:
train_file = "r8-train-all-terms.txt"
with open(train_file, "r") as train_f:
    train_data = train_f.readlines()

test_file = "r8-test-all-terms.txt"
with open(test_file, "r") as test_f:
    test_data = test_f.readlines()

In [22]:
import pandas as pd
train = pd.read_csv('r8-train-all-terms.txt',header=None,sep='\t')
test = pd.read_csv('r8-test-all-terms.txt',header=None,sep='\t')

In [23]:
train.head()

Unnamed: 0,0,1
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [24]:
test.head()

Unnamed: 0,0,1
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,acq,sumitomo bank aims at quick recovery from merg...
4,earn,amatil proposes two for five bonus share issue...


In [25]:
train.columns = ['word','related']
test.columns = ['word','related']

# 1. Glove Vectorizer

### Create a GloveVectorizer Class
- __init__
- transform
- fit_transform

In [26]:
class GloveVectorizer:
  def __init__(self):
    # load in pre-trained word vectors
    print('Loading word vectors from Glove...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open('glove.6B.50d.txt') as f:
      # is just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {v:k for k,v in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    pass

  def transform(self, data):
    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.lower().split()
      vecs = []
      for word in tokens:
        if word in self.word2vec:
          vec = self.word2vec[word]
          vecs.append(vec)
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

### Create a Glove Vectorizer object

In [27]:
glove=GloveVectorizer()

Loading word vectors from Glove...
Found 400000 word vectors.


### Apply vectorization on Training and Test data

In [28]:
train.head()

Unnamed: 0,word,related
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [29]:
Xtrain = glove.fit_transform(train.related)
Ytrain = train.word

Xtest = glove.transform(test.related)
Ytest = test.word

Numer of samples with no words found: 0 / 5485
Numer of samples with no words found: 0 / 2189


### Create the model, train it, print scores

In [30]:
rf=RandomForestClassifier()
rf.fit(Xtrain,Ytrain)

rf.score(Xtrain,Ytrain)

0.9992707383773929

### Evaluate the model

In [31]:
rf.score(Xtest,Ytest)

0.9314755596162632

# 2. Word2Vec Vectorizer

### Google News Vector Model
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

## Create a Word2VecVectorizer Class
- __init__
- transform
- fit_transform

In [32]:
class Word2VecVectorizer:
  def __init__(self):
    print("Loading word vectors from Word2Vec...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      'GoogleNews-vectors-negative300.bin',
      binary=True
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of vectors
    v = self.word_vectors.get_vector('king')
    self.D = v.shape[0]

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      m = 0
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

### Create a Word2Vec Vectorizer object

In [33]:
w2v=Word2VecVectorizer()

Loading word vectors from Word2Vec...
Finished loading in word vectors


### Apply vectorization of training and test data

In [34]:
xtrain = w2v.fit_transform(train.related)
ytrain = train.word

xtest = w2v.transform(test.related)
ytest = test.word

Numer of samples with no words found: 0 / 5485
Numer of samples with no words found: 0 / 2189


### Create a model

In [35]:
rfinw2v=RandomForestClassifier()
rfinw2v.fit(xtrain,ytrain)

rfinw2v.score(xtrain,ytrain)

0.9992707383773929

### Evaluate the model

In [36]:
rfinw2v.score(xtest,ytest)

0.9387848332571951

### Insights
- Which Vectorizer is better for this dataset with RandomForest as Algorithm?
- Which is the best Vectorizer when trying out different algorithms like SVM, NB, Logistic Regression?


1. Word2Vec Vectorizer with GoogleNews-vectors-negative300 model for me is better for this dataset with randomforest algorithm. 

2. 

In [37]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [38]:
svcglove=SVC(kernel='rbf', class_weight='balanced')
svcglove.fit(Xtrain,Ytrain)

svcglove.score(Xtest,Ytest)

0.9396984924623115

In [39]:
knnglove = KNeighborsClassifier()
knnglove.fit(Xtrain,Ytrain)

knnglove.score(Xtest,Ytest)

0.9328460484239379

In [40]:
nbglove = GaussianNB()
nbglove.fit(Xtrain,Ytrain)

nbglove.score(Xtest,Ytest)

0.8720877112836912

In [41]:
svcw2v=SVC(kernel='rbf', class_weight='balanced')
svcw2v.fit(xtrain,ytrain)

svcw2v.score(xtest,ytest)

0.9671082686158063

In [42]:
knnw2v = KNeighborsClassifier()
knnw2v.fit(xtrain,ytrain)

knnw2v.score(xtest,ytest)

0.94792142530836

In [43]:
nbw2v = GaussianNB()
nbw2v.fit(xtrain,ytrain)

nbw2v.score(xtest,ytest)

0.8547281863864779

I tried Word2vec vectorizer and Glove vectorizer, with SVM, Gaussian Naive Bayes, and KNN algorithms. 

Word2vec with SVM performs the best for me, and Word2vec with Gaussian Naive Bayes performs the worst for me. 