# Word2Vec for Text Classification
We will use the Sentiment Labelled Sentences dataset from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

The dataset consists of 1500 positive and 1500 negative sentiment sentences from Amazon, Yelp, and IMDB.

In [10]:
#basic imports
import warnings
warnings.filterwarnings('ignore')
import os
import wget
import gzip
import shutil
from time import time

#pre-processing imports
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /home/rachel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rachel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Download and load the pre-trained embedding model. We will use the Google News vectors:

In [28]:
path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'

if not os.path.exists(path_to_model):
    !mkdir DATAPATH
    !wget -P DATAPATH https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
    !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz

#Load W2V model. This will take some time. 
%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
print('done loading Word2Vec')

CPU times: user 9.31 s, sys: 32.9 s, total: 42.3 s
Wall time: 42.4 s
done loading Word2Vec


Download the data:

In [48]:
training_data_path = f"Data/sentiment_sentences.txt"
if not os.path.exists(training_data_path):
    github_prefix = 'https://github.com/practical-nlp/practical-nlp-code/blob/master/Ch4/Data/sentiment%20labelled%20sentences'
    !mkdir Data
    !wget -P Data {github_prefix}/amazon_cells_labelled.txt?raw=True
    !wget -P Data {github_prefix}/imdb_labelled.txt?raw=True
    !wget -P Data {github_prefix}/yelp_labelled.txt?raw=True
    file = open(fil, 'w')
    file.close()
    filenames = ['amazon_cells_labelled.txt?raw=True', 'imdb_labelled.txt?raw=True', 'yelp_labelled.txt?raw=True']
    with open(training_data_path, 'w') as outfile:
        for fname in filenames:
            with open(f'Data/{fname}') as infile:
                outfile.write(infile.read())
    print("File created")
else:
    print("File already exists")

File already exists


Load the data

In [62]:
#the file consists of tab separated sentences and categories {1: positive, 0:negative}
texts = []
cats = []
# The context manager closes the file handle once all lines are read.
with open(training_data_path) as fh:
    for line in fh:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the unpacking below
        text, sentiment = line.split("\t")
        texts.append(text)
        # The label is the first character after the tab; the rest is the line ending.
        cats.append(int(sentiment[:1]))
#Inspect the dataset
print(len(cats), len(texts))
list(zip(texts[:5], cats[:5]))

3000 3000


[('So there is no way for me to plug it in here in the US unless I go by a converter.',
  0),
 ('Good case, Excellent value.', 1),
 ('Great for the jawbone.', 1),
 ('Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
  0),
 ('The mic is great.', 1)]

In [63]:
#preprocess the text.
def preprocess_corpus(texts):
    """Tokenize every text and strip out uninformative tokens.

    Each text is split with NLTK's ``word_tokenize``. A token is dropped when
    its lowercase form is an English stopword, when it consists only of
    digits, or when it occurs as a substring of ``string.punctuation`` (in
    practice: single punctuation characters). Surviving tokens are lowercased.

    Args:
        texts: iterable of raw sentence strings.

    Returns:
        A list with one list of cleaned, lowercased tokens per input text.
    """
    english_stops = set(stopwords.words("english"))

    def remove_stops_digits(tokens):
        # Keep the lowercased form of every token that passes all three filters.
        kept = []
        for token in tokens:
            lowered = token.lower()
            if lowered in english_stops or token.isdigit() or token in punctuation:
                continue
            kept.append(lowered)
        return kept

    processed = []
    for text in texts:
        processed.append(remove_stops_digits(word_tokenize(text)))
    return processed

# Clean the whole corpus, then spot-check one processed sentence against its label.
texts_processed = preprocess_corpus(texts)
print(len(cats), len(texts_processed))
print(texts_processed[1], cats[1], sep="\n")

3000 3000
['good', 'case', 'excellent', 'value']
1


In [64]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists, model=None, dimension=300):
    """Represent each tokenized sentence as the mean of its word embeddings.

    Args:
        list_of_lists: iterable of token lists, one list per sentence.
        model: embedding lookup supporting ``token in model`` and
            ``model[token]``. When omitted, the notebook-level ``w2v_model``
            is used.
        dimension: length of every embedding vector (300 by default).

    Returns:
        A list with one NumPy array of length ``dimension`` per sentence: the
        average of the embeddings of the tokens found in ``model``. A sentence
        with no known tokens (including an empty one) gets an all-zero vector.
    """
    if model is None:
        model = w2v_model
    feats = []
    for tokens in list_of_lists:
        total = np.zeros(dimension)
        known = 0
        for token in tokens:
            if token in model:
                total += model[token]
                known += 1
        # Divide by the exact token count (no epsilon, which skewed every mean).
        # With no known tokens `total` is still all zeros and is returned as is,
        # so there is no division by zero and each sentence gets its own array.
        feats.append(total / known if known else total)
    return feats


# Turn every preprocessed sentence into its averaged-embedding feature vector.
train_vectors = embedding_feats(list_of_lists=texts_processed)
print(len(train_vectors))

3000


In [68]:
#Take any classifier (LogisticRegression here), and train/test it like before.
classifier = LogisticRegression(random_state=42)
# Seed the split as well: without random_state the train/test partition, and
# therefore every metric printed below, changes from one run to the next.
train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats, random_state=42)
classifier.fit(train_data, train_cats)
print("Accuracy: ", classifier.score(test_data, test_cats))
preds = classifier.predict(test_data)
print(classification_report(test_cats, preds))

Accuracy:  0.8266666666666667
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       392
           1       0.81      0.83      0.82       358

    accuracy                           0.83       750
   macro avg       0.83      0.83      0.83       750
weighted avg       0.83      0.83      0.83       750

