# Word2Vec for Text Classification

Code notebook for TAHLR Working Group (Spring 2024) based on:  

- Vajjala, S., Majumder, B., Gupta, A., and Surana, H. 2020. *Practical Natural Language Processing: A Comprehensive Guide to Building Real-World NLP Systems*. Sebastopol, CA: O’Reilly Media.

More info on book here: https://www.oreilly.com/library/view/practical-natural-language/9781492054047/

**Overview:** In this short notebook, we will see an example of how to use a pre-trained Word2vec model for doing feature extraction and performing text classification.

In [1]:
# installs

# !pip install numpy==1.19.5
# !pip install pandas==1.1.5
# !pip install gensim==3.8.3
# !pip install wget==3.2
# !pip install nltk==3.5
# !pip install scikit-learn==0.21.3
# !pip install fasttext-wheel


In [2]:
#basic imports
import warnings
warnings.filterwarnings('ignore')
import os
import urllib.request
import gzip
import shutil
from time import time


#pre-processing imports
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pjb311/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pjb311/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data and Model

In [3]:
# Get files

path = 'data/4b'

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(path, exist_ok=True)

files = [
    "https://raw.githubusercontent.com/practical-nlp/practical-nlp-code/master/Ch4/Data/sentiment%20labelled%20sentences/amazon_cells_labelled.txt",
    "https://raw.githubusercontent.com/practical-nlp/practical-nlp-code/master/Ch4/Data/sentiment%20labelled%20sentences/imdb_labelled.txt",
    "https://raw.githubusercontent.com/practical-nlp/practical-nlp-code/master/Ch4/Data/sentiment%20labelled%20sentences/yelp_labelled.txt",
]

# Download each file only when it is not already on disk, so that
# "Restart & Run All" does not hit the network again.
downloaded_paths = []
for file in files:
    file_name = file.split("/")[-1]
    local_path = f"{path}/{file_name}"
    if not os.path.exists(local_path):
        urllib.request.urlretrieve(file, local_path)
    downloaded_paths.append(local_path)

# Concatenate the three files, in list order, into one corpus file.
# This is a byte-for-byte equivalent of `cat a b c > sentiment_sentences.txt`
# that also works where `cat` is unavailable and follows `path`.
with open(f"{path}/sentiment_sentences.txt", 'wb') as combined:
    for local_path in downloaded_paths:
        with open(local_path, 'rb') as part:
            shutil.copyfileobj(part, combined)

In [4]:
# Get model
# Download "slim" version of model

model_url = "https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz"

if not os.path.exists('models/GoogleNews-vectors-negative300-SLIM.bin.gz'):
    !curl -L $model_url -o models/GoogleNews-vectors-negative300-SLIM.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  263M  100  263M    0     0  6866k      0  0:00:39  0:00:39 --:--:-- 6731k 263M   63  166M    0     0  6082k      0  0:00:44  0:00:27  0:00:17 10.1M6  176M    0     0  6226k      0  0:00:43  0:00:28  0:00:15 10.2M0  6791k      0  0:00:39  0:00:36  0:00:03 8155k


In [5]:
# Load model

from gensim.models import KeyedVectors, Word2Vec

pretrainedpath = "models/GoogleNews-vectors-negative300-SLIM.bin.gz"
w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True)
print('done loading Word2Vec')

done loading Word2Vec


In [6]:
#Inspect the model

word2vec_vocab = w2v_model.key_to_index
word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]
print(len(word2vec_vocab))

299567


In [7]:
# Read data
# Each line of the file holds one sentence and its integer sentiment label,
# separated by a tab.

training_data_path = "data/4b/sentiment_sentences.txt"

texts = []
cats = []

# Decode explicitly instead of relying on the platform default encoding
# (assumes the downloaded files are UTF-8 -- TODO confirm).
with open(training_data_path, encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines instead of failing on the tuple unpacking below.
        if not line.strip():
            continue
        text, sentiment = line.split('\t')
        texts.append(text.strip())
        cats.append(int(sentiment.strip()))

# Labels as a NumPy array; the texts stay a plain list of strings.
cats = np.array(cats)

In [8]:
#Inspect the dataset

print(len(cats), len(texts))

3000 3000


In [9]:
texts[:5]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.']

In [10]:
cats[:5]

array([0, 1, 1, 0, 1])

In [11]:
#preprocess the text.
def preprocess_corpus(texts):
    """Tokenize and filter every text in ``texts``.

    Each text is split with NLTK's ``word_tokenize``. A token is dropped when
    its lowercase form is an English stopword, when it consists only of
    digits, or when it occurs inside ``string.punctuation``. Surviving tokens
    are lowercased.

    Returns a list with one list of tokens per input text, in input order.
    """
    english_stops = set(stopwords.words("english"))

    def remove_stops_digits(tokens):
        # Keep the lowercase form of every token that passes all three filters.
        kept = []
        for raw_token in tokens:
            lowered = raw_token.lower()
            if lowered in english_stops or raw_token.isdigit() or raw_token in punctuation:
                continue
            kept.append(lowered)
        return kept

    processed = []
    for text in texts:
        processed.append(remove_stops_digits(word_tokenize(text)))
    return processed

texts_processed = preprocess_corpus(texts)
print(len(cats), len(texts_processed))
print(texts_processed[1])
print(cats[1])

3000 3000
['good', 'case', 'excellent', 'value']
1


## Create w2v representations of texts

In [12]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists, model=None, dimension=300):
    """Represent each tokenized text by the mean of its word embeddings.

    Parameters
    ----------
    list_of_lists : iterable of lists of str
        One list of tokens per text.
    model : mapping-like, optional
        Object supporting ``token in model`` and ``model[token]``.
        Defaults to the notebook-level ``w2v_model``.
    dimension : int, optional
        Length of the embedding vectors (300 by default).

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``dimension`` per text. Tokens missing from the
        model are ignored; a text with no known token gets a zero vector.
    """
    if model is None:
        model = w2v_model
    feats = []
    for tokens in list_of_lists:
        # A fresh accumulator per text, so no array is shared between rows.
        feat_for_this = np.zeros(dimension)
        count_for_this = 0
        for token in tokens:
            if token in model:
                feat_for_this += model[token]
                count_for_this += 1
        if count_for_this:
            # Exact mean: divide by the true count, with no epsilon bias.
            feats.append(feat_for_this / count_for_this)
        else:
            # Nothing was accumulated, so the accumulator is already all zeros.
            feats.append(feat_for_this)
    return feats

# # Refactor from book
# def embedding_feats(list_of_lists):
#     feats = []
#     for tokens in list_of_lists:
#         if tokens:
#             feat_for_this= np.mean([w2v_model[token] for token in tokens if token in w2v_model], axis=0)
#         else:
#             feat_for_this = np.zeros(300)
#         feats.append(feat_for_this)
#     return feats

train_vectors = embedding_feats(texts_processed)
print(len(train_vectors))

3000


## Classification with w2v

In [13]:
# Take any classifier (LogisticRegression here) and train/test it like before.
classifier = LogisticRegression(random_state=1234)
# Seed the split too: without random_state the train/test partition changes on
# every run, so the scores below would not be reproducible.
train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats, random_state=1234)
classifier.fit(train_data, train_cats)
print("Accuracy: ", classifier.score(test_data, test_cats))
# Per-class precision/recall/F1 on the held-out split.
preds = classifier.predict(test_data)
print(classification_report(test_cats, preds))

Accuracy:  0.8186666666666667
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       371
           1       0.82      0.83      0.82       379

    accuracy                           0.82       750
   macro avg       0.82      0.82      0.82       750
weighted avg       0.82      0.82      0.82       750



## Data and model (fastText experiment)

In [14]:
# Get files

files = ["https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz"]

for file in files:
    file_name = file.split("/")[-1]
    urllib.request.urlretrieve(file, f"{path}/{file_name}")

!tar -xvf data/4b/dbpedia_csv.tar.gz -C data/4b

x dbpedia_csv/
x dbpedia_csv/test.csv
x dbpedia_csv/classes.txt
x dbpedia_csv/train.csv
x dbpedia_csv/readme.txt


In [15]:
# Loading data

data_path = 'data/4b'

# The CSV files have no header row, so the column names are supplied here.
column_names = ['class','name','description']

# Train split
train_file = f"{data_path}/dbpedia_csv/train.csv"
df = pd.read_csv(train_file, header=None, names=column_names)

# Test split
test_file = f"{data_path}/dbpedia_csv/test.csv"
df_test = pd.read_csv(test_file, header=None, names=column_names)

# Report the (rows, columns) shape of both frames
print(f"Train:{df.shape} Test:{df_test.shape}")

Train:(560000, 3) Test:(70000, 3)


In [16]:
# Map classes

# Since we have no clue about the classes lets build one
# Mapping from class number to class name
class_dict={
            1:'Company',
            2:'EducationalInstitution',
            3:'Artist',
            4:'Athlete',
            5:'OfficeHolder',
            6:'MeanOfTransportation',
            7:'Building',
            8:'NaturalPlace',
            9:'Village',
            10:'Animal',
            11:'Plant',
            12:'Album',
            13:'Film',
            14:'WrittenWork'
        }

# Mapping the classes
df['class_name'] = df['class'].map(class_dict)
df.head()

Unnamed: 0,class,name,description,class_name
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...,Company
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...,Company
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...,Company
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...,Company
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...,Company


In [17]:
# Inspect data

df["class_name"].value_counts()

class_name
Company                   40000
EducationalInstitution    40000
Artist                    40000
Athlete                   40000
OfficeHolder              40000
MeanOfTransportation      40000
Building                  40000
NaturalPlace              40000
Village                   40000
Animal                    40000
Plant                     40000
Album                     40000
Film                      40000
WrittenWork               40000
Name: count, dtype: int64

In [18]:
# Lets do some cleaning of this text
def clean_it(text,normalize=True):
    """Return a lowercased, punctuation-spaced copy of ``text``.

    ``text`` is converted with ``str()``. Commas, colons and semicolons become
    spaces, double quotes are removed, and ``' . ( ) ! ?`` are padded with a
    space on each side. The result is then lowercased.

    When ``normalize`` is true the string is additionally NFKD-normalized and
    every character that cannot be encoded as ASCII is dropped, so accented
    letters lose their accent.
    """
    # Local import keeps the function usable without touching the import cell.
    import unicodedata

    # The replacement list can be extended or reduced as needed.
    s = (
        str(text)
        .replace(',', ' ')
        .replace('"', '')
        .replace('\'', ' \' ')
        .replace('.', ' . ')
        .replace('(', ' ( ')
        .replace(')', ' ) ')
        .replace('!', ' ! ')
        .replace('?', ' ? ')
        .replace(':', ' ')
        .replace(';', ' ')
        .lower()
    )

    if normalize:
        # A plain str has no .normalize()/.str accessor (those belong to
        # pandas Series), so use the stdlib equivalent.
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('utf-8')

    return s

# Now lets define a small function where we can use above cleaning on datasets
def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__', random_state=None):
    """Build a frame with ``name``, ``description`` and a prefixed ``class`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``class``, ``name`` and ``description``.
    cleanit : bool
        When true, ``clean_it`` is applied to ``name`` and ``description``.
    shuffleit : bool
        When true, the rows are returned in random order with a fresh index.
    encodeit : bool
        Passed to ``clean_it`` as its ``normalize`` argument.
    label_prefix : str
        Text placed in front of the class value; a single space is appended.
    random_state : optional
        Passed to ``DataFrame.sample`` so that the shuffle can be reproduced.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    # Work on a deep copy so the caller's frame is left untouched.
    df = data[['name','description']].copy(deep=True)
    df['class'] = label_prefix + data['class'].astype(str) + ' '

    # cleaning it
    if cleanit:
        df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))
        df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))

    # shuffling it
    if shuffleit:
        # sample() returns a new frame, so the result must be assigned back.
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df

# Transform the datasets using the above clean functions
df_train_cleaned = clean_df(df, True, True)
df_test_cleaned = clean_df(df_test, True, True)

In [19]:
# Write files to disk as fastText classifier API reads files from disk.
train_file = data_path + '/dbpedia_train.csv'
df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )

test_file = data_path + '/dbpedia_test.csv'
df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )

## Run classifier using fasttext vectors

In [20]:
# Idea of subword vectors

def generate_char_ngrams(word, n):
    """Return the character n-grams of ``word`` wrapped in ``<`` and ``>``.

    The n-grams are listed left to right; the list is empty when the wrapped
    word is shorter than ``n``.
    """
    # Boundary markers let prefixes and suffixes form distinct n-grams.
    bounded = f"<{word}>"
    window_count = len(bounded) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(bounded[start:start + n])
    return grams

# Example usage
word = "amaverunt"
n = 3
ngrams = generate_char_ngrams(word, n)
print(ngrams)

['<am', 'ama', 'mav', 'ave', 'ver', 'eru', 'run', 'unt', 'nt>']


In [21]:
# %%time
# ## Using fastText for feature extraction and training
# from fasttext import train_supervised 
# """fastText expects a training file (csv) and a model name as input arguments.
# label_prefix refers to the prefix before label string in the dataset.
# default is __label__. In our dataset, it is __class__. 
# There are several other parameters which can be seen in: 
# https://pypi.org/project/fasttext/
# """
# model = train_supervised(input=train_file, label="__class__", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)

In [22]:
# for k in range(1,6):
#     results = model.test(test_file,k=k)
#     print(f"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}")

## Use pretrained FastText vectors

In [23]:
import fasttext
import gensim
import requests

# Download the pre-trained FastText vectors for Latin
# You can find other languages here: https://fasttext.cc/docs/en/crawl-vectors.html
latin_vectors_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz'
latin_vectors_path = 'models/cc.la.300.vec.gz'

# Download the vectors (if not already downloaded)
if not os.path.exists(latin_vectors_path):
    os.makedirs(os.path.dirname(latin_vectors_path), exist_ok=True)
    # Stream into a temporary name and rename only after success, so an
    # interrupted download is never mistaken for a complete cached file.
    partial_vectors_path = latin_vectors_path + '.part'
    with requests.get(latin_vectors_url, stream=True) as r:
        # Fail loudly on HTTP errors instead of saving the error page to disk.
        r.raise_for_status()
        with open(partial_vectors_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(partial_vectors_path, latin_vectors_path)

# Load the vectors using Gensim
model = gensim.models.KeyedVectors.load_word2vec_format(latin_vectors_path)

In [24]:
# Function to find nearest neighbors
def get_nearest_neighbors(word, model, top_n=10):
    """Look up the ``top_n`` entries most similar to ``word`` in ``model``.

    Returns whatever ``model.most_similar(word, topn=top_n)`` returns. If that
    call raises ``KeyError``, a message string saying the word is not in the
    vocabulary is returned instead.
    """
    try:
        similar = model.most_similar(word, topn=top_n)
    except KeyError:
        # Unknown word: hand back a readable message instead of raising.
        similar = f"The word '{word}' is not in the vocabulary."
    return similar

# Example usage
word = 'pater'  # Replace with any Latin word
neighbors = get_nearest_neighbors(word, model)

# get_nearest_neighbors returns a message string for out-of-vocabulary words.
# Unpacking that string as (neighbor, similarity) pairs would raise, so print
# the message instead.
if isinstance(neighbors, str):
    print(neighbors)
else:
    print(f"Nearest neighbors for '{word}':")
    for neighbor, similarity in neighbors:
        print(f"{neighbor}: {similarity:.4f}")

Nearest neighbors for 'pater':
pa-ter: 0.5668
avus: 0.5605
tahtll: 0.5577
filius: 0.5291
Épater: 0.4947
paterque: 0.4828
Proavus: 0.4797
mater: 0.4751
SENEX: 0.4745
avusque: 0.4713


In [25]:
# Download the FastText binary model for Latin if not already downloaded
latin_model_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz'
latin_model_path = 'models/cc.la.300.bin.gz'
uncompressed_model_path = 'models/cc.la.300.bin'

# do the download with urllib

if not os.path.exists(latin_model_path):
    os.makedirs(os.path.dirname(latin_model_path), exist_ok=True)
    # Download under a temporary name and rename only after success, so an
    # interrupted transfer is never mistaken for a complete cached archive.
    partial_model_path = latin_model_path + '.part'
    urllib.request.urlretrieve(latin_model_url, partial_model_path)
    os.replace(partial_model_path, latin_model_path)

In [26]:
# Uncompress the model if not already done
# (gzip and shutil come from the notebook's import cell.)
if not os.path.exists(uncompressed_model_path):
    with gzip.open(latin_model_path, 'rb') as f_in:
        with open(uncompressed_model_path, 'wb') as f_out:
            # Copy in chunks: f_in.read() would hold the entire decompressed
            # model in memory before writing it out.
            shutil.copyfileobj(f_in, f_out)


# Load the model
model = fasttext.load_model(uncompressed_model_path)

In [27]:
# Function to get subword vectors
def get_subword_vectors(word, model):
    """Return the subwords of ``word`` together with their input vectors.

    ``model.get_subwords(word)`` supplies the subwords and their indices;
    those indices select rows from ``model.get_input_matrix()``.

    Returns a ``(subwords, vectors)`` pair.
    """
    pieces, row_ids = model.get_subwords(word)
    embedding_table = model.get_input_matrix()
    return pieces, embedding_table[row_ids]

# Example usage
word = 'amaverunt'
subwords, subword_vectors = get_subword_vectors(word, model)

print(f"Subwords for '{word}':")
for subword in subwords:
    print(subword)

print(f"\nSubword vectors for '{word}':")
for i, vec in enumerate(subword_vectors):
    print(f"Subword: {subwords[i]} -> Vector: {vec[:10]}...")  # Printing first 10 dimensions for brevity

Subwords for 'amaverunt':
amaverunt
<amav
amave
maver
averu
verun
erunt
runt>

Subword vectors for 'amaverunt':
Subword: amaverunt -> Vector: [-0.12557602 -0.20677628  0.01814291  0.10018203 -0.03906197 -0.02906827
  0.03644218 -0.05640594  0.05151154 -0.01040968]...
Subword: <amav -> Vector: [-0.02181956 -0.06025735 -0.08099414 -0.00137296 -0.10970218 -0.08324998
 -0.00574256  0.05039195 -0.03895114 -0.08521391]...
Subword: amave -> Vector: [-0.02245609 -0.02576638 -0.03162137  0.01544662 -0.00128898 -0.05441771
  0.01074362  0.01723444 -0.05633336 -0.06492849]...
Subword: maver -> Vector: [-0.08401632  0.02202878 -0.03314929  0.0347473  -0.0603159  -0.07957301
  0.01893586  0.00751092 -0.06143296 -0.07802811]...
Subword: averu -> Vector: [ 0.05511348 -0.02324122 -0.06766613  0.03605975  0.01640362 -0.03939509
 -0.01637885 -0.04223433 -0.00637189 -0.10575694]...
Subword: verun -> Vector: [-0.00134282  0.01997117  0.04394346 -0.08678149  0.08096965 -0.05902625
 -0.04757813 -0.07660703 