# Module 9: Word2Vec

* Grace Lyons 
* kat3ac 

# Set Up

In [None]:
# Colab-only setup: expose Google Drive under /content/drive.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

# Directory prefix for this module's input files; the trailing slash matters
# because file names are appended to it with plain string concatenation.
data_dir = '/content/drive/My Drive/DS_5001/MOD9/'

Mounted at /content/drive


## Configuration

In [None]:
# OHCO: the ordered index levels of the token table, from book down to token.
OHCO = [
    'book_id',
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

# BAG: the index levels that delimit one "document" handed to Word2Vec.
# Everything down to and including 'para_num' gives paragraph-sized bags;
# use OHCO[:4] (down to 'sent_num') instead for sentence-sized bags.
BAG = OHCO[:OHCO.index('para_num') + 1]

# Context window size passed to Word2Vec.
window = 5

## Imports

In [None]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
!pip install plotly_express
import plotly_express as px

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [None]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): the visible cells plot only with plotly_express, so this magic
# may be unnecessary — confirm before removing.
%matplotlib inline

# Process

## Import TOKENS and convert to a corpus for Gensim

We import data from the TOKEN table of the novels corpus and attach each book's author from the LIB table. Proper nouns are excluded later, when the Gensim corpora are built.

In [None]:
# Read the token table from the module data folder.
TOKENS = pd.read_csv(
    data_dir + 'TOKEN2.csv',
)

In [None]:
# Preview the first five rows of TOKENS.
TOKENS.head(n=5)

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,term_id
0,158,1,1,0,0,"('Emma', 'NNP')",NNP,Emma,emma,11614
1,158,1,1,0,1,"('Woodhouse', 'NNP')",NNP,Woodhouse,woodhouse,39340
2,158,1,1,0,3,"('handsome', 'NN')",NN,handsome,handsome,15924
3,158,1,1,0,5,"('clever', 'NN')",NN,clever,clever,6354
4,158,1,1,0,7,"('and', 'CC')",CC,and,and,1426


In [None]:
# Load the book-level LIB table from the module data folder and preview it.
LIB = pd.read_csv(
    data_dir + 'LIB.csv',
)
LIB.head(n=5)

Unnamed: 0,book_id,book_title,book_file,author,title
0,158,"Emma, by Jane Austen",epubs/AUSTEN_JANE_EMMA-pg158.txt,austen,Emma
1,946,"Lady Susan, by Jane Austen",epubs/AUSTEN_JANE_LADY_SUSAN-pg946.txt,austen,Lady Susan
2,1212,"Love And Freindship And Other Early Works, by ...",epubs/AUSTEN_JANE_LOVE_AND_FREINDSHIP_SIC_-pg1...,austen,Love And Freindship And Other Early Works
3,141,"Mansfield Park, by Jane Austen",epubs/AUSTEN_JANE_MANSFIELD_PARK-pg141.txt,austen,Mansfield Park
4,121,"Northanger Abbey, by Jane Austen",epubs/AUSTEN_JANE_NORTHANGER_ABBEY-pg121.txt,austen,Northanger Abbey


In [None]:
# Join LIB onto the tokens by book_id (default inner join), discard the LIB
# columns that are not needed (leaving 'author'), and index the result by the
# full OHCO hierarchy.
TOKENS = (
    TOKENS
    .merge(LIB, on=['book_id'])
    .drop(columns=['book_title', 'book_file', 'title'])
    .set_index(OHCO)
)

In [None]:
# Preview the first five rows of TOKENS.
TOKENS.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,term_id,author
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
158,1,1,0,0,"('Emma', 'NNP')",NNP,Emma,emma,11614,austen
158,1,1,0,1,"('Woodhouse', 'NNP')",NNP,Woodhouse,woodhouse,39340,austen
158,1,1,0,3,"('handsome', 'NN')",NN,handsome,handsome,15924,austen
158,1,1,0,5,"('clever', 'NN')",NN,clever,clever,6354,austen
158,1,1,0,7,"('and', 'CC')",CC,and,and,1426,austen


In [None]:
# Keep only the token rows whose author is 'austen', then preview them.
AUSTEN = TOKENS.loc[TOKENS['author'] == 'austen']
AUSTEN.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,term_id,author
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
158,1,1,0,0,"('Emma', 'NNP')",NNP,Emma,emma,11614,austen
158,1,1,0,1,"('Woodhouse', 'NNP')",NNP,Woodhouse,woodhouse,39340,austen
158,1,1,0,3,"('handsome', 'NN')",NN,handsome,handsome,15924,austen
158,1,1,0,5,"('clever', 'NN')",NN,clever,clever,6354,austen
158,1,1,0,7,"('and', 'CC')",CC,and,and,1426,austen


In [None]:
# Keep only the token rows whose author is 'melville', then preview them.
MELVILLE = TOKENS.loc[TOKENS['author'].eq('melville')]
MELVILLE.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,term_id,author
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15422,1,0,0,0,"('THE', 'DT')",DT,THE,the,34777,melville
15422,1,0,0,1,"('BIRTHPLACE', 'NNP')",NNP,BIRTHPLACE,birthplace,3662,melville
15422,1,0,0,2,"('OF', 'NNP')",NNP,OF,of,23891,melville
15422,1,0,0,3,"('ISRAEL', 'NNP')",NNP,ISRAEL,israel,18985,melville
15422,1,1,0,0,"('The', 'DT')",DT,The,the,34777,melville


In [None]:
# Build the Gensim corpus for Austen: one list of term strings per BAG group,
# skipping tokens whose POS tag starts with NNP (i.e. NNP / NNPS).
corpus_a = (
    AUSTEN.loc[~AUSTEN['pos'].str.match('NNPS?')]
    .groupby(BAG)['term_str']
    .apply(lambda terms: terms.tolist())
    .tolist()
)

In [None]:
# Build the Gensim corpus for Melville: one list of term strings per BAG group,
# skipping tokens whose POS tag starts with NNP (i.e. NNP / NNPS).
corpus_m = (
    MELVILLE.loc[~MELVILLE['pos'].str.match('NNPS?')]
    .groupby(BAG)['term_str']
    .apply(lambda terms: terms.tolist())
    .tolist()
)

## Generate word embeddings with Gensim's library

In [None]:
import inspect

# gensim 4 renamed the embedding-dimension argument from ``size`` to
# ``vector_size``. Pick whichever keyword the installed Word2Vec accepts so the
# cell runs on both major versions.
_w2v_init_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
_w2v_dim_kw = 'vector_size' if 'vector_size' in _w2v_init_params else 'size'

# Shared hyper-parameters so both authors' models are trained identically:
# 246-dimensional vectors, the configured context window, and only words that
# occur at least 200 times.
# NOTE(review): gensim documents that training with more than one worker
# thread is not exactly reproducible, so embeddings may differ between runs.
_w2v_params = {_w2v_dim_kw: 246, 'window': window, 'min_count': 200, 'workers': 4}

model_a = word2vec.Word2Vec(corpus_a, **_w2v_params)
model_m = word2vec.Word2Vec(corpus_m, **_w2v_params)



## Visualize with tSNE

### Generate coordinates to plot

In [None]:
# Collect every in-vocabulary word of the Austen model with its embedding.
# gensim 4 exposes the vocabulary as ``wv.key_to_index``; gensim 3 used
# ``wv.vocab`` (removed in 4). Support both so the cell runs on either version.
_wv_a = model_a.wv
_vocab_a = _wv_a.key_to_index if hasattr(_wv_a, 'key_to_index') else _wv_a.vocab

# One row per word on a default 0..n-1 index; 'vector' holds the embedding
# returned by get_vector for that word.
coords_a = pd.DataFrame({'label': list(_vocab_a)})
coords_a['vector'] = coords_a['label'].apply(lambda word: _wv_a.get_vector(word))

In [None]:
# Preview the first five rows of coords_a.
coords_a.head(n=5)

Unnamed: 0,label,vector
0,of,"[0.3226378, 0.10830974, 0.36938196, -0.3088802..."
1,in,"[0.34731096, -0.26975662, 0.6065102, -0.030056..."
2,was,"[0.44936335, -0.31799895, 0.6100802, 0.484167,..."
3,a,"[0.07964127, 0.07697755, -0.3564742, -0.183003..."
4,man,"[-0.7001643, -0.3426114, -1.1465791, 0.6497261..."


In [None]:
# Collect every in-vocabulary word of the Melville model with its embedding.
# gensim 4 exposes the vocabulary as ``wv.key_to_index``; gensim 3 used
# ``wv.vocab`` (removed in 4). Support both so the cell runs on either version.
_wv_m = model_m.wv
_vocab_m = _wv_m.key_to_index if hasattr(_wv_m, 'key_to_index') else _wv_m.vocab

# One row per word on a default 0..n-1 index; 'vector' holds the embedding
# returned by get_vector for that word.
coords_m = pd.DataFrame({'label': list(_vocab_m)})
coords_m['vector'] = coords_m['label'].apply(lambda word: _wv_m.get_vector(word))

In [None]:
# Preview the first five rows of coords_m.
coords_m.head(n=5)

Unnamed: 0,label,vector
0,the,"[0.014728041, 0.2173266, 0.48286474, -0.256540..."
1,for,"[0.20142768, -0.50761616, -0.5495971, 0.110741..."
2,a,"[0.06193759, 0.097066, -0.040272556, -0.152646..."
3,of,"[-0.05318928, -0.12785444, 0.002418933, -0.001..."
4,s,"[0.016910266, -0.1649269, -0.31504172, 0.26374..."


### Use ScikitLearn's TSNE library

In [None]:
import inspect

# Newer scikit-learn releases deprecate TSNE's ``n_iter`` in favour of
# ``max_iter``. Pass the iteration budget under whichever name the installed
# version accepts so the cell keeps working across versions.
_tsne_iter_kw_a = 'max_iter' if 'max_iter' in inspect.signature(TSNE.__init__).parameters else 'n_iter'

# Project the Austen embeddings to 2-D; random_state is fixed so the layout is
# repeatable for a given set of input vectors.
tsne_model_a = TSNE(
    perplexity=40,
    n_components=2,
    init='pca',
    random_state=23,
    **{_tsne_iter_kw_a: 2500},
)
tsne_values_a = tsne_model_a.fit_transform(coords_a['vector'].tolist())



In [None]:
# Store the first and second t-SNE output columns as the x / y plot coordinates.
coords_a['x'], coords_a['y'] = tsne_values_a[:, 0], tsne_values_a[:, 1]

In [None]:
# Preview the first five rows of coords_a.
coords_a.head(n=5)

Unnamed: 0,label,vector,x,y
0,of,"[0.3226378, 0.10830974, 0.36938196, -0.3088802...",-1.066439,5.09806
1,in,"[0.34731096, -0.26975662, 0.6065102, -0.030056...",-1.3229,4.525282
2,was,"[0.44936335, -0.31799895, 0.6100802, 0.484167,...",8.254866,3.541362
3,a,"[0.07964127, 0.07697755, -0.3564742, -0.183003...",14.922655,1.434503
4,man,"[-0.7001643, -0.3426114, -1.1465791, 0.6497261...",-8.926122,3.544481


In [None]:
import inspect

# Newer scikit-learn releases deprecate TSNE's ``n_iter`` in favour of
# ``max_iter``. Pass the iteration budget under whichever name the installed
# version accepts so the cell keeps working across versions.
_tsne_iter_kw_m = 'max_iter' if 'max_iter' in inspect.signature(TSNE.__init__).parameters else 'n_iter'

# Project the Melville embeddings to 2-D; random_state is fixed so the layout
# is repeatable for a given set of input vectors.
tsne_model_m = TSNE(
    perplexity=40,
    n_components=2,
    init='pca',
    random_state=23,
    **{_tsne_iter_kw_m: 2500},
)
tsne_values_m = tsne_model_m.fit_transform(coords_m['vector'].tolist())



In [None]:
# Store the first and second t-SNE output columns as the x / y plot coordinates.
coords_m['x'], coords_m['y'] = (
    tsne_values_m[:, 0],
    tsne_values_m[:, 1],
)

In [None]:
# Preview the first five rows of coords_m.
coords_m.head(n=5)

Unnamed: 0,label,vector,x,y
0,the,"[0.014728041, 0.2173266, 0.48286474, -0.256540...",5.3624,-6.849661
1,for,"[0.20142768, -0.50761616, -0.5495971, 0.110741...",-3.22886,2.338336
2,a,"[0.06193759, 0.097066, -0.040272556, -0.152646...",-1.34151,-6.300092
3,of,"[-0.05318928, -0.12785444, 0.002418933, -0.001...",-3.500493,-5.596595
4,s,"[0.016910266, -0.1649269, -0.31504172, 0.26374...",4.679792,-1.528852


### Plot the coordinates

In [None]:
# Plot the Austen vocabulary at its t-SNE coordinates, drawing each point as
# its word label instead of a marker.
(
    px.scatter(coords_a, x='x', y='y', text='label', height=1000)
    .update_traces(mode='text')
)

In this visualization of Austen's works, one region with similar words is the positive side of the y axis. This area contains many words associated with people, such as *eyes*, *friend*, and *father*. A second region that stands out is the group of clusters in the fourth quadrant; these are mostly nouns and verbs, such as *looking*, *room*, and *passed*.

In [None]:
# Plot the Melville vocabulary at its t-SNE coordinates, drawing each point as
# its word label instead of a marker.
(
    px.scatter(coords_m, x='x', y='y', text='label', height=1000)
    .update_traces(mode='text')
)

In this visualization of Melville's works, one region that stands out lies around the midpoint of the y axis on the negative side of the x axis. It contains many words that express uncertainty, such as *sometimes*, *whatever*, and *perhaps*. Another region that stands out is a small cluster in the first quadrant whose words are all time-related, such as *years*, *days*, and *hours*.

A few analogies between Melville's work and Austen's work are that both use very descriptive language; both use a lot of personal language, such as *me*, *her*, and *him*; and both use a lot of indecisive language, such as *seems*, *doubt*, and *might*.