### Vector Creation using Gensim and word2vec

In [1]:
# Eman Mozaffar
# Tutorial used: https://www.youtube.com/watch?v=Q2NtCcqmIww
# Uncomment and run the lines below if these packages are not installed yet

#!pip install gensim
#!pip install python-Levenshtein

[0mCollecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/63/46/5feab9c524a380bfa9f9f1c0d065743280dca30b216ab4c7a231f22dbed7/gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/fc/d9/d97f1db64b09278aba64e8c81b5d322d436132df5741c518f3823824fae0/smart_open-6.4.0-py3-none-any.whl.metadata
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl (24.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [2]:
import gensim
import pandas as pd

In [3]:
# Load the movie-dialogue CSV into a DataFrame and preview the first rows
datafile = "moviedata.csv"
movies = pd.read_csv(filepath_or_buffer=datafile)
movies.head(n=5)

Unnamed: 0,m0,line#,char#,charName,text,movieName,year,rating,IMDBvotes#,genre
0,m1,L2181,u12,ALONSO,Can't be that far I say. Also I don't like th...,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m1,L2180,u23,SAILOR,We left three weeks ago Alonso. Can't be that ...,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m1,L2179,u12,ALONSO,We should have seen land.,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
3,m1,L2177,u23,SAILOR,We'll all go crazy...,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
4,m1,L2176,u12,ALONSO,He's the devil's child...,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']


In [4]:
# Report the (rows, columns) dimensions of the loaded dataset
movies.shape

(287496, 10)

In [6]:
# Tokenize every dialogue line with gensim's simple_preprocess, which lowercases
# the text and strips punctuation (common words such as "the" are kept).
# word2vec only needs the text column, so that is the only one processed here.
tokenize_line = gensim.utils.simple_preprocess
movie_text = movies["text"].apply(tokenize_line)
movie_text.head()

0    [can, be, that, far, say, also, don, like, the...
1    [we, left, three, weeks, ago, alonso, can, be,...
2                       [we, should, have, seen, land]
3                             [we, ll, all, go, crazy]
4                              [he, the, devil, child]
Name: text, dtype: object

#### Training a basic model on all data

In [7]:
# Set up an untrained Word2Vec model; the corpus is supplied in later cells
w2v_settings = {
    "window": 10,     # max distance between the current and predicted word
    "min_count": 2,   # ignore words that appear fewer than 2 times
    "workers": 4,     # number of training threads
}
model = gensim.models.Word2Vec(**w2v_settings)

In [8]:
# Scan the tokenized lines to build the model's vocabulary,
# reporting progress every 1000 lines
model.build_vocab(corpus_iterable=movie_text, progress_per=1000)

In [9]:
# Inspect the number of training epochs configured on the model
model.epochs

5

In [17]:
# Fit the embeddings on the tokenized lines and display the counts gensim returns
training_counts = model.train(
    corpus_iterable=movie_text,
    total_examples=model.corpus_count,
    epochs=5,
)
training_counts

(10235556, 13968880)

In [18]:
# Persist the trained model to disk so it can be reloaded later
saved_model_path = "./word2vec-movie-text.model"
model.save(saved_model_path)

#### Some simple examples of using the model

In [20]:
# Ten closest words to "bad" in the learned embedding space
model.wv.most_similar(positive=["bad"], topn=10)

[('good', 0.6519631147384644),
 ('stupid', 0.6370585560798645),
 ('weird', 0.6032191514968872),
 ('rough', 0.5985896587371826),
 ('funny', 0.5981307625770569),
 ('smart', 0.5398736596107483),
 ('drunk', 0.5269859433174133),
 ('hard', 0.5223352909088135),
 ('tough', 0.5215043425559998),
 ('dumb', 0.5177732110023499)]

In [25]:
# Similarity score between the vectors for "woman" and "strong"
model.wv.similarity("woman", "strong")

0.17790651

In [26]:
# Similarity score between the vectors for "woman" and "weak"
model.wv.similarity("woman", "weak")

0.22163638

In [27]:
# Similarity score between the vectors for "woman" and "home"
model.wv.similarity("woman", "home")

0.027020589

In [28]:
# Similarity score between the vectors for "woman" and "work"
model.wv.similarity("woman", "work")

0.024333421

#### Training different models based on decade

In [29]:
# Replace the raw text column with token lists so each decade subset carries its
# own tokenized corpus. The isinstance guard keeps this cell safe to re-run:
# rows that already hold token lists are left as-is instead of being passed
# back into simple_preprocess (which expects a string).
movies['text'] = movies['text'].apply(
    lambda line: line if isinstance(line, list) else gensim.utils.simple_preprocess(line)
)
movies.head()

Unnamed: 0,m0,line#,char#,charName,text,movieName,year,rating,IMDBvotes#,genre
0,m1,L2181,u12,ALONSO,"[can, be, that, far, say, also, don, like, the...",1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m1,L2180,u23,SAILOR,"[we, left, three, weeks, ago, alonso, can, be,...",1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m1,L2179,u12,ALONSO,"[we, should, have, seen, land]",1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
3,m1,L2177,u23,SAILOR,"[we, ll, all, go, crazy]",1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
4,m1,L2176,u12,ALONSO,"[he, the, devil, child]",1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']


In [32]:
# Turn year into an int so we can filter.
# Any '/I' marker in the raw year strings is stripped before the cast.
# Casting to str first keeps the cell re-runnable once the column is already int
# (the .str accessor is not available on an integer column), and regex=False
# makes the replacement a literal match regardless of the pandas version.
movies['year'] = (
    movies['year']
    .astype(str)
    .str.replace('/I', '', regex=False)
    .astype(int)
)
movies.dtypes

m0             object
line#          object
char#          object
charName       object
text           object
movieName      object
year            int64
rating        float64
IMDBvotes#    float64
genre          object
dtype: object

In [34]:
# List the distinct release years present in the dataset
movies['year'].unique()

array([1992, 1997, 1987, 1999, 2003, 1934, 1991, 1990, 2007, 1993, 1995,
       1983, 1994, 1998, 2001, 1985, 1982, 2000, 1976, 1937, 2002, 2004,
       1980, 1955, 1970, 1941, 1927, 1939, 1988, 1936, 1984, 1986, 1989,
       1931, 1954, 1981, 1949, 1943, 1975, 1932, 2010, 2006, 1977, 1953,
       1996, 1972, 1963, 1945, 2008, 2005, 1960, 1964, 1979, 1950, 1973,
       1966, 1959, 1974, 1968, 1971, 1967, 1944, 1978, 1940, 1946, 1942,
       1969, 1965, 1933, 1957, 2009, 1958, 1961])

In [45]:
# Bucket the lines into ten-year spans (1921-1930 through 2001-2010).
# Both endpoints of every span are inclusive.
decade_bounds = [(first, first + 9) for first in range(1921, 2011, 10)]


def _lines_between(first_year, last_year):
    """Return the rows of `movies` whose year lies in [first_year, last_year]."""
    in_span = movies['year'].between(first_year, last_year)
    return movies[in_span]


movies_1921_1930 = _lines_between(1921, 1930)
movies_1931_1940 = _lines_between(1931, 1940)
movies_1941_1950 = _lines_between(1941, 1950)
movies_1951_1960 = _lines_between(1951, 1960)
movies_1961_1970 = _lines_between(1961, 1970)
movies_1971_1980 = _lines_between(1971, 1980)
movies_1981_1990 = _lines_between(1981, 1990)
movies_1991_2000 = _lines_between(1991, 2000)
movies_2001_2010 = _lines_between(2001, 2010)

# Decade subsets in chronological order
movie_dfs = [
    movies_1921_1930, movies_1931_1940, movies_1941_1950,
    movies_1951_1960, movies_1961_1970, movies_1971_1980,
    movies_1981_1990, movies_1991_2000, movies_2001_2010,
]

# Matching labels ('movies_1921_1930', ...), in the same order as movie_dfs
variable_names = [f"movies_{first}_{last}" for first, last in decade_bounds]

In [43]:
# Display the 1931-1940 subset to check the year filter
movies_1931_1940

Unnamed: 0,m0,line#,char#,charName,text,movieName,year,rating,IMDBvotes#,genre
3082,m103,L310944,u1530,ANDREWS,"[worried, lovington, after, all, something, mi...",it happened one night,1934,8.3,25577.0,['comedy' 'romance']
3083,m103,L310943,u1542,LOVINGTON,"[no, he, been, trailed, twenty, four, hours, d...",it happened one night,1934,8.3,25577.0,['comedy' 'romance']
3084,m103,L310942,u1530,ANDREWS,"[it, not, enough, are, you, certain, she, not,...",it happened one night,1934,8.3,25577.0,['comedy' 'romance']
3085,m103,L310941,u1542,LOVINGTON,"[ve, put, extra, men, on, all, along, the, way]",it happened one night,1934,8.3,25577.0,['comedy' 'romance']
3086,m103,L310940,u1530,ANDREWS,"[what, asking, isn, impossible, my, daughter, ...",it happened one night,1934,8.3,25577.0,['comedy' 'romance']
...,...,...,...,...,...,...,...,...,...,...
284099,m90,L283912,u1376,WILSON,"[who, hildy, johnson, she, just, stepped, out,...",his girl friday,1940,8.1,20870.0,['comedy' 'drama' 'romance']
284100,m90,L283871,u1374,SCHWARTZ,"[give, that, marriage, six, months]",his girl friday,1940,8.1,20870.0,['comedy' 'drama' 'romance']
284101,m90,L283870,u1376,WILSON,"[baldwin, his, name, is]",his girl friday,1940,8.1,20870.0,['comedy' 'drama' 'romance']
284102,m90,L283711,u1374,SCHWARTZ,"[his, honor, the, mayor, now, comes, out, with...",his girl friday,1940,8.1,20870.0,['comedy' 'drama' 'romance']


In [46]:
# Train and save one Word2Vec model per decade.
# Each model must be built only from that decade's lines: feeding the
# full-corpus `movie_text` here would make every saved model identical.
for decade_df, decade_name in zip(movie_dfs, variable_names):

    # Tokenized lines for this decade only (movies['text'] holds token lists)
    decade_tokens = decade_df['text']

    # Nothing to train on for a decade with no lines
    if decade_tokens.empty:
        print(f"Skipping {decade_name}: no lines in this range")
        continue

    # Create the model (separate name so the full-corpus `model` is not overwritten)
    decade_model = gensim.models.Word2Vec(
        window=10,
        min_count=2,
        workers=4
    )

    # Build vocab from this decade's lines
    decade_model.build_vocab(decade_tokens, progress_per=1000)

    # Train the model on the same lines, using the model's configured epoch count
    decade_model.train(
        decade_tokens,
        total_examples=decade_model.corpus_count,
        epochs=decade_model.epochs
    )

    # Save the model under the decade's name
    model_name = f"word2vec-{decade_name}-text.model"
    decade_model.save(model_name)

Now we have a saved model for each decade! You can load them back in (for example with `gensim.models.Word2Vec.load`) and compare the decades against one another. For example, calculate the cosine similarity between 'woman' and 'weak' and between 'woman' and 'strong', then compare how these values have changed over the years.