In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from enum import Enum
%pip install gensim
%pip install nltk

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /home/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
class Data(Enum):
    """Raw dataset files readable through `load_data`; each value is a file name."""
    # Movie metadata table (loaded below with wiki_id, freebase_id, name, ... columns)
    MOVIES = "movie.metadata.tsv"
    # Plot summaries (loaded below with wiki_id and plot columns)
    PLOTS = "plot_summaries.txt"
    # Character-name clusters (loaded below with character_name and freebase_id columns)
    NAME_CLUSTERS = "name.clusters.txt"

def load_data(type: "Data", columns, index_col="", data_dir="./rawdata/MovieSummaries"):
    """Read one of the raw tab-separated dataset files into a DataFrame.

    Args:
        type: Member of ``Data``; its ``value`` is the file name to read.
            The parameter name shadows the ``type`` builtin inside this
            function; it is kept so existing keyword callers keep working.
        columns: Column names in file order. They are passed to pandas as
            ``names``, so the first line of the file is read as data.
        index_col: Name of the column to use as the index. The default ``""``
            means "use the first entry of ``columns``".
        data_dir: Directory that contains the dataset files. Defaults to the
            location that was previously hard-coded.

    Returns:
        pandas.DataFrame indexed by the selected column.
    """
    # "" is the sentinel for "index by the first column"
    if index_col == "":
        index_col = columns[0]
    return pd.read_csv(
        f"{data_dir}/{type.value}",
        sep="\t",
        names=columns,
        index_col=index_col,
    )

# Movie metadata; indexed by wiki_id because the first column is the default index
movie_df = load_data(
    type=Data.MOVIES,
    columns=[
        "wiki_id", "freebase_id", "name", "release_date", "revenue",
        "runtime", "languages", "countries", "genres",
    ],
)

# Plot summaries; also indexed by wiki_id
plot_df = load_data(type=Data.PLOTS, columns=["wiki_id", "plot"])

# Character name clusters; indexed by freebase_id instead of the first column
name_df = load_data(
    type=Data.NAME_CLUSTERS,
    columns=["character_name", "freebase_id"],
    index_col="freebase_id",
)

In [34]:
import re

# Parses the multi value structure of the dataset format into a sorted list of names.
# Missing or empty input yields an empty list.
def parse_multi_values(x):
    """Extract the names from a serialized ``{"id": "name", ...}`` cell.

    Args:
        x: Raw cell content as a string, or a missing value (None / float NaN).

    Returns:
        Alphabetically sorted list of names; ``[]`` when the cell is missing,
        empty, or contains no ``key: value`` pair.

    Notes:
        Every character that is not a letter, digit or whitespace is removed
        from a name (so "Comedy-drama" becomes "Comedydrama").
        The cell is split on "," and ":" textually, so only the part of a
        value before its first "," or ":" is kept.
    """
    # Missing cells can arrive as None or float NaN; NaN is the only value
    # that is not equal to itself. Other non-string types still raise.
    if x is None or (isinstance(x, float) and x != x):
        return []
    if not x:
        return []

    names = []
    for pair in x.split(','):
        parts = pair.split(":")
        if len(parts) > 1:
            # Raw string: "\s" in a normal literal is an invalid escape sequence.
            # strip() also covers the single leading space left after the ":".
            names.append(re.sub(r"[^a-zA-Z0-9\s]+", "", parts[1]).strip())
    return sorted(names)

# Add plot summaries to the movie df.
# Both frames are indexed by wiki_id, so join() aligns rows on that index. It is a
# left join (the pandas default), so movies without a summary get NaN in "plot".
# NOTE(review): not idempotent - once movie_df already has a "plot" column,
# running this line again raises because the column names overlap.
movie_df = movie_df.join(plot_df)



# Cleans the language column by removing the word "Language" from each entry
def parse_language(languages):
    """Return the language names with every "Language" occurrence removed.

    Anything that is falsy or not exactly a ``list`` yields an empty list.
    Each remaining entry is trimmed of surrounding whitespace.
    """
    if not languages:
        return []
    if type(languages) is not list:
        return []
    return [entry.replace("Language", "").strip() for entry in languages]

# Reduces a release date to its year. Missing entries are given NaN.
def parse_date(x):
    """Return the year part of a release date.

    Args:
        x: Date such as "2001-08-24" or "2001", or a missing value
            (None, NaN, empty string).

    Returns:
        The text before the first "-" of ``str(x)`` (a string, not an int),
        or ``np.nan`` when the value is missing.
    """
    # NaN never compares equal to anything (not even itself), so an `==` test
    # cannot detect it; pd.isna covers both None and NaN.
    if pd.isna(x) or not x:
        return np.nan
    return str(x).split("-")[0]

# Turn the serialized multi-value columns into sorted lists of names
multi_value_columns = ["languages", "countries", "genres"]
for attribute in multi_value_columns:
    parsed_values = movie_df[attribute].apply(parse_multi_values)
    movie_df[attribute] = parsed_values

# Drop the word "Language" from every entry of the language lists
movie_df["languages"] = movie_df["languages"].apply(parse_language)

# Keep only the year part of the release date (finer granularity is not needed right now)
movie_df["release_date"] = movie_df["release_date"].apply(parse_date)

# Missing numeric values become 0 so the columns can be stored as int instead of float
for col in ["runtime", "revenue"]:
    movie_df[col] = movie_df[col].fillna(0).astype(int)

# Only movies that have a plot summary are kept
has_plot = movie_df['plot'].notna()
movie_df = movie_df[has_plot]
movie_df

Unnamed: 0_level_0,freebase_id,name,release_date,revenue,runtime,languages,countries,genres,plot
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
975900,/m/03vyhn,Ghosts of Mars,2001,14010832,98,[English],[United States of America],"[Action, Adventure, Horror, Science Fiction, S...","Set in the second half of the 22nd century, th..."
9363483,/m/0285_cd,White Of The Eye,1987,0,110,[English],[United Kingdom],"[Erotic thriller, Psychological thriller, Thri...",A series of murders of rich young women throug...
261236,/m/01mrr1,A Woman in Flames,1983,0,106,[German],[Germany],[Drama],"Eva, an upper class housewife, becomes frustra..."
18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,0,86,[English],[South Africa],"[Adventure, Family Film, Fantasy, World cinema]","Every hundred years, the evil Morgana returns..."
6631279,/m/0gffwj,Little city,1997,0,93,[English],[United States of America],"[Comedy, Comedydrama, Drama, Ensemble Film, Ro...","Adam, a San Francisco-based artist who works a..."
...,...,...,...,...,...,...,...,...,...
23851782,/m/06_vb43,The Ghost Train,1941,0,82,[English],[United Kingdom],"[Comedy, Crime Fiction, Supernatural, Thriller]",{{plot}} The film opens with a Great Western e...
35228177,/m/0j7hxnt,Mermaids: The Body Found,2011,0,120,[English],[United States of America],[Drama],Two former National Oceanic Atmospheric Admini...
34980460,/m/0g4pl34,Knuckle,2011,0,96,[English],"[Ireland, United Kingdom]","[Biographical film, Documentary, Drama]",{{No plot}} This film follows 12 years in the ...
913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992,0,150,[Japanese],[Japan],"[Adventure, Animation, Anime, Drama, Japanese ...","The story takes place in the year 2092,The Sup..."


In [35]:
import string

def parseDocument(doc):
    """Normalise a text and split it into tokens.

    The text is lower-cased, newlines become spaces, carriage returns are
    dropped and every character in ``string.punctuation`` is deleted before
    the result is handed to ``word_tokenize``.
    """
    # TODO: the punctuation removal is probably unnecessary since word_tokenize should handle it
    punctuation_remover = str.maketrans('', '', string.punctuation)
    single_line = doc.lower().replace('\n', ' ').replace('\r', '')
    return word_tokenize(single_line.translate(punctuation_remover))

# Tokenize all plots.
# Each document is tagged with its position in the plot column (0, 1, 2, ...),
# not with its wiki_id.
tokenized_df = movie_df.copy()
tokenized_summaries = [
    TaggedDocument(parseDocument(plot_text), [position])
    for position, plot_text in enumerate(tokenized_df['plot'].values)
]


In [36]:
import pathlib
import multiprocessing

# Loads existing model from disk if available else trains new model
# Set force_train_new to True to train a new model even if a previous model exists
def create_doc2vec_model(documents, force_train_new = False):
    """Return a Doc2Vec model, reusing the one saved on disk when possible.

    Args:
        documents: Tagged documents used when a new model has to be trained.
        force_train_new: When True, skip loading and always train a new model.

    Returns:
        The loaded or newly trained Doc2Vec model. A newly trained model is
        saved to ``doc2vec_model`` in the current working directory.

    Notes:
        A loaded model is returned as-is; it is not checked against
        ``documents``. Training passes no seed and uses one worker per CPU.
    """
    fname = str(pathlib.Path().resolve() / "doc2vec_model")
    if not force_train_new:
        # Try to load, if it succeeds then previous model existed so return that one
        try:
            return Doc2Vec.load(fname)
        except Exception as err:
            # Only ordinary errors are treated as "no usable model"; a bare
            # except would also swallow KeyboardInterrupt and SystemExit.
            print(f"Failed to load existing model: {err!r}")

    # Train model
    print("Start training of Doc2Vec model...")
    model = Doc2Vec(
        documents,
        window=2,
        min_count=1,
        workers=multiprocessing.cpu_count(),
        epochs=100,
    )

    # Persist the model
    print(f"Finished training Doc2Vec, saving it to file {fname}")
    model.save(fname)
    return model

# Train the doc2vec model, or reuse the one saved by an earlier run.
# With force_train_new=False a model that loads successfully from disk is
# returned without looking at tokenized_summaries.
model = create_doc2vec_model(tokenized_summaries, force_train_new=False)

Start training of Doc2Vec model...
Finished training Doc2Vec, saving it to file /home/leo/sync/kurser/cs401-AppliedDataAnalysis/ada-2022-project-adawizardry/doc2vec_model


In [51]:
# Reference text about the Apollo 13 mission, used below as the query document
# (presumably the introduction of the Wikipedia article - verify the source).
apollo_wiki = """Apollo 13 (April 11–17, 1970) was the seventh crewed mission in the Apollo space program and the third meant to land on the Moon. The craft was launched from Kennedy Space Center on April 11, 1970, but the lunar landing was aborted after an oxygen tank in the service module (SM) failed two days into the mission. The crew instead looped around the Moon and returned safely to Earth on April 17. The mission was commanded by Jim Lovell, with Jack Swigert as command module (CM) pilot and Fred Haise as Lunar Module (LM) pilot. Swigert was a late replacement for Ken Mattingly, who was grounded after exposure to rubella.

A routine stir of an oxygen tank ignited damaged wire insulation inside it, causing an explosion that vented the contents of both of the SM's oxygen tanks to space. Without oxygen, needed for breathing and for generating electric power, the SM's propulsion and life support systems could not operate. The CM's systems had to be shut down to conserve its remaining resources for reentry, forcing the crew to transfer to the LM as a lifeboat. With the lunar landing canceled, mission controllers worked to bring the crew home alive.

Although the LM was designed to support two men on the lunar surface for two days, Mission Control in Houston improvised new procedures so it could support three men for four days. The crew experienced great hardship, caused by limited power, a chilly and wet cabin and a shortage of potable water. There was a critical need to adapt the CM's cartridges for the carbon dioxide scrubber system to work in the LM; the crew and mission controllers were successful in improvising a solution. The astronauts' peril briefly renewed public interest in the Apollo program; tens of millions watched the splashdown in the South Pacific Ocean on television.

An investigative review board found fault with preflight testing of the oxygen tank and Teflon being placed inside it. The board recommended changes, including minimizing the use of potentially combustible items inside the tank; this was done for Apollo 14. The story of Apollo 13 has been dramatized several times, most notably in the 1995 film Apollo 13 – based on Lost Moon, the 1994 memoir co-authored by Lovell – and an episode of the 1998 miniseries From the Earth to the Moon."""
# Embed the reference text using the same preprocessing as the plots
test_doc = parseDocument(apollo_wiki)
test_vector = model.infer_vector(test_doc)

# The 100 plot vectors closest to the reference text, as (id, similarity) pairs
top_similar = model.dv.most_similar(positive=[test_vector], topn=100)

# NOTE(review): this rebinds the notebook-wide movie_df to a positional index,
# which drops wiki_id for every later cell.
movie_df = movie_df.reset_index(drop=True)

# Reminder, id here is the natural (positional) index from movie_df
top_similar_df = (
    pd.DataFrame(top_similar, columns=["id", "similarity"])
    .set_index("id")
    .merge(movie_df, left_index=True, right_index=True)
    [["name", "similarity", "release_date", "plot"]]
)
top_similar_df

# top_similar_df = 

# top_similar_df = top_similar_df.set_index("wiki_id")
# top_similar_df.head(n=30)

# Get the similarity for all wiki pages of events and save the results, with their similarity scores, to a new table

# (event, year)
# (movie, similarity)
# Has the event caused a change in the *trend*?
# Approaches:
# * average similarity per year? -> hard to find a change, diluted by other events?
# * look at movies with a similarity score over some threshold to find all movies that are somewhat similar, and do a line plot of those movies
# # -> downside is that a very similar movie and a barely similar movie are weighted equally. -> use similarity as weight?
# * 


Unnamed: 0,name,similarity,release_date,plot
41722,Apollo 13,0.660164,1995,"On July 20, 1969, veteran astronaut Jim Lovell..."
41063,The Dream is Alive,0.649144,1985,The movie includes scenes from numerous shuttl...
27366,Rocket Man,0.641467,1997,NASA is training for the first manned mission ...
33620,Sunshine,0.607212,2007,"In 2050, the failure of the Earth's Sun threat..."
22286,World Without Sun,0.603604,1964,"World Without Sun, a documentary produced and ..."
...,...,...,...,...
3581,Airport '77,0.482915,1977,"A privately owned luxury Boeing 747-100, Steve..."
8129,Leviathan,0.482759,1989,Tri-Oceanic Corp has hired undersea miners for...
1091,Airplane!,0.482534,1980,Ex-fighter pilot and taxi driver Ted Striker ...
16497,Dark Star,0.482451,1974,"In the middle of the 22nd century, mankind has..."


In [38]:
# Spot check of a single movie: .iloc[19136] picks the row at position 19136,
# .name is that row's index label, and .loc then looks the row up by that label.
index = movie_df.iloc[19136].name
movie_df.loc[index]

freebase_id                                            /m/05h54kz
name                                                        Nikah
release_date                                                 1998
revenue                                                         0
runtime                                                         0
languages                                                  [Urdu]
countries                                              [Pakistan]
genres                                                    [Drama]
plot            The story is about two different classes; Shaa...
Name: 19136, dtype: object