In [32]:
%pip install gensim
%pip install nltk
%pip install keybert
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import pathlib
import multiprocessing
from keybert import KeyBERT

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need 

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>


In [33]:
from enum import Enum
import pandas as pd
import re
import numpy as np

class Data(Enum):
    MOVIES = "movie.metadata.tsv"
    PLOTS = "plot_summaries.txt"
    NAME_CLUSTERS = "name.clusters.txt"

def load_data(type : Data, columns, index_col = ""):
    """
    Loads data of type using the Data Enum, with columns, and index_col as the index if provided. The data must be in a folder named MovieSummaries.

    returns: DataFrame
    """
    return pd.read_csv(
        #f'./rawdata/MovieSummaries/{type.value}',
        f'MovieSummaries/{type.value}', 
        sep="\t", 
        names=columns,
        index_col=index_col if index_col != "" else columns[0]
    )

orig_movie_df = load_data(
    Data.MOVIES, 
    ["wiki_id", "freebase_id", "name", "release_date", "revenue", "runtime", "languages", "countries", "genres"],
)
plot_df = load_data(
    Data.PLOTS, 
    ["wiki_id", "plot"]
)

In [34]:
def parse_multi_values(x): 
    """
    Parses the multi value structure of the dataset format into a sorted array of items
    Returns null values for empty lists as NaN (so all missing values we use are NaN)
    
    returns: array of strings extracted from x
    """
    if not x: 
        return []
    x = x.split(',')
    result = []
    for combo in x: 
        combo = combo.split(":")
        if len(combo) > 1: 
            combo[1] = re.sub("[^a-zA-Z0-9\s]+", "", combo[1])
            combo[1] = re.sub("^\s", "", combo[1])
            combo[1] = combo[1].strip()
            result.append(combo[1])
    if not result:
        return []
    return sorted(result)

def parse_language(languages):
    """
    Parses the language column specifically by removing "Langauge" from each entry

    returns: list of strings of the languages
    """
    if not languages or type(languages) != list:
        return []
    return list(map(lambda l : l.replace("Language", "").strip(), languages))

def parse_date(x):
    """
    Parses the date format into an int representing only the year. Values with missing entries are given NaN.

    returns: string representing the year 
    """
    if not x or x == np.nan:
        return np.NaN
    return str(x).split("-")[0]

def data_clean_movies(df):
    # Add plot summaries to the movie dataframe
    df = df.join(plot_df)

    # Parse multi value attributes into arrays
    for attribute in ["languages", "countries", "genres"]:
        df[attribute] = df[attribute].apply(parse_multi_values)

    # Remove "Language" from language column
    df["languages"] = df["languages"].apply(parse_language)

    # Parse release-date to all use the year as an int (don't need increased granularity right now)
    df["release_date"] = df["release_date"].apply(parse_date)

    # Replace missing numeric values with zero and convert to int rather than float
    for col in ["runtime", "revenue"]:
        df[col] = df[col].fillna(0)
        df[col] = df[col].astype(int)
    
    return df

# Create df that is cleaned
movie_df = data_clean_movies(orig_movie_df)

In [35]:
import string

def parseDocument(doc):
        """
        Parses a string into a format that is more suitable for later NLP applications.

        returns: parsed string
        """
        return word_tokenize(
                doc
                .lower()
                # Remove certain special characters
                .replace('\n', '')
                .replace('\r', '')
                # Remove punctuation and special characters
                .translate(str.maketrans('', '', string.punctuation))
        )

# Create new df where we remove all rows with missing plots
plot_removed = data_clean_movies(orig_movie_df)
plot_removed =  plot_removed[plot_removed['plot'].notna()]

In [36]:
plot_removed.head(1)

Unnamed: 0_level_0,freebase_id,name,release_date,revenue,runtime,languages,countries,genres,plot
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
975900,/m/03vyhn,Ghosts of Mars,2001,14010832,98,[English],[United States of America],"[Action, Adventure, Horror, Science Fiction, S...","Set in the second half of the 22nd century, th..."


In [39]:
kw_model = KeyBERT()
global progress = 0

def key_bert(x):
    if progress % 100 == 0:
        print(progress)
    progress += 1
    return kw_model.extract_keywords(x, keyphrase_ngram_range=(1, 2), top_n=30, stop_words=None)

plot_removed['keyword'] = plot_removed['plot'].apply(key_bert)

UnboundLocalError: local variable 'progress' referenced before assignment

In [None]:
# First, we tokenize all plot summaries
tokenized_df = plot_removed.copy()
tokenized_summaries = tokenized_df['keyword'].values
tokenized_summaries = [TaggedDocument(parseDocument(doc), [idx]) for idx, doc in enumerate(tokenized_summaries)]

KeyError: 'keyword'

In [None]:
doc = """
         Apollo 11 (July 16–24, 1969) was the American spaceflight that first landed humans on the Moon. Commander Neil Armstrong and lunar module pilot Buzz Aldrin landed the Apollo Lunar Module Eagle on July 20, 1969, at 20:17 UTC, and Armstrong became the first person to step onto the Moon's surface six hours and 39 minutes later, on July 21 at 02:56 UTC. Aldrin joined him 19 minutes later, and they spent about two and a quarter hours together exploring the site they had named Tranquility Base upon landing. Armstrong and Aldrin collected 47.5 pounds (21.5 kg) of lunar material to bring back to Earth as pilot Michael Collins flew the Command Module Columbia in lunar orbit, and were on the Moon's surface for 21 hours, 36 minutes before lifting off to rejoin Columbia.

Apollo 11 was launched by a Saturn V rocket from Kennedy Space Center on Merritt Island, Florida, on July 16 at 13:32 UTC, and it was the fifth crewed mission of NASA's Apollo program. The Apollo spacecraft had three parts: a command module (CM) with a cabin for the three astronauts, the only part that returned to Earth; a service module (SM), which supported the command module with propulsion, electrical power, oxygen, and water; and a lunar module (LM) that had two stages—a descent stage for landing on the Moon and an ascent stage to place the astronauts back into lunar orbit.

After being sent to the Moon by the Saturn V's third stage, the astronauts separated the spacecraft from it and traveled for three days until they entered lunar orbit. Armstrong and Aldrin then moved into Eagle and landed in the Sea of Tranquility on July 20. The astronauts used Eagle's ascent stage to lift off from the lunar surface and rejoin Collins in the command module. They jettisoned Eagle before they performed the maneuvers that propelled Columbia out of the last of its 30 lunar orbits onto a trajectory back to Earth.[8] They returned to Earth and splashed down in the Pacific Ocean on July 24 after more than eight days in space.

Armstrong's first step onto the lunar surface was broadcast on live TV to a worldwide audience. He described the event as "one small step for [a] man, one giant leap for mankind."[a][14] Apollo 11 effectively proved US victory in the Space Race to demonstrate spaceflight superiority, by fulfilling a national goal proposed in 1961 by President John F. Kennedy, "before this decade is out, of landing a man on the Moon and returning him safely to the Earth
      """
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1,2), top_n=30)

In [None]:
kw_string = ""
for s in keywords:
    kw_string += s[0] + " "

In [None]:
kw_string

'landed apollo apollo 11 nasa apollo landing moon apollo lunar entered lunar astronauts lunar landing armstrong armstrong lunar columbia apollo apollo spacecraft 14 apollo lift lunar apollo columbia lunar apollo program lunar module program apollo humans moon spaceflight landed space armstrong sent moon moon returning lunar orbit 30 lunar step lunar neil armstrong lunar aldrin landed moon ascent '