In [2]:
#imports
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import json
import Helper
from transformers import BertTokenizer, BertModel
import torch


sns.set_style("whitegrid")
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

In [3]:
#read tsv file and add headers
movie_metadata_df = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None, 
                names=['wiki_movie_id', 
                        'freebase_movie_id', 
                        'movie_name', 
                        'movie_release_date', 
                        'movie_box_office_revenue', 
                        'movie_runtime', 
                        'movie_languages', 
                        'movie_countries', 
                        'movie_genres'])

movie_metadata_df.loc[movie_metadata_df['movie_name'] == 'Zero Tolerance', 'movie_runtime'] = 88
movie_metadata_df.loc[movie_metadata_df['movie_name'] == 'Hunting Season', 'movie_release_date'] = '2010-12-02'


#change movie_release_date to pandas datetime
movie_metadata_df['movie_release_date'] = pd.to_datetime(movie_metadata_df['movie_release_date'], format='%Y-%m-%d', errors='coerce')
movie_metadata_df.head(10)

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,NaT,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,NaT,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,NaT,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
5,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."
6,18998739,/m/04jcqvw,The Sorcerer's Apprentice,NaT,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant..."
7,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ..."
8,9997961,/m/06_y2j7,Contigo y aquí,NaT,,,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0jgd"": ""Argentina""}","{""/m/04t36"": ""Musical"", ""/m/07s9rl0"": ""Drama"",..."
9,2345652,/m/075f66,City of the Dead,NaT,,76.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/03npn"": ""Horror"", ""/m/0fdjb"": ""Supernatur..."


In [4]:
#read text file and add headers
plot_summaries_df = pd.read_csv('data/MovieSummaries/plot_summaries.txt', sep='\t', header=None, 
                names=['wiki_movie_id', 
                        'plot_summary'])
#remove any {{ }} from the plot summary text
plot_summaries_df['plot_summary'] = plot_summaries_df['plot_summary'].str.replace(r'\{\{.*?\}\}', '', regex=True)
plot_summaries_df.head(10)


Unnamed: 0,wiki_movie_id,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
5,5272176,The president is on his way to give a speech. ...
6,1952976,"The film opens in 1974, as a young girl, Dahl..."
7,24225279,"The story begins with Hannah, a young Jewish t..."
8,2462689,Infuriated at being told to write one final co...
9,20532852,A line of people drool at the window of the s...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_text(text):
    # Create a TF-IDF vectorizer with a minimum document frequency of 1
    vectorizer = TfidfVectorizer(min_df=1)
    
    if text.strip():
        # Fit the vectorizer on the input text and transform the text into a TF-IDF matrix
        tfidf_matrix = vectorizer.fit_transform([text])
        
        # Convert the TF-IDF matrix to a dense array and return the vectorized representation
        vectorized_text = tfidf_matrix.toarray()
    else:
        # Return a zero vector if the text is empty
        vectorized_text = [0]

    return vectorized_text


In [6]:
# Vectorize each plot summary and add the vector representation to a new column
plot_summaries_df['vectorized_summary'] = plot_summaries_df['plot_summary'].apply(vectorize_text)

# Display the updated DataFrame
plot_summaries_df.head(10)

Unnamed: 0,wiki_movie_id,plot_summary,vectorized_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...","[[0.19611613513818404, 0.19611613513818404, 0...."
1,31186339,The nation of Panem consists of a wealthy Capi...,"[[0.024540339127743678, 0.06135084781935919, 0..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,"[[0.016893434459987148, 0.033786868919974296, ..."
3,2231378,"The Lemon Drop Kid , a New York City swindler,...","[[0.01949665965537125, 0.009748329827685625, 0..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,"[[0.02161183615368384, 0.02161183615368384, 0...."
5,5272176,The president is on his way to give a speech. ...,"[[0.02205941986694528, 0.02205941986694528, 0...."
6,1952976,"The film opens in 1974, as a young girl, Dahl...","[[0.03281875025953022, 0.010939583419843408, 0..."
7,24225279,"The story begins with Hannah, a young Jewish t...","[[0.014480592720167369, 0.028961185440334738, ..."
8,2462689,Infuriated at being told to write one final co...,"[[0.01606853383165671, 0.01606853383165671, 0...."
9,20532852,A line of people drool at the window of the s...,"[[0.06119900613621046, 0.06119900613621046, 0...."


In [7]:
plot_summaries_df['vectorized_summary'][3].shape

(1, 372)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()