In [20]:
import pandas as pd
import numpy as np
import json
import re


RANDOM_STATE = 1212

# Preparing the CMU corpus.

In [21]:
# Load the raw CMU movie metadata. The file is tab-separated and is read with
# header=None, so the columns carry positional labels (0..8) until they are renamed.
cmu_corpus = pd.read_csv("movie.metadata.tsv", sep="\t", header=None)
# Preview the first rows
cmu_corpus.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [22]:
# Name the columns we keep, drop the rest, and reduce the release date to its year.
# Written as one chain that rebinds `cmu_corpus` instead of mutating it in place,
# so re-running the cell on the already-pruned frame is a no-op rather than an error.
cmu_corpus = (
    cmu_corpus
    # Positional labels 0, 2, 3 and 8 come from reading the file with header=None;
    # rename() silently skips labels that are no longer present.
    .rename(columns={0: "movie_id", 2: "movie_name", 3: "year", 8: "genre"})
    # errors="ignore" keeps the drop harmless once the columns are already gone.
    .drop(columns=[1, 4, 5, 6, 7], errors="ignore")
    # Keep only the first four characters of the date string ("2001-08-24" -> "2001").
    # Missing dates turn into the literal string "nan" through astype(str).
    .assign(year=lambda frame: frame["year"].astype(str).str[:4])
)
cmu_corpus.head()

Unnamed: 0,movie_id,movie_name,year,genre
0,975900,Ghosts of Mars,2001,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,Brun bitter,1988,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,White Of The Eye,1987,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,A Woman in Flames,1983,"{""/m/07s9rl0"": ""Drama""}"


In [23]:
# Load the plot summaries: one tab-separated line per movie (id, plot text)
plot_summaries = pd.read_csv("plot_summaries.txt", sep="\t", header=None)
plot_summaries.columns = ["movie_id", "plot"]

# Patterns stripped from every plot: citation/reference markup first, then URLs.
# NOTE(review): the `<ref.*\}` and `\{.*\/>` alternatives are greedy, so they can
# also remove ordinary text lying between two markers in the same plot - confirm
# that this is intended.
markup_pattern = re.compile(r'\{.*?\}|<ref.*\}|\{.*\/>')
url_pattern = re.compile(r'http\S+')

# Apply both substitutions per plot, markup before URLs
plot_summaries['plot'] = plot_summaries['plot'].map(
    lambda text: url_pattern.sub('', markup_pattern.sub('', text)))

# Make sure the join key is an integer on both sides
cmu_corpus['movie_id'] = cmu_corpus['movie_id'].astype(int)
plot_summaries['movie_id'] = plot_summaries['movie_id'].astype(int)

# Remove all plot summaries with fewer than 2 sentences
# (a sentence is approximated by counting "." characters)
has_two_periods = plot_summaries["plot"].map(lambda text: text.count(".") >= 2)
plot_summaries = plot_summaries[has_two_periods]

# Inner-join the metadata with the remaining summaries on 'movie_id'
cmu_corpus = cmu_corpus.merge(plot_summaries, on="movie_id")
cmu_corpus.head()

Unnamed: 0,movie_id,movie_name,year,genre,plot
0,975900,Ghosts of Mars,2001,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,1987,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,1983,"{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,2002,"{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,1997,"{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."


In [24]:
# Replace the JSON-encoded genre column with a plain list of genre names

# Every raw value is a JSON object (id -> genre name); only the names are kept
genres = [list(json.loads(raw_genre).values()) for raw_genre in cmu_corpus['genre']]

# Drop the raw column and append the parsed lists under the same name,
# which places 'genre' after 'plot' as the last column
cmu_corpus = cmu_corpus.drop(columns=["genre"]).assign(genre=genres)

cmu_corpus.head()

Unnamed: 0,movie_id,movie_name,year,plot,genre
0,975900,Ghosts of Mars,2001,"Set in the second half of the 22nd century, th...","[Thriller, Science Fiction, Horror, Adventure,..."
1,9363483,White Of The Eye,1987,A series of murders of rich young women throug...,"[Thriller, Erotic thriller, Psychological thri..."
2,261236,A Woman in Flames,1983,"Eva, an upper class housewife, becomes frustra...",[Drama]
3,18998739,The Sorcerer's Apprentice,2002,"Every hundred years, the evil Morgana returns...","[Family Film, Fantasy, Adventure, World cinema]"
4,6631279,Little city,1997,"Adam, a San Francisco-based artist who works a...","[Romantic comedy, Ensemble Film, Comedy-drama,..."


In [25]:
# Remove rows without any genre or without a year, then compare the sizes
# before and after the filter.
has_genre = cmu_corpus['genre'].str.len() != 0   # empty genre list -> dropped
has_year = cmu_corpus['year'] != "nan"           # missing years are the string "nan"
cmu_movies = cmu_corpus[has_genre & has_year]
cmu_movies.shape, cmu_corpus.shape


((37604, 5), (40349, 5))

# Putting together version 1 of the final dataset.

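##### To create the dataset we will work with, we take 1500 movies from each genre (thriller, horror, comedy and drama). Movies that belong to more than one of these four genres are excluded. First, we shuffle the dataset to limit any potential ordering bias. (Note: in the cell below the 1500-per-genre sampling step is commented out, so the full filtered set is what gets saved.)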


In [26]:
# Shuffle once with a fixed seed so the row order is reproducible.
# NOTE: `cmu_movies` is rebound to its own shuffled copy, so running this cell a
# second time without re-running the previous one shuffles the frame again.
cmu_movies = cmu_movies.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# The four mutually exclusive target genres.
TARGET_GENRES = ('Comedy', 'Horror', 'Thriller', 'Drama')


def select_single_genre(movies, target, all_targets=TARGET_GENRES):
    """Return the movies tagged with `target` and with none of the other target genres.

    Parameters
    ----------
    movies : pd.DataFrame
        Frame whose 'genre' column holds a list of genre names per row.
    target : str
        Genre a movie must be tagged with to be kept.
    all_targets : sequence of str
        Full set of target genres; every entry other than `target` excludes a movie.

    Returns
    -------
    pd.DataFrame
        New frame with the same index and columns as the matching rows of
        `movies`, with 'genre' replaced by the single label `target`.
    """
    excluded = [name for name in all_targets if name != target]
    keep = movies['genre'].apply(
        lambda labels: target in labels and not any(name in labels for name in excluded)
    )
    # assign() builds a new frame, so the label is never written into a filtered
    # slice of `movies` (writing through .loc there can raise SettingWithCopyWarning).
    return movies[keep].assign(genre=target)


comedy_movies = select_single_genre(cmu_movies, 'Comedy')
horror_movies = select_single_genre(cmu_movies, 'Horror')
thriller_movies = select_single_genre(cmu_movies, 'Thriller')
drama_movies = select_single_genre(cmu_movies, 'Drama')

cmu_backup = pd.concat([comedy_movies, horror_movies, thriller_movies, drama_movies])


# used for creating a test dataset
# cmu_dataset = pd.concat([comedy_movies.head(1500), horror_movies.head(1500), thriller_movies.head(1500), drama_movies.head(1500)])
# cmu_dataset.reset_index(drop=True, inplace=True)
# cmu_dataset.to_csv("cmu_dataset_v3.csv", index=False)

# OBSOLETE
# remove every object in cmu_dataset from cmu_backup using indexes
# cmu_backup.drop(cmu_dataset.index, inplace=True)

# Renumber the rows 0..n-1 now that the four subsets are stacked
cmu_backup = cmu_backup.reset_index(drop=True)

# Persist the combined single-genre set
cmu_backup.to_csv("cmu_backup.csv", index=False)

cmu_backup.shape, comedy_movies.shape, horror_movies.shape, thriller_movies.shape, drama_movies.shape

((21758, 5), (6060, 5), (1961, 5), (2009, 5), (11728, 5))

In [27]:
# Count how many movies of each genre ended up in cmu_backup
cmu_backup['genre'].value_counts()

genre
Drama       11728
Comedy       6060
Thriller     2009
Horror       1961
Name: count, dtype: int64

In [28]:
# Load the IMDb dataset that cmu_backup is matched against later on
imdb = pd.read_csv("imdb_data_final.csv")

# Keep only the four working columns (this also discards 'movie_id') and convert
# 'year' to an integer. Doing both in one expression yields a fresh frame, so the
# cell can be re-run safely and no value is written into a column-sliced frame.
cmu_backup = cmu_backup[["movie_name", "year", "genre", "plot"]].astype({"year": int})

In [29]:
# Number of rows and columns in the IMDb frame
imdb.shape

(5722, 4)

In [30]:
# Preview the first rows of the IMDb frame
imdb.head()

Unnamed: 0,movie_name,year,genre,plot
0,"Go West, Young Man",1936,Comedy,"Movie star Mavis Arden, as amorous in private ..."
1,Re-Animated,2006,Comedy,Jimmy is the kid everybody ignores and uses. O...
2,Blue Blazes,1936,Comedy,"Buster becomes a fireman, but unfortunately no..."
3,Meet the Baron,1933,Comedy,The famous Baron Munchausen dumps two dimwits ...
4,I Got the Hook Up,1998,Comedy,Two broke buddies feel lucky when they come up...


In [31]:
# Preview the first rows of cmu_backup
cmu_backup.head()

Unnamed: 0,movie_name,year,genre,plot
0,"Go West, Young Man",1936,Comedy,"Mavis Arden , is a movie star who gets romanti..."
1,Re-Animated,2006,Comedy,Jimmy Roberts is a 12-year-old boy who can't ...
2,Blue Blazes,1936,Comedy,"Elmer becomes a fireman, but not a particular..."
3,Zhizn i priklyucheniya chetyrekh druzei 1/2,1980,Comedy,Three dogs and one cat are naturally suspiciou...
4,Meet the Baron,1933,Comedy,A couple of bungling idiots are abandoned in ...


In [32]:
# Check the dtype of each column in cmu_backup
cmu_backup.dtypes

movie_name    object
year           int32
genre         object
plot          object
dtype: object

In [33]:
# For every IMDb row, look up the cmu_backup rows with the same movie_name, year
# and genre, and collect them (in IMDb row order) into `cmu`.
# A key that occurs several times in cmu_backup contributes all of its rows, so
# `cmu` can end up with more rows than `imdb`.

matched_frames = []

for _, row in imdb.iterrows():
    # Evaluate the three-way key match once per IMDb row
    same_movie = (
        (cmu_backup["movie_name"] == row["movie_name"])
        & (cmu_backup["year"] == row["year"])
        & (cmu_backup["genre"] == row["genre"])
    )
    if same_movie.any():
        matched_frames.append(cmu_backup[same_movie])

# Concatenate once at the end: growing a frame with pd.concat inside the loop
# copies all previously collected rows on every iteration.
if matched_frames:
    cmu = pd.concat(matched_frames)
else:
    # No match at all: keep the expected (empty) schema
    cmu = pd.DataFrame(columns=["movie_name", "year", "genre", "plot"])

cmu.to_csv("cmu_data_final.csv", index=False)

In [34]:
# Compare row counts to spot duplicate matches: a cmu row count above imdb's
# means at least one IMDb row matched several cmu_backup rows
imdb.shape, cmu.shape

((5722, 4), (5724, 4))

In [36]:
""" cmu = pd.DataFrame(columns=["movie_name", "year", "genre", "plot"])

# for every movie in the imdb dataset, find the movie with the same name, year and genre in cmu_backup
# add it from cmu_backup to cmu

from fuzzywuzzy import fuzz

# Define a function to match movie title and return the best match
def match_movie(title, list_movies, min_score=0):
    # Get a list of matches
    matches = [(movie, fuzz.ratio(title, movie)) for movie in list_movies]

    # Only get matches with a score greater than min_score
    matches = [match for match in matches if match[1] >= min_score]

    # Sort the matches by score in descending order
    matches = sorted(matches, key=lambda x: x[1], reverse=True)

    # If there is at least one match, return the first one (highest score)
    if len(matches) > 0:
        return matches[0][0]
    else:
        return None

# Get a list of movie titles from cmu_backup
cmu_movies = cmu_backup['movie_name'].tolist()

for index, row in imdb.iterrows():
    # Use fuzzy matching to find the best match for movie_name
    matched_movie_name = match_movie(row['movie_name'], cmu_movies, min_score=70)
    
    if matched_movie_name is not None:
        movie = cmu_backup[(cmu_backup['movie_name'] == matched_movie_name) & (cmu_backup['year'] == row['year']) & (cmu_backup['genre'] == row['genre'])]
        if movie.empty:
            movie = cmu_backup[(cmu_backup['movie_name'] == matched_movie_name) & (cmu_backup['year'] - 1 == row['year']) & (cmu_backup['genre'] == row['genre'])]
            if movie.empty:
                print(f"No match for {row['movie_name']} ({row['year']}, {row['genre']})")
            else:
                cmu = pd.concat([cmu, movie])
        else:
            cmu = pd.concat([cmu, movie])

cmu.shape """

' cmu = pd.DataFrame(columns=["movie_name", "year", "genre", "plot"])\n\n# for every movie in the imdb dataset, find the movie with the same name, year and genre in cmu_backup\n# add it from cmu_backup to cmu\n\nfrom fuzzywuzzy import fuzz\n\n# Define a function to match movie title and return the best match\ndef match_movie(title, list_movies, min_score=0):\n    # Get a list of matches\n    matches = [(movie, fuzz.ratio(title, movie)) for movie in list_movies]\n\n    # Only get matches with a score greater than min_score\n    matches = [match for match in matches if match[1] >= min_score]\n\n    # Sort the matches by score in descending order\n    matches = sorted(matches, key=lambda x: x[1], reverse=True)\n\n    # If there is at least one match, return the first one (highest score)\n    if len(matches) > 0:\n        return matches[0][0]\n    else:\n        return None\n\n# Get a list of movie titles from cmu_backup\ncmu_movies = cmu_backup[\'movie_name\'].tolist()\n\nfor index, row i