In [2]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# API credentials are read from the environment so that they are never stored
# in the notebook itself. Export API_KEY1 / API_KEY2 before starting the kernel;
# each value is None when its variable is not set.
api_key1 = os.environ.get("API_KEY1")
api_key2 = os.environ.get("API_KEY2")

# Seed for random operations (e.g. shuffling) so that results are reproducible.
RANDOM_STATE = 1212

# Preparing the CMU corpus.

In [3]:
# Load the movie metadata: a tab-separated file without a header row,
# so the columns are numbered 0..N-1 until they are named.

cmu_corpus = pd.read_table("movie.metadata.tsv", header=None)
cmu_corpus.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [4]:
# Name the columns that are kept; the columns left with positional labels
# (1, 4, 5, 6, 7) are dropped below.

cmu_corpus.columns = ["movie_id", 1, "movie_name", "year", 4, 5, 6, 7, "genre"]

# Keep only the first four characters of the release date (the year).
# Missing release dates stay missing (NaN) instead of being converted to the
# literal string "nan" by astype(str), so isna()/dropna() still detect them.
cmu_corpus["year"] = (
    cmu_corpus["year"].astype(str).str[:4].where(cmu_corpus["year"].notna())
)

# Reassign rather than mutate in place.
cmu_corpus = cmu_corpus.drop(columns=[1, 4, 5, 6, 7])
cmu_corpus.head()

Unnamed: 0,movie_id,movie_name,year,genre
0,975900,Ghosts of Mars,2001,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,Brun bitter,1988,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,White Of The Eye,1987,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,A Woman in Flames,1983,"{""/m/07s9rl0"": ""Drama""}"


In [5]:
# Read the plot summaries (tab-separated, no header row) and name the two
# columns directly while loading.
plot_summaries = pd.read_csv(
    "plot_summaries.txt",
    sep="\t",
    header=None,
    names=["movie_id", "plot"],
)

# Make sure the join key is an integer on both sides before merging.
plot_summaries['movie_id'] = plot_summaries['movie_id'].astype(int)
cmu_corpus['movie_id'] = cmu_corpus['movie_id'].astype(int)

# Inner join on movie_id: only movies present in both frames are kept.
cmu_corpus = cmu_corpus.merge(plot_summaries, on="movie_id", how="inner")
cmu_corpus.head()

Unnamed: 0,movie_id,movie_name,year,genre,plot
0,975900,Ghosts of Mars,2001,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,1987,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,1983,"{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,2002,"{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,1997,"{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."


In [6]:
# Each 'genre' cell is a JSON object; keep only its values as a plain list
# (for example ["Thriller", "Horror"]) and discard the keys.

genres = [list(json.loads(raw_genre).values()) for raw_genre in cmu_corpus['genre']]

cmu_corpus['genre_new'] = genres

# The raw JSON column is no longer needed once the parsed list exists.
cmu_corpus.drop(columns=["genre"], inplace=True)
cmu_corpus.head()

Unnamed: 0,movie_id,movie_name,year,plot,genre_new
0,975900,Ghosts of Mars,2001,"Set in the second half of the 22nd century, th...","[Thriller, Science Fiction, Horror, Adventure,..."
1,9363483,White Of The Eye,1987,A series of murders of rich young women throug...,"[Thriller, Erotic thriller, Psychological thri..."
2,261236,A Woman in Flames,1983,"Eva, an upper class housewife, becomes frustra...",[Drama]
3,18998739,The Sorcerer's Apprentice,2002,"Every hundred years, the evil Morgana returns...","[Family Film, Fantasy, Adventure, World cinema]"
4,6631279,Little city,1997,"Adam, a San Francisco-based artist who works a...","[Romantic comedy, Ensemble Film, Comedy-drama,..."


In [7]:
# Keep only the movies whose genre list is not empty, then compare the row
# counts of the filtered and unfiltered frames.
has_genre = cmu_corpus['genre_new'].str.len().ne(0)
cmu_movies = cmu_corpus[has_genre]
cmu_movies.shape, cmu_corpus.shape


((41793, 5), (42204, 5))

# Putting together version 1 of the final dataset.

##### To create the dataset we will work with, we take 1500 movies from each genre (thriller, horror, comedy and drama). Movies tagged with more than one of these four genres are excluded. Before sampling, we shuffle the dataset to limit any potential ordering bias.


In [8]:
# Preview the first rows of cmu_movies
cmu_movies.head()

Unnamed: 0,movie_id,movie_name,year,plot,genre_new
0,975900,Ghosts of Mars,2001,"Set in the second half of the 22nd century, th...","[Thriller, Science Fiction, Horror, Adventure,..."
1,9363483,White Of The Eye,1987,A series of murders of rich young women throug...,"[Thriller, Erotic thriller, Psychological thri..."
2,261236,A Woman in Flames,1983,"Eva, an upper class housewife, becomes frustra...",[Drama]
3,18998739,The Sorcerer's Apprentice,2002,"Every hundred years, the evil Morgana returns...","[Family Film, Fantasy, Adventure, World cinema]"
4,6631279,Little city,1997,"Adam, a San Francisco-based artist who works a...","[Romantic comedy, Ensemble Film, Comedy-drama,..."


In [9]:
# Dimensions (rows, columns) of cmu_movies
cmu_movies.shape

(41793, 5)

In [15]:
# Shuffle the dataset so that taking the first rows per genre is a random draw
# (reproducible through RANDOM_STATE).

cmu_movies_shuffled = cmu_movies.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Genres to sample from and the number of movies taken per genre.
TARGET_GENRES = ['Comedy', 'Horror', 'Thriller', 'Drama']
MOVIES_PER_GENRE = 1500


def _only_genre_mask(genre_lists, genre, all_genres):
    """Return a boolean mask of rows tagged with `genre` and no other target genre.

    Parameters
    ----------
    genre_lists : pandas.Series
        Series whose values are lists of genre labels.
    genre : str
        Label that must be present in a row's list.
    all_genres : list of str
        The target genres; every label in it other than `genre` must be absent.

    Returns
    -------
    pandas.Series
        Boolean Series sharing the index of `genre_lists`.
    """
    excluded = [other for other in all_genres if other != genre]
    return genre_lists.apply(
        lambda tags: genre in tags and not any(other in tags for other in excluded)
    )


# Extracting up to MOVIES_PER_GENRE movies from each genre. Selected rows are
# removed from the shuffled frame, which therefore ends up holding the remainder.
genre_samples = {}
for genre in TARGET_GENRES:
    mask = _only_genre_mask(cmu_movies_shuffled['genre_new'], genre, TARGET_GENRES)
    genre_samples[genre] = cmu_movies_shuffled[mask].head(MOVIES_PER_GENRE)
    cmu_movies_shuffled = cmu_movies_shuffled.drop(genre_samples[genre].index)

# Keep the per-genre frames available under their original names.
comedy_movies = genre_samples['Comedy']
horror_movies = genre_samples['Horror']
thriller_movies = genre_samples['Thriller']
drama_movies = genre_samples['Drama']

cmu_dataset = pd.concat([comedy_movies, horror_movies, thriller_movies, drama_movies]).reset_index(drop=True)

# Persist the movies that were not selected.
cmu_movies_shuffled = cmu_movies_shuffled.reset_index(drop=True)
cmu_movies_shuffled.to_csv("cmu_movies_shuffled.csv", index=False)

cmu_movies_shuffled.shape, comedy_movies.shape, horror_movies.shape, thriller_movies.shape, drama_movies.shape, cmu_dataset.shape

((35793, 5), (1500, 5), (1500, 5), (1500, 5), (1500, 5), (6000, 5))

In [14]:
# Strip every genre label that is not one of the four target genres.

kept_genres = ['Thriller', 'Horror', 'Comedy', 'Drama']
cmu_dataset['genre_new'] = cmu_dataset['genre_new'].apply(
    lambda tags: [tag for tag in tags if tag in kept_genres]
)

# Rows that still carry at least one genre label after the filtering above.
movies3 = cmu_dataset[cmu_dataset['genre_new'].str.len().ne(0)]

# Save version 1 of the dataset (without the index column).
cmu_dataset.to_csv("cmu_dataset_v1.csv", index=False)

cmu_dataset.head()

Unnamed: 0,movie_id,movie_name,year,plot,genre_new
0,24551035,Helen's Babies,1924,Toodie and Budge are identified as the two bes...,[Comedy]
1,26508404,The Croods,2013,"Surviving in a volcanic world is tough enough,...",[Comedy]
2,682628,The Crimson Permanent Assurance,1983,The elderly British employees of the Permanent...,[Comedy]
3,8914341,Taking Five,2007,Devon Thompson and Gabby Davis are the ultim...,[Comedy]
4,12035515,Fellowship of the Dice,2005,The movie tracks three plot lines: interviews ...,[Comedy]
