In [None]:
import pandas as pd
import json
import re


# Seed for reproducible shuffling (passed as random_state to DataFrame.sample below)
RANDOM_STATE = 1212

# Preparing the CMU corpus.

In [None]:
# Load the tab-separated movie metadata; it is read without a header row,
# so pandas numbers the columns until they are named in the next step.
cmu_corpus = pd.read_csv(
    "movie.metadata.tsv",
    header=None,
    sep="\t",
)
cmu_corpus.head()

In [None]:
# Name the columns we keep; the others get integer placeholder labels
# so that they can be dropped right afterwards.
cmu_corpus.columns = ["movie_id", 1, "movie_name", "year", 4, 5, 6, 7, "genre"]

# Keep only the first four characters of the stringified 'year' column
cmu_corpus["year"] = cmu_corpus["year"].astype(str).str[:4]

# Discard the placeholder columns
cmu_corpus = cmu_corpus.drop(columns=[1, 4, 5, 6, 7])
cmu_corpus.head()

In [None]:
# Build a dataframe from plot_summaries.txt (tab-separated, read without a header row)
plot_summaries = pd.read_csv("plot_summaries.txt", sep="\t", header=None)
plot_summaries.columns = ["movie_id", "plot"]


def _strip_markup_and_urls(text):
    """Remove citation/reference markup, then every 'http...' run up to the next whitespace."""
    # NOTE(review): the `<ref.*\}` and `\{.*\/>` alternatives are greedy and may swallow
    # ordinary text between the first and last delimiter of a summary - confirm this is intended.
    without_markup = re.sub(r'\{.*?\}|<ref.*\}|\{.*\/>', '', text)
    return re.sub(r'http\S+', '', without_markup)


def _has_two_periods(text):
    """Sentence-count heuristic: True when the text contains at least two '.' characters."""
    return text.count(".") >= 2


# Clean every plot summary (citations and references first, URLs second)
plot_summaries['plot'] = plot_summaries['plot'].apply(_strip_markup_and_urls)

# Make sure 'movie_id' is an int in both dataframes so the merge keys line up
cmu_corpus['movie_id'] = cmu_corpus['movie_id'].astype(int)
plot_summaries['movie_id'] = plot_summaries['movie_id'].astype(int)

# Remove all plot summaries with fewer than 2 sentences (approximated by counting periods)
keep_summary = plot_summaries["plot"].apply(_has_two_periods)
plot_summaries = plot_summaries[keep_summary]

# Join metadata and summaries on 'movie_id' (the default inner join keeps ids present in both)
cmu_corpus = cmu_corpus.merge(plot_summaries, on="movie_id")
cmu_corpus.head()

In [None]:
# Parse the genre column: each raw cell is a JSON object, of which only the values are kept
genres = [list(json.loads(raw_genre).values()) for raw_genre in cmu_corpus['genre']]

# Swap the raw column for the parsed one; the parsed column ends up as the last column
cmu_corpus = (
    cmu_corpus
    .assign(genre_new=genres)
    .drop(columns=["genre"])
    .rename(columns={"genre_new": "genre"})
)

cmu_corpus.head()

In [None]:
# Keep only movies with at least one genre and a known year.
# A missing year shows up as the string "nan" because the column was converted with astype(str).
has_genre = cmu_corpus['genre'].str.len() != 0
has_year = cmu_corpus['year'] != "nan"
cmu_movies = cmu_corpus.loc[has_genre & has_year]

# Compare the size after and before filtering
cmu_movies.shape, cmu_corpus.shape


# Putting together the CMU dataset.

##### To create a testing dataset to work with, we take 1590 movies from each genre (thriller, horror, comedy and drama). Movies that belong to more than one of these genres are excluded. Before sampling, we shuffle the dataset to limit any potential bias.

##### All movies fitting the criteria are put in a backup dataset. Movies from the backup will be used for OMDb API calls to fill that dataset and, later, to fill a matching CMU dataset.

In [None]:
# The four target genres; a movie is kept only if it carries exactly one of them
TARGET_GENRES = ("Comedy", "Horror", "Thriller", "Drama")


def select_exclusive_genre(movies, genre, target_genres=TARGET_GENRES):
    """Return the rows of `movies` tagged with `genre` and none of the other target genres.

    Parameters
    ----------
    movies : pd.DataFrame
        Frame whose 'genre' column holds a list of genre names per row.
    genre : str
        Genre to select; it must equal one of the list entries exactly.
    target_genres : sequence of str
        The mutually exclusive genres; rows containing any of the others are left out.

    Returns
    -------
    pd.DataFrame
        A new frame whose 'genre' column holds the single label `genre` for every row.
    """
    competing = [other for other in target_genres if other != genre]
    is_exclusive = movies["genre"].apply(
        lambda tags: genre in tags and not any(other in tags for other in competing)
    )
    # assign() builds a new frame; writing the label through .loc on the filtered
    # frame instead would trigger pandas' SettingWithCopyWarning.
    return movies.loc[is_exclusive].assign(genre=genre)


# Shuffle reproducibly before splitting by genre
cmu_movies = cmu_movies.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

comedy_movies = select_exclusive_genre(cmu_movies, "Comedy")
horror_movies = select_exclusive_genre(cmu_movies, "Horror")
thriller_movies = select_exclusive_genre(cmu_movies, "Thriller")
drama_movies = select_exclusive_genre(cmu_movies, "Drama")

# Stack the four single-genre frames with a fresh 0..n-1 index
cmu_backup = pd.concat(
    [comedy_movies, horror_movies, thriller_movies, drama_movies],
    ignore_index=True,
)


# Used to create a test dataset

# cmu_dataset = pd.concat([comedy_movies.head(1590), horror_movies.head(1590), thriller_movies.head(1590), drama_movies.head(1590)])
# cmu_dataset.reset_index(drop=True, inplace=True)
# cmu_dataset.to_csv("cmu_dataset_v3.csv", index=False)


# Persist the backup so that it can be reloaded later without re-running the steps above
cmu_backup.to_csv("cmu_backup.csv", index=False)

cmu_backup.shape, comedy_movies.shape, horror_movies.shape, thriller_movies.shape, drama_movies.shape

# ONLY RUN THIS AFTER COMPLETING THE IMDb DATASET - THIS WILL TRY TO MATCH THE MOVIES FROM IMDb

In [None]:
# Load the IMDb dataset and the CMU backup file
imdb = pd.read_csv("imdb_data_final.csv")

# Matching columns with the imdb dataset: select the four shared columns (which also
# leaves out 'movie_id') and cast 'year' to int in the same chain, so that no column
# is assigned on a frame obtained by slicing another one.
cmu_backup = (
    pd.read_csv("cmu_backup.csv")
    .loc[:, ["movie_name", "year", "genre", "plot"]]
    .astype({"year": int})
)

In [None]:
# Show the (rows, columns) shape of both frames side by side
imdb.shape, cmu_backup.shape

In [None]:
# Preview the first rows of the imdb frame
imdb.head()

In [None]:
# Preview the first rows of the cmu_backup frame
cmu_backup.head()

In [None]:
# For every movie in the imdb dataset, collect the rows of cmu_backup that have the
# same movie_name, year and genre. This is done with one merge instead of growing a
# dataframe with pd.concat inside a row-by-row loop.
MATCH_KEYS = ["movie_name", "year", "genre"]

# Tag each backup row with its position, so that the matched rows can be taken from
# cmu_backup itself and keep its values, dtypes and index labels.
backup_keys = cmu_backup[MATCH_KEYS].assign(backup_pos=range(len(cmu_backup)))

# Rows with a missing key are left out: a missing value never satisfies an elementwise
# `==` comparison, whereas a merge would pair missing keys with each other.
imdb_keys = imdb[MATCH_KEYS].dropna()

# The inner merge keeps the order of the imdb rows; a backup row is repeated once for
# every imdb row it matches.
matches = imdb_keys.merge(backup_keys, on=MATCH_KEYS, how="inner")

cmu = cmu_backup.iloc[matches["backup_pos"].to_numpy(dtype=int)][
    ["movie_name", "year", "genre", "plot"]
]

In [None]:
# Write the matched CMU rows to CSV, leaving the dataframe index out of the file
cmu.to_csv("cmu_data_final_Vx.csv", index=False)