In [11]:
import pandas as pd 
import numpy as np 
import ast
import re

# Clean CMU dataset and filter horror movies

In [8]:
# Column names applied to the CMU metadata TSV, in file order.
# The 'Weekipedia_ID' spelling is kept as-is: later cells join on this exact name.
CMU_headers = [
    'Weekipedia_ID',
    'Freebase_ID',
    'Name',
    'Release_date',
    'Revenue',
    'Runtime',
    'Language',
    'Countries',
    'Genres',
]
# Tab-separated file read with the names above supplied explicitly.
CMU = pd.read_csv(
    'MovieSummaries/movie.metadata.tsv',
    sep='\t',
    names=CMU_headers,
)

In [13]:
# Take the first run of four digits found in the release-date text and store it
# as a number; dates with no such run end up as NaN.
CMU['Release_year'] = pd.to_numeric(
    CMU['Release_date'].astype(str).str.extract(r'(\d{4})', expand=False)
)

In [14]:
def extract_words(column):
    """Convert a Series of dict-literal strings into a Series of value lists.

    Each cell is parsed with ``ast.literal_eval`` and the values of the
    resulting dict are returned as a list, in the dict's own order.

    Parameters
    ----------
    column : pandas.Series
        Cells are strings holding a Python dict literal (e.g. ``'{"k": "v"}'``).

    Returns
    -------
    pandas.Series
        Same index as ``column``; each cell is a list of the dict's values.
    """
    def values_of(cell):
        # Parse the literal safely (no eval) and keep only the values.
        return list(ast.literal_eval(cell).values())

    return column.apply(values_of)

# Columns whose dict-literal strings are expanded into "<name>_clean" list columns.
cleaning_columns = ['Language', 'Genres', 'Countries']

for column_name in cleaning_columns:
    CMU[f"{column_name}_clean"] = extract_words(CMU[column_name])

def clean_language(language_string):
    """Return ``language_string`` with every " Language" occurrence removed.

    The match is case-sensitive and includes the leading space, so
    "English Language" becomes "English" while "Standard Mandarin" is
    returned unchanged.
    """
    return language_string.replace(" Language", "")

# Apply clean_language to every entry of each movie's language list.
CMU['Language_clean'] = CMU['Language_clean'].apply(
    lambda lang_list: [clean_language(lang) for lang in lang_list]
)

# Boolean mask: True where the cleaned genre list contains the exact entry 'Horror'.
# No filtered frame is bound here. The CMU horror subset is built further down as
# CMU_horror_df (after the raw columns are dropped), and the name horror_df is
# used for the table read from horror_movies.csv, so binding it here as well
# would only be overwritten and make the name mean two different things.
isHorrorMovie = CMU['Genres_clean'].apply(lambda l: 'Horror' in l)

In [51]:
# The raw columns are no longer needed once their derived counterparts
# (Release_year and the *_clean columns) exist.
CMU = CMU.drop(columns=['Release_date', 'Language', 'Countries', 'Genres'])

In [52]:
# Keep only the CMU movies whose cleaned genre list contains the exact entry 'Horror'.
CMU_horror_df = CMU.loc[
    CMU['Genres_clean'].map(lambda genres: 'Horror' in genres)
]

CMU_horror_df.shape

(5280, 9)

# Clean additional horror movies dataset

In [34]:
# Load the additional horror-movie table from a CSV in the working directory.
# Later cells read its title, release_date, runtime, revenue, genre_names,
# overview, tagline and id columns.
horror_df = pd.read_csv('horror_movies.csv')

In [53]:
# Same year extraction as for the CMU table: first run of four digits in the
# release-date text, stored as a number (NaN when absent).
horror_df['Release_year'] = pd.to_numeric(
    horror_df['release_date'].astype(str).str.extract(r'(\d{4})', expand=False)
)
# Remove the columns that are not carried into the merged table.
horror_df = horror_df.drop(
    columns=['original_title', 'poster_path', 'status', 'adult', 'backdrop_path', 'collection', 'release_date']
)

horror_df.shape

(32540, 14)

In [62]:
# Turn the comma-separated genre text into a list of genre names.
# Whitespace around the commas is consumed by the separator pattern, so a value
# such as "Action, Horror" yields ['Action', 'Horror'] rather than
# ['Action', ' Horror']; entries then compare equal to the CMU genre strings.
# NOTE: run this cell once per load of horror_df; .str.split on the resulting
# lists would replace them with NaN.
horror_df['genre_names'] = (
    horror_df['genre_names']
    .str.strip()
    .str.split(r'\s*,\s*', regex=True)
)

# Merging the two datasets 

In [67]:
# Outer join of the two horror tables on the movie title, so movies present in
# only one source are kept. The join key is the title text alone: rows that
# share a title on both sides are paired with each other.
Horror_movies = CMU_horror_df.merge(
    horror_df,
    how='outer',
    left_on='Name',
    right_on='title',
)

In [68]:
# For each shared attribute prefer the CMU value and fall back to the value from
# the additional horror table when the CMU one is missing, then discard the
# source columns that have been folded in.
Horror_movies = Horror_movies.assign(
    Name=lambda df: df['Name'].combine_first(df['title']),
    Release_year=lambda df: df['Release_year_x'].combine_first(df['Release_year_y']),
    Runtime=lambda df: df['Runtime'].combine_first(df['runtime']),
    Revenue=lambda df: df['Revenue'].combine_first(df['revenue']),
    Genres=lambda df: df['Genres_clean'].combine_first(df['genre_names']),
).drop(
    columns=['title', 'Release_year_x', 'Release_year_y', 'runtime', 'revenue', 'Genres_clean', 'genre_names']
)

In [71]:
# Give every merged movie a new 1-based identifier derived from the row index,
# and drop the 'id' column that came from the additional horror table.
Horror_movies = (
    Horror_movies
    .assign(ID=Horror_movies.index + 1)
    .drop(columns='id')
)

# Creating text files with all summaries and taglines

In [74]:
# Load the plot summaries: a tab-separated file read without a header row,
# with the two columns named 'Weekipedia_ID' (same spelling as the metadata
# table, used as the join key below) and 'summary'.
CMU_plot_summaries = pd.read_csv('HorrorMovieSummaries.txt', sep='\t', header=None, names=['Weekipedia_ID', 'summary'])

In [75]:
# Work on a copy so the summary join and the text-file export below do not
# modify Horror_movies itself.
merged_df = Horror_movies.copy()

In [76]:
# Ensure that both 'Weekipedia_ID' columns are of the same type.
# After the outer merge the movie table holds NaN for movies that have no CMU
# entry, so its ID column is float-typed; casting that to str produces keys
# like '975900.0' that never equal the '975900' keys of the summaries table,
# and the join below would attach no summaries at all. Nullable integers keep
# the IDs comparable on both sides and leave missing IDs missing.
CMU_plot_summaries['Weekipedia_ID'] = pd.to_numeric(
    CMU_plot_summaries['Weekipedia_ID'], errors='coerce'
).astype('Int64')
merged_df['Weekipedia_ID'] = pd.to_numeric(
    merged_df['Weekipedia_ID'], errors='coerce'
).astype('Int64')

# Merge on 'Weekipedia_ID' to get the summaries in merged_df. Summary rows
# without a usable ID are left out so they cannot pair with the movies whose
# ID is missing.
merged_df = pd.merge(
    merged_df,
    CMU_plot_summaries.dropna(subset=['Weekipedia_ID']),
    on='Weekipedia_ID',
    how='left',
)

In [79]:
# Single text column per movie: the 'overview' text is used when present,
# otherwise the CMU plot 'summary' fills the gap.
merged_df['Summary'] = merged_df['overview'].combine_first(merged_df['summary'])

Unnamed: 0,Weekipedia_ID,Freebase_ID,Name,Revenue,Runtime,Language_clean,Countries_clean,original_language,overview,tagline,popularity,vote_count,vote_average,budget,collection_name,Release_year,Genres,ID,summary,Summary
0,,,#1915House,0.0,55.0,,,en,A century of secrets are hidden behind the fre...,,0.6,0.0,0.0,700.0,,2018.0,[Horror],1,,A century of secrets are hidden behind the fre...
1,,,#Alive,13416285.0,98.0,,,ko,"As a grisly virus rampages a city, a lone man ...",You must survive.,50.907,1365.0,7.3,6300000.0,,2020.0,"[Action, Horror, Thriller]",2,,"As a grisly virus rampages a city, a lone man ..."
2,,,#Blue_Whale,0.0,93.0,,,ru,"The story of a girl who, in an attempt to unde...",,0.84,0.0,0.0,0.0,,2021.0,"[Drama, Horror, Thriller]",3,,"The story of a girl who, in an attempt to unde..."
3,,,#Captured,0.0,81.0,,,en,A zealous vigilante looking to clean the Inter...,Cleansing the internet of all sin,4.197,13.0,3.2,0.0,,2017.0,"[Horror, Thriller]",4,,A zealous vigilante looking to clean the Inter...
4,,,#EATPRETTY,0.0,4.0,,,en,"Anna is a successful product photographer, str...",If at first you don't succeed...,0.6,1.0,2.0,0.0,,2018.0,"[Horror, Mystery, Romance]",5,,"Anna is a successful product photographer, str..."


In [80]:
# Define the output file path
output_file = 'Summaries.txt'

# Write summaries to the output file
with open(output_file, 'w') as f:
    for _, row in merged_df.iterrows():
        # Write each line in the format: new ID followed by the summary
        f.write(f"{row['ID']}\t{row['Summary']}\n")

In [81]:
# Define the output file path
output_file = 'Taglines.txt'

# Write taglines to the output file, checking for non-NaN values
with open(output_file, 'w') as f:
    for _, row in merged_df.iterrows():
        # Check if 'tagline' is not NaN
        if pd.notna(row['tagline']):
            # Write each line in the format: new ID followed by the tagline
            f.write(f"{row['ID']}\t{row['tagline']}\n")

In [83]:
# The free-text columns were exported to Summaries.txt / Taglines.txt above,
# so they are left out of the tabular data.
Horror_movies = Horror_movies.drop(columns=['overview', 'tagline'])

# Save the cleaned data

In [84]:
# Persist the cleaned, merged table as CSV in the working directory.
# NOTE(review): the row index is written as an extra unnamed first column; the
# 'ID' column already holds index + 1, so index=False may be preferable —
# confirm that no downstream reader relies on that first column.
# NOTE(review): list-valued columns (e.g. Genres, Language_clean) are stored as
# their text representation in a CSV and need parsing when read back.
Horror_movies.to_csv('Horror_Movies_Clean.csv')