In [1]:
import pandas as pd 
import numpy as np 
import ast
import re

# Clean CMU dataset and filter horror movies

In [2]:
# Column names for the CMU movie-metadata TSV, in file order (they are passed
# to read_csv via `names=`).
CMU_headers = [
    'Weekipedia_ID',
    'Freebase_ID',
    'Name',
    'Release_date',
    'Revenue',
    'Runtime',
    'Language',
    'Countries',
    'Genres',
]

# Tab-separated file; column names come from CMU_headers.
CMU = pd.read_csv(
    'MovieSummaries/movie.metadata.tsv',
    sep='\t',
    names=CMU_headers,
)

In [3]:
# Take the first run of four digits found in the release date and store it as
# a number; dates without such a run become NaN.
CMU['Release_year'] = pd.to_numeric(
    CMU['Release_date'].astype(str).str.extract(r'(\d{4})', expand=False)
)

In [4]:
def extract_words(column):
    """Return the dict values encoded in each cell of ``column`` as a list.

    Every cell is expected to be a string holding a Python dict literal; it is
    parsed with ``ast.literal_eval`` and the dict's values are returned in
    their original order.

    Parameters
    ----------
    column : pandas.Series
        Series of dict-literal strings.

    Returns
    -------
    pandas.Series
        Series with the same index whose cells are lists of the dict values.
    """
    def _values_of(cell):
        parsed = ast.literal_eval(cell)
        return list(parsed.values())

    return column.apply(_values_of)

# Raw columns whose cells are string-encoded dicts.
cleaning_columns = ['Language', 'Genres', 'Countries']

# Store the parsed value lists next to each raw column as "<name>_clean".
for column_name in cleaning_columns:
    CMU[f"{column_name}_clean"] = extract_words(CMU[column_name])

def clean_language(language_string):
    """Return ``language_string`` with every " Language" occurrence removed.

    For example "English Language" becomes "English"; strings that do not
    contain " Language" are returned unchanged.
    """
    return language_string.replace(" Language", "")

# Apply clean_language to every label inside each row's list of languages.
CMU['Language_clean'] = CMU['Language_clean'].apply(
    lambda lang_list: [clean_language(lang) for lang in lang_list]
)

# Boolean mask: True for rows whose parsed genre list contains 'Horror'.
# No filtered frame is materialised here: the name `horror_df` is bound to the
# contents of horror_movies.csv further down, and the CMU horror subset is
# built as `CMU_horror_df` once the raw columns have been dropped.
isHorrorMovie = CMU['Genres_clean'].apply(lambda genres: 'Horror' in genres)

In [5]:
# Remove the raw columns now that their cleaned counterparts
# (Release_year, Language_clean, Countries_clean, Genres_clean) exist.
CMU = CMU.drop(columns=['Release_date', 'Language', 'Countries', 'Genres'])

In [6]:
# Keep only the CMU rows whose genre list includes 'Horror'.
CMU_horror_df = CMU.loc[
    CMU['Genres_clean'].apply(lambda genres: 'Horror' in genres)
]

CMU_horror_df.shape

(5280, 9)

# Clean additional horror movies dataset

In [7]:
# Load the additional horror-movies dataset from CSV.
horror_df = pd.read_csv('horror_movies.csv')

In [8]:
# Take the first run of four digits found in the release date and store it as
# a number, mirroring the Release_year column built for the CMU data.
horror_df['Release_year'] = pd.to_numeric(
    horror_df['release_date'].astype(str).str.extract(r'(\d{4})', expand=False)
)

# Discard the columns that are not used further on (the raw release date is
# replaced by Release_year).
horror_df = horror_df.drop(
    columns=[
        'original_title',
        'poster_path',
        'status',
        'adult',
        'backdrop_path',
        'collection',
        'release_date',
    ]
)

horror_df.shape

(32540, 14)

In [9]:
# Turn the comma-separated genre string into a list of genre names.
# The separator pattern also consumes whitespace around each comma, so a value
# such as "Horror, Thriller" yields ['Horror', 'Thriller'] instead of
# ['Horror', ' Thriller']; values without such whitespace split exactly as a
# plain ',' split would. A multi-character pattern is treated as a regular
# expression by Series.str.split.
horror_df['genre_names'] = horror_df['genre_names'].str.strip().str.split(r'\s*,\s*')

# Merging the two datasets 

In [10]:
# Combine both sources, treating two rows as the same movie only when title
# AND release year agree. The outer join keeps movies that appear in just one
# of the datasets.
# (A name-only merge used to run first, but its result was overwritten right
# away by this one, so it has been removed.)
# NOTE(review): pandas pairs rows whose key is missing on both sides, so
# same-titled movies that both lack a Release_year are merged too — confirm
# this is acceptable.
Horror_movies = pd.merge(
    CMU_horror_df,
    horror_df,
    left_on=['Name', 'Release_year'],
    right_on=['title', 'Release_year'],
    how='outer'
)

In [11]:
# For each pair of overlapping columns prefer the CMU value and fall back to
# the value from the additional dataset, then drop the now-redundant sources.
Horror_movies = Horror_movies.assign(
    Name=Horror_movies['Name'].combine_first(Horror_movies['title']),
    Runtime=Horror_movies['Runtime'].combine_first(Horror_movies['runtime']),
    Revenue=Horror_movies['Revenue'].combine_first(Horror_movies['revenue']),
    Genres=Horror_movies['Genres_clean'].combine_first(Horror_movies['genre_names']),
).drop(columns=['title', 'runtime', 'revenue', 'Genres_clean', 'genre_names'])

# Strip any leading run of '!' or '#' characters from the titles.
Horror_movies['Name'] = Horror_movies['Name'].str.replace(r'^[!#]+', '', regex=True)


In [12]:
# New identifier for the merged table: the current index shifted to start at 1.
Horror_movies['ID'] = Horror_movies.index + 1
# The 'id' column that came from horror_movies.csv is superseded by 'ID'.
Horror_movies = Horror_movies.drop(columns='id')

# Creating a text file with all summaries

In [13]:
# Tab-separated file without a header row: first field is the movie ID,
# second field the plot summary.
CMU_plot_summaries = pd.read_csv(
    'HorrorMovieSummaries.txt',
    sep='\t',
    header=None,
    names=['Weekipedia_ID', 'summary'],
)

In [14]:
# Work on a copy so the columns added for the text exports do not end up in
# Horror_movies itself.
merged_df = Horror_movies.copy()

In [15]:
# Bring the join key to the same nullable-integer dtype on both sides.
# In merged_df the ID column is float whenever some rows lack a CMU match
# (NaN after the outer merge), so casting it straight to str yields keys such
# as '975900.0' that never equal the '975900' produced from integer IDs in the
# summaries file — the merge would then attach no summary at all.
CMU_plot_summaries['Weekipedia_ID'] = pd.to_numeric(
    CMU_plot_summaries['Weekipedia_ID'], errors='coerce'
).astype('Int64')
merged_df['Weekipedia_ID'] = pd.to_numeric(
    merged_df['Weekipedia_ID'], errors='coerce'
).astype('Int64')

# Left-merge on 'Weekipedia_ID' to attach the CMU plot summaries to merged_df.
# Summaries without a usable ID are left out so that missing keys on the two
# sides cannot be paired with each other.
merged_df = pd.merge(
    merged_df,
    CMU_plot_summaries.dropna(subset=['Weekipedia_ID']),
    on='Weekipedia_ID',
    how='left',
)

In [16]:
# Prefer the 'overview' text; where it is missing, fall back to the 'summary'
# column attached by the plot-summaries merge.
merged_df['Summary'] = merged_df['overview'].combine_first(merged_df['summary'])

In [17]:
# Define the output file path
output_file = 'Summaries.txt'

# Write one "<ID>\t<summary>" line per movie.
# The encoding is fixed to UTF-8 so the file does not depend on the platform's
# default encoding (which may be unable to represent non-ASCII text).
# NOTE: rows without any summary are still written, with the text "nan".
with open(output_file, 'w', encoding='utf-8') as f:
    # Iterate over just the two needed columns instead of building a Series
    # per row with iterrows().
    for movie_id, summary in zip(merged_df['ID'], merged_df['Summary']):
        f.write(f"{movie_id}\t{summary}\n")

In [18]:
# Define the output file path
output_file = 'Taglines.txt'

# Write one "<ID>\t<tagline>" line for every movie that has a tagline.
# The encoding is fixed to UTF-8 so the file does not depend on the platform's
# default encoding (which may be unable to represent non-ASCII text).
with open(output_file, 'w', encoding='utf-8') as f:
    # Iterate over just the two needed columns instead of building a Series
    # per row with iterrows().
    for movie_id, tagline in zip(merged_df['ID'], merged_df['tagline']):
        # Skip movies whose tagline is missing (NaN)
        if pd.notna(tagline):
            f.write(f"{movie_id}\t{tagline}\n")

In [19]:
# The free-text columns were exported to Summaries.txt / Taglines.txt above,
# so they are removed from the tabular data.
Horror_movies = Horror_movies.drop(columns=['overview', 'tagline'])

# Save the cleaned data

In [20]:
# Persist the cleaned table. With pandas' default index=True the row index is
# written as an extra, unnamed first column of the CSV.
Horror_movies.to_csv('Horror_Movies_Clean.csv')

# Metadata processing 

In [21]:
# Column names for the character-metadata TSV, in file order (they are passed
# to read_csv via `names=`).
character_columns = [
    'Wikipedia movie ID',
    'Freebase Movie ID',
    'Movie release date',
    'Character Name',
    'Actor DOB',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor Name',
    'Actor age at movie release',
    'Freebase character map1',
    'Freebase character map2',
    'Freebase character map3',
]
character_metadata = pd.read_csv(
    'MovieSummaries/character.metadata.tsv',
    sep='\t',
    names=character_columns,
)

# Left join: every movie is kept, with one row per matching character record
# (movies without a match get NaN in the character columns).
actors_and_movies = Horror_movies.merge(
    character_metadata,
    how='left',
    left_on='Weekipedia_ID',
    right_on='Wikipedia movie ID',
)
# Strip any leading run of '!' or '#' characters from the titles.
actors_and_movies['Name'] = actors_and_movies['Name'].str.replace(r'^[!#]+', '', regex=True)

In [22]:
actors_and_movies.head()

Unnamed: 0,Weekipedia_ID,Freebase_ID,Name,Revenue,Runtime,Release_year,Language_clean,Countries_clean,original_language,popularity,...,Character Name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor Name,Actor age at movie release,Freebase character map1,Freebase character map2,Freebase character map3
0,,,1915House,0.0,55.0,2018.0,,,en,0.6,...,,,,,,,,,,
1,,,Alive,13416285.0,98.0,2020.0,,,ko,50.907,...,,,,,,,,,,
2,,,Blue_Whale,0.0,93.0,2021.0,,,ru,0.84,...,,,,,,,,,,
3,,,Captured,0.0,81.0,2017.0,,,en,4.197,...,,,,,,,,,,
4,,,EATPRETTY,0.0,4.0,2018.0,,,en,0.6,...,,,,,,,,,,


In [23]:
# Lookup table from the "/m/..." ethnicity codes found in the
# 'Actor ethnicity' column to human-readable labels.
# Each code is listed exactly once (a repeated key in a dict literal is
# silently overwritten by the later entry). Codes mapped to None have no label.
ethnicity_dict = {
    "/m/01kb9y": "Multiracial",
    "/m/05qb937": "Venezuelans",
    "/m/09v5bdn": "Puerto Ricans",
    "/m/02pfy17": "Syrian people",
    "/m/013xrm": "Germans",
    "/m/01n94b": "Slovaks",
    "/m/02w7gg": "English people",
    "/m/0x67": "African American",
    "/m/011bn6ys": None,
    "/m/0118b8ry": None,
    "/m/03bkbh": "Irish people",
    "/m/0318mh": "Finns",
    "/m/027hhf": "Arbëreshë people",
    "/m/04c28": "Kurds",
    "/m/0cx3p": "Berbers",
    "/m/032j30": "Native Hawaiians",
    "/m/0gcp7x": "Iranian Azerbaijanis",
    "/m/038723": "Greek American",
    "/m/0d2by": "Chinese American",
    "/m/09vc4s": "English American",
    "/m/0912ll": "Dominican American",
    "/m/07mqps": "Dutch-American",
    "/m/01qhm_": "German American",
    "/m/0dbxy": "Cherokee",
    "/m/013s41": "Bulgarians",
    "/m/01km_m": "Slovenes",
    "/m/02ctzb": "White people",
    "/m/033tf_": "Irish American",
    "/m/0222qb": "Italian people",
    "/m/0jt85pd": "Greeks",
    "/m/03w9xlf": "Filipino Italian",
    "/m/0j251_s": "Arabs in France",
    "/m/0bwhd5z": "Harari people",
    "/m/0k0t_dz": "Caucasian race",
    "/m/09743": "Pashtun",
    "/m/03lmx1": "Scottish people",
    "/m/0bpjh3": "Bengalis",
    "/m/0j63_pr": "French Canadian American",
    "/m/0jt8h6f": "Latin Americans",
    "/m/02gx2x": "Javanese people",
    "/m/048z7l": "Jewish American",
    "/m/03ts0c": "French people",
    "/m/013s3n": "Czechs",
    "/m/0268_k": "Danes",
    "/m/059_w": "Native Americans in the United States",
    "/m/09kr66": "Russian American",
    "/m/0f3v0": "Comanche",
    "/m/0j3c70b": "Jamaicans",
    "/m/0dryh9k": "Indian people",
    "/m/0134vqyy": "Scottish American",
}

# Translate the 'Actor ethnicity' codes into readable labels via
# ethnicity_dict; codes that are not keys of the dict (and missing values)
# become NaN.
actors_and_movies['Ethnicity'] = actors_and_movies['Actor ethnicity'].map(ethnicity_dict)