# Preprocessing the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import string
import nltk

from nltk.corpus import stopwords


# nltk.download('stopwords')

## Plot summaries

We want to process the movie summaries to analyze them.

In [2]:
# Load dataset metadata
data_folder = '../../data/MovieSummaries/'
df_plot = pd.read_csv(data_folder + 'plot_summaries.txt', sep="\t", header=None)
df_plot.columns = ["Wikipedia_movie_ID", "Plot_summary"]

df_plot[['Wikipedia_movie_ID', 'Plot_summary']]

Unnamed: 0,Wikipedia_movie_ID,Plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


We are preprocessing the data of the movie summaries using the following steps:
1. Remove all punctuations
2. Put all characters of the text in lower case
3. Remove stop words

In [3]:
# Remove punctuation and transform all char in lower case
df_plot["Plot_summary"] = df_plot["Plot_summary"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
df_plot[['Plot_summary']]

Unnamed: 0,Plot_summary
0,Shlykov a hardworking taxi driver and Lyosha a...
1,The nation of Panem consists of a wealthy Capi...
2,Poovalli Induchoodan is sentenced for six yea...
3,The Lemon Drop Kid a New York City swindler i...
4,Seventhday Adventist Church pastor Michael Cha...
...,...
42298,The story is about Reema a young Muslim schoo...
42299,In 1928 Hollywood director Leo Andreyev looks...
42300,American Luthier focuses on Randy Parsons’ tra...
42301,Abdur Rehman Khan a middleaged dry fruit sell...


In [4]:
# Remove stop words
# Use the nltk package to load english stop words
stop_words = set(stopwords.words('english'))

df_plot["Plot_summary"] = df_plot["Plot_summary"].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df_plot[['Plot_summary']]

Unnamed: 0,Plot_summary
0,Shlykov hardworking taxi driver Lyosha saxopho...
1,nation Panem consists wealthy Capitol twelve p...
2,Poovalli Induchoodan sentenced six years priso...
3,Lemon Drop Kid New York City swindler illegall...
4,Seventhday Adventist Church pastor Michael Cha...
...,...
42298,story Reema young Muslim schoolgirl Malabar lo...
42299,1928 Hollywood director Leo Andreyev looks pho...
42300,American Luthier focuses Randy Parsons’ transf...
42301,Abdur Rehman Khan middleaged dry fruit seller ...


In [5]:
# Save the cleaned data
df_plot.to_csv(data_folder + 'plot_summaries_cleaned.csv', index=False)

## Metadata

In [6]:
# Load metadata file
metadata_columns = [
    'Wikipedia_movie_ID', 
    'Freebase_movie_ID', 
    'Movie_name', 
    'Movie_release_date', 
    'Movie_box_office_revenue', 
    'Movie_runtime', 
    'Movie_languages', 
    'Movie_countries', 
    'Movie_genres'
]

df_metadata = pd.read_csv(data_folder + 'movie.metadata.tsv', delimiter='\t', names=metadata_columns)

In [7]:
# Format movie languages, countries and genres
def remove_id(original_str):
    if pd.isna(original_str) or original_str == '{}':
        return [] # if NaN, return empty list
    
    original_elems = original_str.strip('{}').split(',')
    new_elems = [elem.split(':', 1)[1].strip().strip('"') for elem in original_elems if ':' in elem]
    return new_elems

In [8]:
# Apply the remove_id for all columns that have one
df_metadata['Movie_languages'] = df_metadata['Movie_languages'].apply(remove_id)
df_metadata['Movie_countries'] = df_metadata['Movie_countries'].apply(remove_id)
df_metadata['Movie_genres'] = df_metadata['Movie_genres'].apply(remove_id)

In [9]:
# Keep only the years in the movie release date
def extract_year(date):
    # Check if the date is NaN or 'Unknown'
    if date == 'Unknown' or pd.isna(date):
        return None
    # If it's only the year, return it as an integer
    elif len(str(date)) == 4:
        return int(date)
    else:
        # Otherwise, try to parse it as a date and get the year
        try:
            return int(pd.to_datetime(date).year)
        except:
            return None

df_metadata['Movie_release_date'] = df_metadata['Movie_release_date'].apply(extract_year)

In [10]:
df_metadata

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988.0,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987.0,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983.0,,106.0,[German Language],[Germany],[Drama]
...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011.0,,120.0,[English Language],[United States of America],[Drama]
81737,34980460,/m/0g4pl34,Knuckle,2011.0,,96.0,[English Language],"[Ireland, United Kingdom]","[Biographical film, Drama, Documentary]"
81738,9971909,/m/02pygw1,Another Nice Mess,1972.0,,66.0,[English Language],[United States of America],"[Satire, Comedy]"
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992.0,,150.0,[Japanese Language],[Japan],"[Science Fiction, Japanese Movies, Adventure, ..."


### Countries and continents

In [11]:
# All countries in the dataset
all_countries = set(country for countries in df_metadata['Movie_countries'].dropna() for country in countries)
all_countries = sorted(all_countries)

print("List of all unique countries:")
print(all_countries)

List of all unique countries:
['Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Belgium', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burma', 'Cambodia', 'Cameroon', 'Canada', 'Chile', 'China', 'Colombia', 'Congo', 'Costa Rica', 'Crime', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Czechoslovakia', 'Democratic Republic of the Congo', 'Denmark', 'Egypt', 'England', 'Estonia', 'Ethiopia', 'Federal Republic of Yugoslavia', 'Finland', 'France', 'Georgia', 'Georgian SSR', 'German Democratic Republic', 'German Language', 'Germany', 'Greece', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Iraqi Kurdistan', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Kingdom of Great Britain', 'Kingdom of Italy', 'Korea', 'Kuwait', 'Lebanon', 'Libya', 'Lithuania', 'Luxe

#### Some problems encountered with the countries
1. Typos: 'Ukrainian SSR' and 'Ukranian SSR'
2. German Language? Malayalam Language? Is it related to the country or is it only the language (mistake)?
3. South Korea and Korea?
These "problems" were solvable as we considered continents. For the German Language and Malayalam Language, we decided to not include them in any continent.

In [12]:
# Manually approximate continents with all countries
north_america = ['Aruba', 'Bahamas', 'Canada', 'Costa Rica', 'Cuba', 'Jamaica', 'Mexico', 
                 'Panama', 'Puerto Rico', 'United States of America']

south_america = ['Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Peru', 'Uruguay', 'Venezuela']

europe = ['Albania', 'Armenia', 'Austria', 'Azerbaijan', 'Belgium', 'Bosnia and Herzegovina','Bulgaria', 
          'Croatia', 'Cyprus', 'Czech Republic', 'Czechoslovakia', 'Denmark', 'England', 'Estonia', 
          'Federal Republic of Yugoslavia', 'Finland', 'France', 'Georgia', 'Georgian SSR', 
          'German Democratic Republic', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Isle of Man', 
          'Italy', 'Kingdom of Great Britain', 'Kingdom of Italy', 'Lithuania', 'Luxembourg', 'Malta', 
          'Monaco', 'Montenegro', 'Nazi Germany', 'Netherlands', 'Northern Ireland', 'Norway', 'Poland', 'Portugal', 
          'Republic of Macedonia', 'Romania', 'Russia', 'Scotland', 'Serbia', 'Serbia and Montenegro', 
          'Slovak Republic', 'Slovakia', 'Slovenia', 'Socialist Federal Republic of Yugoslavia', 
          'Soviet Union', 'Soviet occupation zone', 'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'Ukrainian SSR', 
          'United Kingdom', 'Wales', 'Weimar Republic', 
          'West Germany', 'Yugoslavia']

asia = ['Afghanistan', 'Armenia', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Bhutan', 'Burma', 'Cambodia', 'China', 
        'Georgia', 'Hong Kong', 'India', 'Indonesia', 'Iran', 'Iraq', 'Iraqi Kurdistan', 'Israel', 'Japan', 
        'Jordan', 'Kazakhstan', 'Korea', 'Kuwait', 'Lebanon', 'Macau', 'Malaysia', 'Mongolia', 'Nepal', 'Pakistan',
        'Palestinian Territories', 'Palestinian territories', 'Philippines', 'Qatar', 'Republic of China', 
        'Saudi Arabia', 'Singapore', 'South Korea', 'Sri Lanka', 'Syria', 'Taiwan', 'Thailand', 'Turkey', 
        'Turkmenistan', 'United Arab Emirates', 'Uzbek SSR', 'Uzbekistan', 'Vietnam']


africa = ['Algeria', 'Burkina Faso', 'Cameroon', 'Congo', 'Democratic Republic of the Congo', 'Egypt', 
          'Ethiopia', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Libya', 'Mali', 'Morocco', 'Nigeria', 'Senegal', 
          'South Africa', 'Sudan', 'Tunisia', 'Zambia', 'Zimbabwe']


middle_east = ['Bahrain', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kuwait', 'Lebanon', 'Mandatory Palestine', 
               'Qatar', 'Saudi Arabia', 'Syria', 'Turkey', 'United Arab Emirates', 'Yemen']

oceania = ['Australia', 'New Zealand']

In [13]:
# Analyze distribution genres wrt year (every 10 years) and countries
df_metadata['Movie_release_date'] = pd.to_numeric(df_metadata['Movie_release_date'], errors='coerce')
df_metadata['Decade'] = (df_metadata['Movie_release_date'].dropna() // 10 * 10).astype('Int64')

# Assign each movie to a continent
def get_continent(country):
    if country in north_america:
        return 'North America'
    elif country in south_america:
        return 'South America'
    elif country in europe:
        return 'Europe'
    elif country in asia:
        return 'Asia'
    elif country in africa:
        return 'Africa'
    elif country in middle_east:
        return 'Middle East'
    elif country in oceania:
        return 'Oceania'
    else:
        return 'Unknown'   # If country forgotten or invalid

df_metadata['Movie_continent'] = df_metadata['Movie_countries'].apply(lambda countries: get_continent(countries[0]) 
                                                                if isinstance(countries, list) and countries else 'Unknown')

In [14]:
# List all genres in the dataset
all_genres = set(genre for genres in df_metadata['Movie_genres'].dropna() for genre in genres)
all_genres = sorted(all_genres)

print("List of all unique genres:")
print(all_genres)

List of all unique genres:
['Absurdism', 'Acid western', 'Action', 'Action Comedy', 'Action Thrillers', 'Action/Adventure', 'Addiction Drama', 'Adult', 'Adventure', 'Adventure Comedy', 'Airplanes and airports', 'Albino bias', 'Alien Film', 'Alien invasion', 'Americana', 'Animal Picture', 'Animals', 'Animated Musical', 'Animated cartoon', 'Animation', 'Anime', 'Anthology', 'Anthropology', 'Anti-war', 'Anti-war film', 'Apocalyptic and post-apocalyptic fiction', 'Archaeology', 'Archives and records', 'Art film', 'Auto racing', 'Avant-garde', 'B-Western', 'B-movie', 'Backstage Musical', 'Baseball', 'Beach Film', 'Beach Party film', 'Bengali Cinema', 'Biker Film', 'Biographical film', 'Biography', 'Biopic [feature]', 'Black comedy', 'Black-and-white', 'Blaxploitation', 'Bloopers & Candid Camera', 'Bollywood', 'Boxing', 'Breakdance', 'British Empire Film', 'British New Wave', 'Bruceploitation', 'Buddy Picture', 'Buddy cop', 'Buddy film', 'Business', 'C-Movie', 'Camp', 'Caper story', 'Cavalry

### Issues with the genres
A lot of genres are highly redundant: 'Action', 'Action Comedy', 'Action Thrillers', 'Action/Adventure', 'Adventure', 'Adventure Comedy'.
To have a cleaner set of genres, we decided to merge them. Merging them was done manually, with 'Adventure Comedy' categorized into 'Adventure' and 'Comedy' for example.

The categorization is rudimental, but contains the most essential genres. In case of further analysis on genres, we will have to categorize them better.

In [15]:
# Merge genres that are the same 
# Hard-coding with genres that seem the most relevant
genre_groups = {
    'Action/Adventure': ['Action', 'Action Comedy', 'Action Thrillers', 'Action/Adventure', 
                         'Adventure', 'Adventure Comedy', 'Costume Adventure', 'Family-Oriented Adventure',
                         'Fantasy Adventure'],
    'Airplanes and airports': ['Airplanes and airports'],
    'Alien': ['Alien Film', 'Alien invasion'],
    'Animal': ['Animal Picture', 'Animals'],
    'Animation': ['Animated Musical', 'Clay animation', 'Computer Animation', 'Silhouette animation',
                  'Animated cartoon', 'Animation', 'Anime'],
    'Anti-war': ['Anti-war', 'Anti-war film'],
    'Beach': ['Beach Film', 'Beach Party film'],
    'Biography': ['Biographical film', 'Biography', 'Biopic [feature]'],
    'Bollywood': ['Indian Western', 'Bollywood'],
    'Buddy': ['Buddy Picture', 'Buddy cop', 'Buddy film', 'Female buddy film'],
    'Children': ["Children's", "Children's Entertainment", "Children's Fantasy", "Children's Issues", 
                 "Children's/Family"],
    'Comedy': ['Action Comedy', 'Adventure Comedy', 'Black comedy', 'Comdedy', 'Comedy', 'Comedy Thriller', 
               'Comedy Western', 'Comedy film', 'Comedy horror', 'Comedy of Errors', 'Comedy of manners', 
               'Comedy-drama', 'Courtroom Comedy', 'Crime Comedy', 'Domestic Comedy', 'Fantasy Comedy', 
               'Heavenly Comedy', 'Horror Comedy', 'Humour', 'Musical comedy', 'Romantic comedy', 'Tragicomedy', 
               'Sex comedy', 'Stand-up comedy', 'Screwball comedy', 'Workplace Comedy'],
    'Coming-of-age': ['Coming of age', 'Coming-of-age film'],
    'Costume': ['Costume Adventure', 'Costume Horror', 'Costume drama'],
    'Courtroom': ['Courtroom Comedy', 'Courtroom Drama'],
    'Crime': ['Crime', 'Crime Comedy', 'Crime Drama', 'Crime Fiction', 'Crime Thriller', 'Law & Crime',
              'Master Criminal Films'],
    'Detective': ['Detective', 'Detective fiction'],
    'Documentary': ['Docudrama', 'Documentary', 'Historical Documentaries', 'Political Documetary'],
    'Drama': ['Addiction Drama', 'Childhood Drama', 'Comedy-drama', 'Costume drama', 'Courtroom Drama', 
              'Crime Drama', 'Drama', 'Docudrama', 'Erotic Drama', 'Fantasy Drama', 'Family Drama', 
              'Inspirational Drama', 'Legal drama', 'Marriage Drama', 'Melodrama', 'Musical Drama', 
              'Political drama'],
    'Education': ['Education', 'Educational'],
    'Epic': ['Epic', 'Epic Western', 'Historical Epic'],
    'Erotic': ['Erotic Drama', 'Erotic thriller', 'Erotica'],
    'Fantasy': ['Fantasy', 'Fantasy Adventure', 'Fantasy Comedy', 'Fantasy Drama', 'Mythological Fantasy',
                'Romantic fantasy'],
    'Family': ['Family & Personal Relationships', 'Family Drama', 'Family Film', 'Family-Oriented Adventure'],
    'Fairy-tale': ['Fairy tale', 'Revisionist Fairy Tale'],
    'Fiction': ['Crime Fiction', 'Detective fiction', 'Historical fiction', 'Fictional film', 
                'Conspiracy fiction', 'Apocalyptic and post-apocalyptic fiction'],
    'Filipino': ['Filipino', 'Filipino Movies'],
    'Gay': ['Gay', 'Gay Interest', 'Gay pornography', 'Gay Themed'],
    'Gross-out': ['Gross out', 'Gross-out film'],
    'Historical': ['Film & Television History', 'Historical Documentaries', 'Historical drama', 
                   'Historical fiction', 'History', 'Historical Epic', 'World History'],
    'Horror': ['Haunted House Film', 'Horror', 'Horror Comedy', 'Natural horror films', 'Psychological horror',
               'Period Horror', 'Road-Horror'],
    'Law': ['Law & Crime', 'Legal drama'],
    'Media': ['Media Satire', 'Media Studies'],
    'Monster': ['Monster', 'Monster movie'],
    'Musical': ['Animated Musical', 'Backstage Musical', 'Music', 'Musical', 'Musical Drama', 'Musical comedy',
                'Instrumental Music', 'Jukebox musical', 'Singing cowboy'],
    'Nature': ['Natural disaster', 'Nature'],
    'Period': ['Period Horror', 'Period piece'],
    'Political': ['Political Documetary', 'Political cinema', 'Political drama', 'Political satire', 
                  'Political thriller'],
    'Pornography': ['Gay pornography', 'Hardcore pornography', 'Pornographic movie', 'Pornography'],
    'Prison': ['Prison', 'Prison escape', 'Prison film', 'Women in prison films'],
    'Psychology': ['Psycho-biddy', 'Psychological horror', 'Psychological thriller', 'Softcore Porn'],
    'Road': ['Road movie', 'Road-Horror', 'Roadshow theatrical release', 'Roadshow/Carny'],
    'Romance': ['Romance Film', 'Romantic comedy', 'Romantic drama', 'Romantic fantasy', 'Romantic thriller'],
    'Science Fiction': ['Sci Fi Pictures original films', 'Sci-Fi Adventure', 'Sci-Fi Horror', 'Sci-Fi Thriller', 
                        'Science Fiction', 'Science fiction Western'],
    'Sex': ['Sex comedy', 'Sexploitation'],
    'Social': ['Social issues', 'Social problem film'],
    'Space': ['Space opera', 'Space western'],
    'Sports': ['Extreme Sports', 'Sports'],
    'Spy': ['Glamorized Spy Film', 'Spy'],
    'Superhero': ['Superhero', 'Superhero movie'],
    'Sword': ['Sword and Sandal', 'Sword and sorcery', 'Sword and sorcery films'],
    'Television': ['Film & Television History', 'Television movie'],
    'Tragedy': ['Tragedy', 'Tragicomedy'],
    'Thriller': ['Action Thrillers', 'Comedy Thriller', 'Crime Thriller', 'Erotic thriller', 'Political thriller',
                 'Psychological thriller', 'Romantic thriller', 'Thriller'],
    'War': ['The Netherlands in World War II', 'Space western', 'Spaghetti Western', 'War effort', 'War film'],
    'Western': ['Acid western', 'B-Western', 'Comedy Western', 'Epic Western', 'Hybrid Western', 'Indian Western', 
                'Revisionist Western', 'Science fiction Western', 'Singing cowboy', 'Western'],
    'World': ['World History', 'World cinema'],
}

# Reverse the mapping for lookup
genre_to_group = {}
for group, genres in genre_groups.items():
    for genre in genres:
        genre_to_group[genre] = group

# Explode genres into separate rows
df_metadata_exploded = df_metadata.explode('Movie_genres')

# Identify genres that do not have a category in the genre_to_group mapping
unmapped_genres = df_metadata_exploded[~df_metadata_exploded['Movie_genres'].isin(genre_to_group)].dropna(subset=['Movie_genres'])

# Assign unmapped genres to their own group
if not unmapped_genres.empty:
    print("Genres with no category:")
    print(unmapped_genres['Movie_genres'].unique())
    
    # Add unmapped genres to the genre_to_group 
    for genre in unmapped_genres['Movie_genres'].unique():
        genre_to_group[genre] = genre  # Unassigned genre has its own group

df_metadata_exploded['Grouped_genres'] = df_metadata_exploded['Movie_genres'].map(genre_to_group)

# Group back by index and combine the lists into sets to avoid duplicates
df_metadata['Grouped_genres'] = df_metadata_exploded.groupby(df_metadata_exploded.index)['Grouped_genres'].agg(lambda x: list(set(x)))

df_metadata


Genres with no category:
['Supernatural' 'Mystery' 'Short Film' 'Silent film' 'Indie'
 'Black-and-white' 'Japanese Movies' 'Ensemble Film' 'Film adaptation'
 'Future noir' 'Suspense' 'Wuxia' 'Martial Arts Film' 'Chinese Movies'
 'Parody' 'Mockumentary' 'Fan film' 'Cult' 'Slapstick' 'Culture & Society'
 'Christmas movie' 'Malayalam Cinema' 'Blaxploitation' 'Satire'
 'Gangster Film' 'B-movie' 'Zombie Film' 'Slasher' 'Film noir'
 'New Hollywood' 'Experimental film' 'Surrealism' 'Avant-garde'
 'Creature Film' 'Gender Issues' 'LGBT' 'Americana' 'Concert film' 'Teen'
 'Gothic Film' 'Disaster' 'Art film' 'Escape Film' 'Dance' 'Combat Films'
 'C-Movie' 'Dystopia' 'Time travel' 'Doomsday film' 'Plague'
 'Propaganda film' 'Film' 'Medical fiction' 'Rockumentary' 'Feminist Film'
 'Film \\u00e0 clef' 'Pre-Code' 'Juvenile Delinquency Film'
 'Christian film' 'Movie serial' 'Linguistics' 'Language & Literature'
 'Remake' 'Dogme 95' 'Stop motion' 'Tollywood' 'Holiday Film' 'Giallo'
 'Anthology' 'Existe

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",2000,North America,"[Science Fiction, Horror, Thriller, Supernatur..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2000,North America,"[Mystery, Biography, Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988.0,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",1980,Europe,"[Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987.0,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",1980,Europe,[Thriller]
4,261236,/m/01mrr1,A Woman in Flames,1983.0,,106.0,[German Language],[Germany],[Drama],1980,Europe,[Drama]
...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011.0,,120.0,[English Language],[United States of America],[Drama],2010,North America,[Drama]
81737,34980460,/m/0g4pl34,Knuckle,2011.0,,96.0,[English Language],"[Ireland, United Kingdom]","[Biographical film, Drama, Documentary]",2010,Europe,"[Biography, Drama, Documentary]"
81738,9971909,/m/02pygw1,Another Nice Mess,1972.0,,66.0,[English Language],[United States of America],"[Satire, Comedy]",1970,North America,"[Comedy, Satire]"
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992.0,,150.0,[Japanese Language],[Japan],"[Science Fiction, Japanese Movies, Adventure, ...",1990,Asia,"[Animation, Science Fiction, Drama, Japanese M..."


In [16]:
# Save the cleaned metadata
df_metadata.to_csv(data_folder + 'movies_metadata_cleaned.csv')

## Merge the datasets

In [17]:
df_cleaned_plot = pd.read_csv(data_folder + 'plot_summaries_cleaned.csv')
df_cleaned_metadata = pd.read_csv(data_folder + 'movies_metadata_cleaned.csv')

# Merge the plot summaries and metadata according to the movie ID
df_merged = pd.merge(df_cleaned_metadata, df_cleaned_plot, on='Wikipedia_movie_ID', how='inner')

# Save the cleaned merged dataframe
df_merged.to_csv(data_folder + 'movies_metadata_cleaned.csv')

In [18]:
df_merged

Unnamed: 0.1,Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres,Plot_summary
0,0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...",2000.0,North America,"['Science Fiction', 'Horror', 'Thriller', 'Sup...",Set second half 22nd century film depicts Mars...
1,3,9363483,/m/0285_cd,White Of The Eye,1987.0,,110.0,['English Language'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",1980.0,Europe,['Thriller'],series murders rich young women throughout Ari...
2,4,261236,/m/01mrr1,A Woman in Flames,1983.0,,106.0,['German Language'],['Germany'],['Drama'],1980.0,Europe,['Drama'],Eva upper class housewife becomes frustrated l...
3,6,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002.0,,86.0,['English Language'],['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...",2000.0,Africa,"['World', 'Fantasy', 'Family', 'Action/Adventu...",Every hundred years evil Morgana returns claim...
4,12,6631279,/m/0gffwj,Little city,1997.0,,93.0,['English Language'],['United States of America'],"['Romantic comedy', 'Ensemble Film', 'Comedy-d...",1990.0,North America,"['Drama', 'Comedy', 'Romance', 'Ensemble Film']",Adam San Franciscobased artist works cab drive...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42199,81733,23851782,/m/06_vb43,The Ghost Train,1941.0,,82.0,['English Language'],['United Kingdom'],"['Crime Fiction', 'Thriller', 'Comedy', 'Super...",1940.0,Europe,"['Fiction', 'Supernatural', 'Comedy', 'Thriller']",plot film opens Great Western express speeding...
42200,81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011.0,,120.0,['English Language'],['United States of America'],['Drama'],2010.0,North America,['Drama'],Two former National Oceanic Atmospheric Admini...
42201,81737,34980460,/m/0g4pl34,Knuckle,2011.0,,96.0,['English Language'],"['Ireland', 'United Kingdom']","['Biographical film', 'Drama', 'Documentary']",2010.0,Europe,"['Biography', 'Drama', 'Documentary']",plot film follows 12 years lives 3 Irish trave...
42202,81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992.0,,150.0,['Japanese Language'],['Japan'],"['Science Fiction', 'Japanese Movies', 'Advent...",1990.0,Asia,"['Animation', 'Science Fiction', 'Drama', 'Jap...",story takes place year 2092The Super Dimension...


In [19]:
# Group by decade
group_by_decade = df_merged.groupby('Decade')

# Create a dictionary to store DataFrames for each decade
df_decades = {decade: group for decade, group in group_by_decade}

# print("DataFrame for the 1990s:")
# df_decades[1990]

# News dataset

In [20]:
news_file_path = '../../data/df_news/'
df_news = pd.read_csv(news_file_path + 'cosine_similarity_news.csv')

In [21]:
# Create a decade column in the df_news
df_news['decade'] = (df_news['year'].dropna() // 10 * 10).astype('Int64')

In [22]:
# Identify the theme columns
non_theme_columns = ['year', 'decade']
theme_columns = [col for col in df_news.columns if col not in non_theme_columns]

# Replace cosine similarity values in theme columns
for theme in theme_columns:
    df_news[theme] = df_news[theme].apply(lambda x: 1 if x > 0.3 else 0)

In [23]:
df_news.to_csv(news_file_path + 'cosine_similarity_news_cleaned.csv', index=False)

In [24]:
import dask.dataframe as dd
df_full_news = dd.read_parquet('../../data/nyt_data.parquet')
df_full_news.to_csv(news_file_path + 'nyt_data.csv')

NameError: name 'dd' is not defined