In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [13]:
# The file is read with header=None, so pandas numbers the columns 0..8.
# Map each position to a descriptive label while loading.
movie_raw = pd.read_csv(
    '../../data/MovieSummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
).rename(
    columns=dict(enumerate([
        'Wikipedia movie ID',
        'Freebase movie ID',
        'Movie name',
        'Movie release date',
        'Movie box office revenue',
        'Movie runtime',
        'Movie languages',
        'Movie countries',
        'Movie genres',
    ]))
)

# character_raw = pd.read_csv('../../data/MovieSummaries/character.metadata.tsv', sep='\t', header=None)
# plots_raw = pd.read_csv('../../data/MovieSummaries/plot_summaries.txt', sep='\t', header=None)

In [62]:
def process_languages(data, skip_languages=None):
    """Add a ``languages_human`` column with cleaned-up language names.

    Every non-missing ``Movie languages`` cell is evaluated as a dict literal
    and its values (the language names) are cleaned: the name is lower-cased
    and the words "languages" / "language" are removed from it.

    Args:
        data: DataFrame with a ``Movie languages`` column. It is modified in
            place (a ``languages_human`` column is added or overwritten).
        skip_languages: Optional collection of raw language names (compared
            before lower-casing). A matching name is replaced by the
            placeholder ``"skip"``. Defaults to an empty set.

    Returns:
        The same ``data`` object. ``languages_human`` holds one list of names
        per row; the list is empty when the source cell is missing.

    Side effects:
        Prints how many names were replaced by ``"skip"``.
    """
    if skip_languages is None:
        skip_languages = set()
    skipped = 0
    languages_per_movie = []
    # Only one column is needed, so iterate it directly instead of building a
    # Series per row with iterrows().
    for raw_cell in tqdm(data['Movie languages'], total=data.shape[0]):
        cleaned = []
        languages_per_movie.append(cleaned)
        if pd.isna(raw_cell):
            continue
        # NOTE(review): eval() executes whatever code the cell contains. This
        # is only acceptable for a trusted data file; consider
        # ast.literal_eval for anything else.
        languages = eval(raw_cell)
        for lang in languages.values():
            if lang in skip_languages:
                skipped += 1
                lang = "skip"
            lang = lang.lower()
            # Remove the plural first: stripping only "language" from
            # "... languages" would leave a stray "s" behind.
            lang = lang.replace("languages", "").replace("language", "").strip()
            cleaned.append(lang)
    print(f"Skipped: {skipped}")
    data["languages_human"] = languages_per_movie
    return data

# The single language label listed here could not be interpreted, so it is
# replaced by the "skip" placeholder instead of being cleaned like the others.
process_languages(
    data=movie_raw,
    skip_languages={
        "\ud801\udc16\ud801\udc32\ud801\udc49\ud801\udc4b\ud801\udc4c\ud801\udc32",
    },
)

100%|██████████| 81741/81741 [00:11<00:00, 7418.94it/s]


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,languages_human
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",[english]
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",[english]
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",[norwegian]
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",[english]
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",[german]
...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",[english]
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",[english]
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",[english]
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...",[japanese]


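I do not know how to properly process the language name `\ud801\udc16\ud801\udc32\ud801\udc49\ud801\udc4b\ud801\udc4c\ud801\udc32`.
Read as UTF-16 surrogate pairs, it decodes to the code points U+10416 U+10432 U+10449 U+1044B U+1044C U+10432 (Deseret alphabet block), which does not make sense to me as a language name. I decided to relabel this language as "skip" so that it can be dropped.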

In [75]:
# Raw country label -> list of replacement labels.
# A list is used because one raw label may expand to several countries.
rename_countries = {
    "Democratic Republic of the Congo": ["Congo"],
    # Both Yugoslav federations collapse into a single label.
    "Federal Republic of Yugoslavia": ["Yugoslavia"],
    "Socialist Federal Republic of Yugoslavia": ["Yugoslavia"],
    # NOTE(review): presumably language labels that ended up in the countries
    # column of the raw data - verify against the source file.
    "German Language": ["Germany"],
    "Malayalam Language": ["India"],
    # Two capitalisations of the same label are listed on purpose.
    "Palestinian Territories": ["Palestine"],
    "Palestinian territories": ["Palestine"],
    "Mandatory Palestine": ["Palestine"],
    "Iraqi Kurdistan": ["Iraq"],
    "Northern Ireland": ["Ireland"],
    "Serbia and Montenegro": ["Serbia", "Montenegro"],
    "Soviet occupation zone": ["Soviet Union"],
    # Corrects the spelling so the label matches the "Ukrainian SSR" key of
    # countries_old_to_new.
    "Ukranian SSR": ["Ukrainian SSR"],
}

# Historical or alternative country label -> the single label used in the
# "countries_new" column. Applied after rename_countries, so keys here may be
# values produced by that mapping (e.g. "Yugoslavia", "Ukrainian SSR").
countries_old_to_new = {
    "Czech Republic": "Czech",
    "Czechoslovakia": "Czech",
    "Yugoslavia": "Serbia",
    "Georgian SSR": "Georgia",
    "German Democratic Republic": "Germany",
    "Kingdom of Great Britain": "England",
    "Kingdom of Italy": "Italy",
    "Nazi Germany": "Germany",
    "Republic of China": "China",
    "Republic of Macedonia": "Macedonia",
    "Slovak Republic": "Slovakia",
    "South Africa": "Africa",
    "Soviet Union": "Russia",
    "Uzbek SSR": "Uzbekistan",
    "West Germany": "Germany",
    "Ukrainian SSR": "Ukraine",
    "Weimar Republic": "Germany",
}

We need to pre-process the countries.\
**Our renaming does not carry any political or other meaning. This is just a university project, and it should be treated accordingly.**

In [76]:
def process_countries(data):
    """Add ``countries`` and ``countries_new`` columns with normalised labels.

    Every non-missing ``Movie countries`` cell is evaluated as a dict literal.
    Each of its values (country labels) is expanded through the module-level
    ``rename_countries`` mapping to build ``countries``; each resulting label
    is then passed through ``countries_old_to_new`` to build
    ``countries_new``.

    Labels that become identical after mapping (for example two raw labels
    that both map to "Germany") are kept only once per row, in first-seen
    order, so a movie is not counted twice for the same country. As a
    consequence the two lists of a row may differ in length and are not
    positionally aligned.

    Args:
        data: DataFrame with a ``Movie countries`` column. It is modified in
            place (both output columns are added or overwritten).

    Returns:
        The same ``data`` object. Both new columns hold one list per row; the
        lists are empty when the source cell is missing.
    """
    countries_per_movie = []
    modern_countries_per_movie = []
    # Only one column is needed, so iterate it directly instead of building a
    # Series per row with iterrows().
    for raw_cell in tqdm(data['Movie countries'], total=data.shape[0]):
        if pd.isna(raw_cell):
            countries_per_movie.append([])
            modern_countries_per_movie.append([])
            continue
        # NOTE(review): eval() executes whatever code the cell contains. This
        # is only acceptable for a trusted data file; consider
        # ast.literal_eval for anything else.
        countries = eval(raw_cell)
        renamed = [
            subcountry
            for country in countries.values()
            for subcountry in rename_countries.get(country, [country])
        ]
        modernised = [countries_old_to_new.get(label, label) for label in renamed]
        # dict.fromkeys drops repeated labels while keeping first-seen order.
        countries_per_movie.append(list(dict.fromkeys(renamed)))
        modern_countries_per_movie.append(list(dict.fromkeys(modernised)))

    data["countries"] = countries_per_movie
    data["countries_new"] = modern_countries_per_movie
    return data

# Adds the "countries" and "countries_new" columns to movie_raw in place.
process_countries(data=movie_raw)

100%|██████████| 81741/81741 [00:31<00:00, 2573.91it/s]


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,languages_human,countries,countries_new
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",[english],[United States of America],[United States of America]
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",[english],[United States of America],[United States of America]
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",[norwegian],[Norway],[Norway]
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",[english],[United Kingdom],[United Kingdom]
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",[german],[Germany],[Germany]
...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",[english],[United States of America],[United States of America]
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",[english],"[Ireland, United Kingdom]","[Ireland, United Kingdom]"
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",[english],[United States of America],[United States of America]
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...",[japanese],[Japan],[Japan]


In [92]:
# One Turkish film has a wrong release year in the data (1010 instead of 2010).
# Maps each known-bad year string to its correction.
change_years = {'1010': '2010'}

def process_date(date_str):
    """Parse a release-date string into a pandas timestamp.

    Accepted layouts are ``YYYY``, ``YYYY-MM`` and ``YYYY-MM-DD``, selected by
    the length of the string. Before parsing, a year listed in
    ``change_years`` is replaced by its correction.

    Args:
        date_str: The raw date string, or a missing value (NaN/None).

    Returns:
        The result of ``pd.to_datetime`` for a recognised layout, or ``None``
        when the input is missing or its length matches no known layout (in
        which case a message is printed).
    """
    if pd.isna(date_str):
        return None
    # The year is taken to be the first 4-character component. Default to None
    # so a string without such a component reaches the "Unknown date format"
    # branch below instead of raising StopIteration.
    year = next((part for part in date_str.split('-') if len(part) == 4), None)
    if year in change_years:
        date_str = date_str.replace(year, change_years[year])
    # The string length identifies the layout.
    formats_by_length = {4: '%Y', 7: '%Y-%m', 10: '%Y-%m-%d'}
    date_format = formats_by_length.get(len(date_str))
    if date_format is None:
        print(f"Unknown date format: {date_str}")
        return None
    return pd.to_datetime(date_str, format=date_format)

# Parse every raw release-date string; the raw column is left untouched.
raw_release_dates = movie_raw['Movie release date']
movie_raw['movie_release_date'] = raw_release_dates.apply(process_date)

In [132]:
import json
# Genre mapping used by process_genres: raw genre name -> list of replacement
# genre names. The encoding is given explicitly so the file is decoded the
# same way on every platform instead of depending on the locale default.
with open("genre_rename.json", encoding="utf-8") as f:
    genre_rename = json.load(f)

def process_genres(data):
    """Add a ``genres`` column with renamed, de-duplicated genre names.

    Every non-missing ``Movie genres`` cell is evaluated as a dict literal.
    Each of its values (genre names) is expanded through the module-level
    ``genre_rename`` mapping; a genre without an entry is kept unchanged.
    Repeated names within a row are kept only once.

    Args:
        data: DataFrame with a ``Movie genres`` column. It is modified in
            place (a ``genres`` column is added or overwritten).

    Returns:
        The same ``data`` object. ``genres`` always holds a list per row, in
        first-seen order, and the list is empty when the source cell is
        missing. Using a list everywhere keeps the column type uniform and
        the order reproducible between runs.
    """
    genres_per_movie = []
    # Only one column is needed, so iterate it directly instead of building a
    # Series per row with iterrows().
    for raw_cell in tqdm(data['Movie genres'], total=data.shape[0]):
        if pd.isna(raw_cell):
            genres_per_movie.append([])
            continue
        # NOTE(review): eval() executes whatever code the cell contains. This
        # is only acceptable for a trusted data file; consider
        # ast.literal_eval for anything else.
        genres = eval(raw_cell)
        renamed = (
            new_genre
            for genre in genres.values()
            for new_genre in genre_rename.get(genre, [genre])
        )
        # dict.fromkeys drops repeated genres while keeping first-seen order.
        genres_per_movie.append(list(dict.fromkeys(renamed)))
    data["genres"] = genres_per_movie
    return data

# Adds the "genres" column to movie_raw in place; the trailing semicolon
# keeps the notebook from echoing the returned frame.
process_genres(data=movie_raw);

100%|██████████| 81741/81741 [00:13<00:00, 5870.92it/s]
