# Preprocessing and cleaning
The goal of this notebook is to preprocess the `v1_movies_cleaned.csv` file, obtained after feature augmentation and refinement by GPT, so that it is easy to work with.

- [Data exploration and cleaning](#data-exploration-and-cleaning)

In [14]:
import pandas as pd
import numpy as np
import re

from src.utils.helpers import convert_csv
from constants import *

## Data exploration and cleaning

In [15]:
# Load the GPT-refined movie table and preview its first rows.
# DATA_FOLDER is presumably supplied by `from constants import *` — verify.
movies = pd.read_csv(DATA_FOLDER + "v1_movies_cleaned.csv")
movies.head()

Unnamed: 0,title,languages,countries,genres,keywords,release_date,plot_summary,year_release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,['English'],['United States of America'],"['Drama', 'Comedy', 'Action', 'Thriller', 'Hei...",,1971-12-17,"Set in Hamburg, West Germany, several criminal...",1971,"""Western""","['Joe Collins', 'American bank security consul...","['Sarge', 'corrupt U.S. Army sergeant', 'value...","['Cunning', 'heroism', 'cleverness', 'survival...","['Ruthlessness', 'violence', 'greed', 'betraya...","['Heist', 'crime', 'betrayal', 'survival', 'te..."
1,"$1,000 on the Black","['Deutsch', 'Italiano']","['Germany', 'Italy']",['Western'],,1966-12-18,Johnny Liston has just been released from pris...,1966,"""Western""","['Johnny Liston', 'justice', 'redemption', 'he...","['Sartana', 'tyranny', 'betrayal', 'antagonist']","['Justice', 'redemption', 'individualism', 'pe...","['Tyranny', 'fear', 'betrayal', 'oppression']","['Revenge', 'self-discovery', 'moral conflict'..."
2,"$10,000 Blood Money",,['Russia'],"['Drama', 'Western']",,1967-01-01,Hired by a Mexican landowner to rescue his dau...,1967,"""None""",['None'],['None'],['None'],['None'],"['Betrayal', 'Greed', 'Bounty Hunter', 'Heist']"
3,"$100,000 for Ringo",['Italiano'],['Italy'],"['Drama', 'Western']","['spaghetti western', 'whipping']",1965-11-18,A stranger rides into Rainbow Valley where he'...,1965,"""None""",['None'],['None'],['None'],['None'],"['Western', 'Frontier', 'Stranger', 'Rivalry',..."
4,'68,['English'],"['United States of America', 'Hungary']","['Drama', 'Coming of age', 'Family Drama', 'Pe...",,1988-01-01,The father escaped the Soviet invasion of Buda...,1988,"""None""",['None'],['None'],['None'],['None'],"['Gay rights', 'counterculture', 'family confl..."


In [16]:
# Show the Python type of the first value of every column.
# Only the first row is inspected, so a missing first value shows up as float (NaN).
for col, series in movies.items():
    print(col, type(series[0]))

title <class 'str'>
languages <class 'str'>
countries <class 'str'>
genres <class 'str'>
keywords <class 'float'>
release_date <class 'str'>
plot_summary <class 'str'>
year_release_date <class 'numpy.int64'>
cold_war_side <class 'str'>
character_western_bloc_representation <class 'str'>
character_eastern_bloc_representation <class 'str'>
western_bloc_values <class 'str'>
eastern_bloc_values <class 'str'>
theme <class 'str'>


We observe that the list-like columns have all been read as strings (`year_release_date` is an integer, and `keywords` shows up as a float only because its first value is missing, i.e. `NaN`). That's because CSV files are plain text files, and they don't support complex data types like lists or dictionaries directly. Let's convert them into more convenient types.

In [17]:
# Parse the stringified columns back into Python objects and preview the result.
# NOTE(review): the return value is not assigned, yet later cells use `movies`
# directly, so convert_csv appears to modify `movies` in place — confirm in src/utils/helpers.
convert_csv(movies).head()

Unnamed: 0,title,languages,countries,genres,keywords,release_date,plot_summary,year_release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,[English],[United States of America],"[Drama, Comedy, Action, Thriller, Heist, Crime...",,1971-12-17,"Set in Hamburg, West Germany, several criminal...",1971,Western,"[Joe Collins, American bank security consultan...","[Sarge, corrupt U.S. Army sergeant, values rut...","[Cunning, heroism, cleverness, survival, Antih...","[Ruthlessness, violence, greed, betrayal, Anti...","[Heist, crime, betrayal, survival, tension]"
1,"$1,000 on the Black","[Deutsch, Italiano]","[Germany, Italy]",[Western],,1966-12-18,Johnny Liston has just been released from pris...,1966,Western,"[Johnny Liston, justice, redemption, hero]","[Sartana, tyranny, betrayal, antagonist]","[Justice, redemption, individualism, personal ...","[Tyranny, fear, betrayal, oppression]","[Revenge, self-discovery, moral conflict, hero..."
2,"$10,000 Blood Money",,[Russia],"[Drama, Western]",,1967-01-01,Hired by a Mexican landowner to rescue his dau...,1967,,[None],[None],[None],[None],"[Betrayal, Greed, Bounty Hunter, Heist]"
3,"$100,000 for Ringo",[Italiano],[Italy],"[Drama, Western]","[spaghetti western, whipping]",1965-11-18,A stranger rides into Rainbow Valley where he'...,1965,,[None],[None],[None],[None],"[Western, Frontier, Stranger, Rivalry, Money]"
4,'68,[English],"[United States of America, Hungary]","[Drama, Coming of age, Family Drama, Period pi...",,1988-01-01,The father escaped the Soviet invasion of Buda...,1988,,[None],[None],[None],[None],"[Gay rights, counterculture, family conflict, ..."


In [18]:
# Flatten the per-movie language lists and list every distinct raw label
movies['languages'].explode().unique()

array(['English', 'Deutsch', 'Italiano', nan, 'Spanish', 'Français',
       'Český', 'Italian', 'French', 'Dutch', 'Nederlands', 'Japanese',
       'Cantonese', 'Standard Mandarin', 'Português', 'Hindi', 'हिन्दी',
       '广州话 / 廣州話', 'Pусский', 'No', 'Español', 'Tamil', '日本語', 'German',
       'Urdu', 'Russian', 'Marathi', 'Portuguese', 'Magyar', 'Turkish',
       'Polski', 'Dansk', 'Korean', '한국어/조선말', 'Bengali',
       'Telugu language', 'Norwegian', 'Română', 'Romanian', 'Vietnamese',
       'Tiếng Việt', 'Standard Cantonese', '普通话', 'Latin',
       'Mandarin Chinese', 'Shanghainese', 'Taiwanese',
       'Chinese language', 'Czech', 'Polish', '', 'Hungarian language',
       'svenska', 'Swedish', 'Latviešu', 'Sioux language', 'Greek',
       'ελληνικά', 'Mongolian language', 'Bulgarian', 'American English',
       'বাংলা', 'Український', 'עִבְרִית', 'Danish', 'shqip', 'suomi',
       'Azərbaycan', 'Persian', 'فارسی', 'Arabic', 'Malayalam', 'Nepali',
       'தமிழ்', 'Uzbek language',

In [19]:
# Flatten the per-movie country lists and list every distinct country
movies['countries'].explode().unique()

array(['United States of America', 'Germany', 'Italy', 'Russia',
       'Hungary', 'Estonia', 'Ukraine', 'Switzerland', 'Puerto Rico',
       'France', 'Egypt', 'Netherlands', 'Japan', 'United Kingdom',
       'Hong Kong', 'India', 'Spain', 'Costa Rica', 'Vietnam', 'Taiwan',
       'Australia', 'Canada', 'Latvia', 'Brazil', 'Turkey', 'Denmark',
       'Poland', 'Korea', 'Norway', 'Croatia', 'Bulgaria', 'Austria',
       'Philippines', 'Lithuania', 'Portugal', 'China', 'Romania',
       'Georgia', 'Yugoslavia', 'New Zealand', 'Czechoslovakia',
       'Argentina', 'Greece', 'South Africa', 'Luxembourg', 'Sweden',
       'Ireland', 'Colombia', 'Uruguay', 'Belgium', 'Czech Republic',
       'Bangladesh', 'Tunisia', 'Albania', 'Finland', 'Iceland',
       'Liechtenstein', 'Mexico', 'Iran', 'Zimbabwe', 'Nepal',
       'Uzbekistan', 'Venezuela', 'Bosnia and Herzegovina', 'Cuba',
       'Peru', 'Malaysia', 'Pakistan', 'Sri Lanka', 'Algeria', 'Israel',
       'Singapore', 'Morocco', 'Azerbaijan

In [20]:
# List the distinct Cold War side labels.
# NOTE(review): this column appears to hold plain strings, in which case explode() is a no-op — confirm.
movies['cold_war_side'].explode().unique()

array(['Western', 'None', 'Eastern'], dtype=object)

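Although the values in the `countries` and `cold_war_side` columns seem to be fine, we observe some inconsistencies in `languages`: the same language appears under several different labels (e.g. `'Français'` and `'French'`, or `'Deutsch'` and `'German'`).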

In [21]:
# Maps a raw language label (native-script name, regional variant, misspelling, ...)
# to the canonical English name used in the rest of the analysis.
# Labels that are absent from this mapping are kept unchanged by the lookup code.
# Each key appears exactly once: a duplicated key in a dict literal is silently
# overridden by its last occurrence, which hides the value that is really used.
languages_translation = {
    '广州话/廣州話': 'Chinese',
    '广州话 / 廣州話': 'Chinese',
    '日本語': 'Japanese',
    'Japan': 'Japanese',
    '普通话': 'Chinese',
    '한국어/조선말': 'Korean',
    'ภาษาไทย': 'Thai',
    'हिन्दी': 'Indian',
    'தமிழ்': 'Indian',
    'TiếngViệt': 'Vietnamese',
    'Tiếng Việt': 'Vietnamese',
    'العربية': 'Arabic',
    'اردو': 'Indian',
    'българскиезик': 'Bulgarian',
    'Pусский': 'Russian',
    'беларускаямова': 'Belarusian',
    'Український': 'Ukrainian',
    'Srpski': 'Serbian',
    'Slovenčina': 'Slovak',
    'Français': 'French',
    'France': 'French',
    'Deutsch': 'German',
    'Italiano': 'Italian',
    'Español': 'Spanish',
    'Polski': 'Polish',
    'Standard Mandarin': 'Chinese',
    'Mandarin Chinese': 'Chinese',
    'Mandarin': 'Chinese',
    'Português': 'Portuguese',
    'Standard Cantonese': 'Chinese',
    'Cantonese': 'Chinese',
    'suomi': 'Finnish',
    'Magyar': 'Hungarian',
    'Bosanski': 'Bosnian',
    'svenska': 'Swedish',
    'ελληνικά': 'Greek',
    'Český': 'Czech',
    'Dansk': 'Danish',
    'Nederlands': 'Dutch',
    'עִבְרִית': 'Hebrew',
    'American English': 'English',
    'Türkçe': 'Turkish',
    'Tagalog': 'Filipino',
    'Khmer': 'Cambodian',
    'Hindi': 'Indian',
    'Tamil': 'Indian',
    'Telugu': 'Indian',
    'Urdu': 'Indian',
    'Oriya': 'Indian',
    'Eesti': 'Estonian',
    'Română': 'Romanian',
    # NOTE(review): Romani is a different language from Romanian — confirm this grouping is intended.
    'Romani': 'Romanian',
    'Norsk': 'Norwegian',
    'No': 'Norwegian',
    'Íslenska': 'Icelandic',
    'Bahasa indonesia': 'Indonesian',
    # NOTE(review): 'Catalan' also survives as its own label in the cleaned column,
    # so Catalan films end up under two names — confirm which one is intended.
    'Català': 'Spanish',
    'Inuktitut': 'Inuit',
    'Hakka': 'Chinese',
    'Sicilian': 'Italian',
    'Marathi': 'Indian',
    'Hrvatski': 'Croatian',
    'shqip': 'Albanian',
    'isiZulu': 'Zulu',
    'Latviešu': 'Latvian',
    'ქართული': 'Georgian',
    'Australian English': 'English',
    'Bahasamelayu': 'Malay',
    # The key contains the control character U+009A, written here as an escape;
    # presumably a mis-decoded 'Lietuviškai' in the raw data — verify.
    'Lietuvi\x9akai': 'Lithuanian',
    'Farsi, Western': 'Persian',
    'فارسی': 'Persian',
    'беларуская мова': 'Belarusian',
    'български език': 'Bulgarian',
    'Swiss German': 'German',
    'Brazilian Portuguese': 'Portuguese',
    'euskera': 'Basque',
    'қазақ': 'Kazakh',
    'Bahasa melayu': 'Malay',
    'French Sign': 'Sign Language',
    'American Sign': 'Sign Language',
    'Hokkien': 'Chinese',
    'Min Nan': 'Chinese',
    'Chinese, Hakka': 'Chinese',
    'Ancient Greek': 'Greek',
    'Gaelic': 'Scottish Gaelic',
    'Scottish Gaelic': 'Scottish Gaelic',
    'Zulu': 'Zulu',
    'Lithuanian': 'Lithuanian',
    'Standard Tibetan': 'Tibetan',
    'Saami, North': 'Sami',
    'Bamanankan': 'Bambara',
    'Fulfulde, Adamawa': 'Fula',
    'South African English': 'English',
    'Jamaican Creole English': 'Jamaican Creole',
    'Classical Arabic': 'Arabic',
    'Frisian, Western': 'Frisian',
    'Yolngu Matha': 'Yolngu Matha',
    'Scanian': 'Swedish',
    'Palawa kani': 'Palawa kani',
    'Kiswahili': 'Swahili',
    'Māori': 'Maori',
    'বাংলা': 'Bengali',
    'తెలుగు': 'Indian',
    'Taiwanese': 'Chinese',
    'Shanghainese': 'Chinese',
    'Azərbaycan': 'Azerbaijani',
    'Cymraeg': 'Welsh',
    'Hariyani': 'Indian',
    'Slovenščina': 'Slovenian',
    'Maya, Yucatán': 'Maya',
    'Egyptian Arabic': 'Arabic',
    'Assyrian Neo-Aramaic': 'Aramaic',
    # Native American languages are grouped under a single label.
    'Crow': 'Native American languages',
    'Cheyenne': 'Native American languages',
    'Hopi': 'Native American languages',
    'Pawnee': 'Native American languages',
    'Mohawk': 'Native American languages',
    'Algonquin': 'Native American languages',
    'Cree': 'Native American languages',
    'Navajo': 'Native American languages',
    'Sioux': 'Native American languages',
    'Khmer, Central': 'Cambodian'
}


In [22]:
# Number of distinct raw language labels before normalisation (NaN counts as one value)
print(len(movies['languages'].explode().unique()))

207


In [23]:
def remove_language_suffix(language_set):
    """Clean every language name in an iterable of names.

    For each name, the standalone word "language"/"languages" (case-insensitive)
    is removed, then backslashes and single/double quotes are dropped, and the
    result is stripped of surrounding whitespace.

    Args:
        language_set: an iterable of language-name strings, or a float
            (a missing cell).

    Returns:
        ``np.nan`` when the input is a float, otherwise a set of cleaned names.
    """
    # Missing cells arrive as float NaN; pass them through as NaN.
    if isinstance(language_set, float):
        return np.nan

    cleaned_set = set()
    for lang in language_set:
        # Remove "language"/"languages"
        without_suffix = re.sub(r'\blanguages?\b', '', lang, flags=re.IGNORECASE)
        # Remove unwanted characters
        without_quotes = re.sub(r'[\\\"\']', '', without_suffix)
        cleaned_set.add(without_quotes.strip())
    return cleaned_set

# Step 1: strip the "language" suffix and stray quote characters from every label.
movies['languages'] = movies['languages'].apply(remove_language_suffix)

# Step 2: translate each cleaned label to its canonical name; labels without a
# translation are kept as they are, and non-set cells (missing values) pass through.
movies['languages'] = movies['languages'].apply(
    lambda x: {languages_translation.get(label, label) for label in x}
    if isinstance(x, set)
    else x
)

# Distinct labels remaining after normalisation
print(len(movies['languages'].explode().unique()))
movies['languages'].explode().unique()

110


array(['English', 'German', 'Italian', nan, 'Spanish', 'French', 'Czech',
       'Dutch', 'Japanese', 'Chinese', 'Portuguese', 'Indian', 'Russian',
       'Norwegian', 'Hungarian', 'Turkish', 'Danish', 'Polish', 'Korean',
       'Bengali', 'Romanian', 'Vietnamese', 'Latin', '', 'Swedish',
       'Latvian', 'Native American languages', 'Greek', 'Mongolian',
       'Bulgarian', 'Ukrainian', 'Hebrew', 'Albanian', 'Finnish',
       'Persian', 'Azerbaijani', 'Arabic', 'Malayalam', 'Nepali', 'Uzbek',
       'Kannada', 'Malay', 'Sinhala', 'Quechua', 'Filipino', 'Serbian',
       'Cambodian', 'Georgian', 'Estonian', 'Icelandic', 'Croatian',
       'Irish', 'Serbo-Croatian', 'Punjabi', 'Bambara', 'Welsh',
       'Macedonian', 'Friulian', 'Sanskrit', 'Slovak', 'Wolof', 'Thai',
       'Yiddish', 'Afrikaans', 'Armenian', 'Hindustani', 'Sign Language',
       'Zulu', 'Belarusian', 'Silent film', 'Scottish Gaelic',
       'Slovenian', 'Fulfulde', 'Bosnian', 'Indonesian', 'Catalan',
       'Maya', 'B

In [24]:
# Turn each set of languages into a list, dropping labels that carry no
# information: the empty string and the '??????' placeholder (presumably an
# undecodable label in the raw data — verify).
# The list is *sorted*: iteration order of a set of strings depends on per-process
# hash randomisation, so a plain list(...) would give a different element order
# (and a different saved CSV) on every kernel restart.
# No NaN check is needed on the elements: they were all produced by re.sub / a
# str-to-str lookup, so they are always strings. Non-set cells pass through unchanged.
movies['languages'] = movies['languages'].apply(
    lambda x: sorted(lang for lang in x if lang not in ('', '??????'))
    if isinstance(x, set) else x
)
print(len(movies['languages'].explode().unique()))
movies['languages'].explode().unique()

108


array(['English', 'German', 'Italian', nan, 'Spanish', 'French', 'Czech',
       'Dutch', 'Japanese', 'Chinese', 'Portuguese', 'Indian', 'Russian',
       'Norwegian', 'Hungarian', 'Turkish', 'Danish', 'Polish', 'Korean',
       'Bengali', 'Romanian', 'Vietnamese', 'Latin', 'Swedish', 'Latvian',
       'Native American languages', 'Greek', 'Mongolian', 'Bulgarian',
       'Ukrainian', 'Hebrew', 'Albanian', 'Finnish', 'Persian',
       'Azerbaijani', 'Arabic', 'Malayalam', 'Nepali', 'Uzbek', 'Kannada',
       'Malay', 'Sinhala', 'Quechua', 'Filipino', 'Serbian', 'Cambodian',
       'Georgian', 'Estonian', 'Icelandic', 'Croatian', 'Irish',
       'Serbo-Croatian', 'Punjabi', 'Bambara', 'Welsh', 'Macedonian',
       'Friulian', 'Sanskrit', 'Slovak', 'Wolof', 'Thai', 'Yiddish',
       'Afrikaans', 'Armenian', 'Hindustani', 'Sign Language', 'Zulu',
       'Belarusian', 'Silent film', 'Scottish Gaelic', 'Slovenian',
       'Fulfulde', 'Bosnian', 'Indonesian', 'Catalan', 'Maya', 'Basque',
   

In [25]:
# Preview the table after the language labels have been normalised
movies.head()

Unnamed: 0,title,languages,countries,genres,keywords,release_date,plot_summary,year_release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,[English],[United States of America],"[Drama, Comedy, Action, Thriller, Heist, Crime...",,1971-12-17,"Set in Hamburg, West Germany, several criminal...",1971,Western,"[Joe Collins, American bank security consultan...","[Sarge, corrupt U.S. Army sergeant, values rut...","[Cunning, heroism, cleverness, survival, Antih...","[Ruthlessness, violence, greed, betrayal, Anti...","[Heist, crime, betrayal, survival, tension]"
1,"$1,000 on the Black","[German, Italian]","[Germany, Italy]",[Western],,1966-12-18,Johnny Liston has just been released from pris...,1966,Western,"[Johnny Liston, justice, redemption, hero]","[Sartana, tyranny, betrayal, antagonist]","[Justice, redemption, individualism, personal ...","[Tyranny, fear, betrayal, oppression]","[Revenge, self-discovery, moral conflict, hero..."
2,"$10,000 Blood Money",,[Russia],"[Drama, Western]",,1967-01-01,Hired by a Mexican landowner to rescue his dau...,1967,,[None],[None],[None],[None],"[Betrayal, Greed, Bounty Hunter, Heist]"
3,"$100,000 for Ringo",[Italian],[Italy],"[Drama, Western]","[spaghetti western, whipping]",1965-11-18,A stranger rides into Rainbow Valley where he'...,1965,,[None],[None],[None],[None],"[Western, Frontier, Stranger, Rivalry, Money]"
4,'68,[English],"[United States of America, Hungary]","[Drama, Coming of age, Family Drama, Period pi...",,1988-01-01,The father escaped the Soviet invasion of Buda...,1988,,[None],[None],[None],[None],"[Gay rights, counterculture, family conflict, ..."


In [26]:
# Save the preprocessed table without the index column.
# CSV stores list-valued columns as their string representation, so they must be
# parsed again when this file is loaded.
# DATA_FOLDER_PREPROCESSED is presumably supplied by `from constants import *` — verify.
movies.to_csv(DATA_FOLDER_PREPROCESSED + "preprocessed_movies.csv", index=False)