# Cleaning up the movie metadata

In [81]:
!pip install pandas 



In [82]:
import pandas as pd
import ast

# Step 1: Load the tab-separated metadata file. It is read without a header row,
# so pandas labels the columns with integers until they are renamed.
file_path = "../../MovieSummaries/movie.metadata.tsv"
df = pd.read_table(file_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [83]:
# Step 2: Give the positional columns descriptive names (order follows the file layout)
df.columns = pd.Index([
    'Wikipedia Movie ID', 'Freebase Movie ID', 'Movie Name',
    'Release Date', 'Box Office Revenue', 'Runtime',
    'Languages', 'Countries', 'Genres',
])

In [84]:
# Step 3: Separate the Freebase codes and names for Languages, Countries, and Genres
def split_freebase_data(data):
    """Split a stringified ``{freebase_id: name}`` mapping into IDs and names.

    Parameters
    ----------
    data : str
        Text of a dictionary literal, e.g. ``'{"/m/07s9rl0": "Drama"}'``.
        Anything that cannot be parsed (such as a NaN cell) is tolerated.

    Returns
    -------
    tuple
        * ``(freebase_id, name)`` as two scalars when the mapping has exactly
          one entry;
        * ``(freebase_ids, names)`` as two parallel lists when it has two or
          more entries;
        * ``([], [])`` when the mapping is empty, the literal is not a dict,
          or the input cannot be parsed.
    """
    try:
        parsed_data = ast.literal_eval(data)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval raises for malformed input.
        # Catching only them (instead of a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate normally.
        return [], []

    # A valid literal that is not a mapping (list, number, string, ...) carries
    # no id -> name pairs, so it is treated the same as unparseable input.
    if not isinstance(parsed_data, dict):
        return [], []

    if len(parsed_data) == 1:
        # Single entry: hand back plain scalars rather than one-element lists
        (freebase_id, name), = parsed_data.items()
        return freebase_id, name

    # Zero or 2+ entries: parallel lists in the mapping's own order
    return list(parsed_data.keys()), list(parsed_data.values())

In [85]:
# Step 4: Apply the split function to Languages, Countries, and Genres columns
# Each entry: (source column, new Freebase-ID column, new name column)
freebase_column_map = [
    ('Languages', 'Language Freebase ID', 'Language Name'),
    ('Countries', 'Country Freebase ID', 'Country Name'),
    ('Genres', 'Genre Freebase ID', 'Genre Name'),
]

for source_col, id_col, name_col in freebase_column_map:
    # split_freebase_data returns a 2-tuple per row; building one DataFrame from
    # the list of tuples avoids constructing a separate pd.Series for every row.
    split_pairs = [split_freebase_data(raw) for raw in df[source_col]]
    df[[id_col, name_col]] = pd.DataFrame(
        split_pairs, index=df.index, columns=[id_col, name_col]
    )

In [91]:
# Step 5: Remove the raw dictionary-string columns now that they have been split;
# the result is bound to a new name and df itself is left untouched
df_cleaned = df.drop(['Languages', 'Countries', 'Genres'], axis=1)

In [92]:
# Step 6: Preview the first five rows of the cleaned frame
df_cleaned.head(5)

Unnamed: 0,Wikipedia Movie ID,Freebase Movie ID,Movie Name,Release Date,Box Office Revenue,Runtime,Language Freebase ID,Language Name,Country Freebase ID,Country Name,Genre Freebase ID,Genre Name
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,/m/02h40lc,English Language,/m/09c7w0,United States of America,"[/m/01jfsb, /m/06n90, /m/03npn, /m/03k9fj, /m/...","[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,/m/02h40lc,English Language,/m/09c7w0,United States of America,"[/m/02n4kr, /m/03bxz7, /m/07s9rl0, /m/0hj3n01]","[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,/m/05f_3,Norwegian Language,/m/05b4w,Norway,"[/m/0lsxr, /m/07s9rl0]","[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,/m/02h40lc,English Language,/m/07ssc,United Kingdom,"[/m/01jfsb, /m/0glj9q, /m/09blyk]","[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,/m/04306rv,German Language,/m/0345h,Germany,/m/07s9rl0,Drama
