In [223]:
import ast
import sys
from datetime import datetime, date, time
from pathlib import Path

import numpy as np
import pandas as pd

In [224]:
# The metadata file is tab-separated (not comma-separated) and is read without a header row
preprocessed_movies_data = pd.read_csv(
    "../../data/MovieSummaries/movie.metadata.tsv",
    sep="\t",
    header=None,
)

In [225]:
# Peek at the first two rows; at this point the columns are still labelled by integer position
preprocessed_movies_data.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [226]:
# Give the raw table readable column names, then discard the two identifier columns

headers = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages (Freebase ID:name tuples)",
    "Movie countries (Freebase ID:name tuples)",
    "Movie genres (Freebase ID:name tuples)"
]

preprocessed_movies_data.columns = headers

# The first two headers are the Wikipedia / Freebase identifiers, which are dropped here
preprocessed_movies_data.drop(columns=headers[:2], inplace=True)

# Number of movies loaded
preprocessed_movies_data.shape[0]

81741

In [228]:
def extract_names(cell):
    """Turn a stringified ``{Freebase ID: name}`` mapping into a string of names.

    Parameters
    ----------
    cell : object
        Usually a string such as ``'{"/m/02h40lc": "English Language"}'``.

    Returns
    -------
    object
        The mapping's values joined with ``", "`` (an empty mapping gives
        ``""``). If ``cell`` cannot be parsed as a Python literal, or parses
        to something that is not a dict, ``cell`` is returned unchanged.
    """
    try:
        data_dict = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError):
        # Not a parsable literal (e.g. NaN or free text): leave the value as it is
        return cell

    # A literal such as "123" or "[1, 2]" parses fine but has no .values();
    # treat it like any other non-mapping cell instead of raising AttributeError.
    if not isinstance(data_dict, dict):
        return cell

    # Keep only the names; str() guards against non-string values, which
    # would otherwise make join() raise TypeError.
    return ", ".join(str(name) for name in data_dict.values())

In [229]:
# Replace each "(Freebase ID:name tuples)" column with a plain column that holds only the names
name_only_columns = {
    "Movie languages (Freebase ID:name tuples)": "Movie languages",
    "Movie countries (Freebase ID:name tuples)": "Movie countries",
    "Movie genres (Freebase ID:name tuples)": "Movie genres",
}

for raw_column, clean_column in name_only_columns.items():
    preprocessed_movies_data[clean_column] = preprocessed_movies_data[raw_column].apply(extract_names)

# The original ID:name columns are no longer needed
preprocessed_movies_data.drop(columns=list(name_only_columns), inplace=True)

In [None]:
# Films with no genre listed hold an empty string; mark them as missing (NaN) instead

genre_column = preprocessed_movies_data['Movie genres']
preprocessed_movies_data['Movie genres'] = genre_column.mask(genre_column == '', np.nan)


In [239]:
# Keep only the rows whose genre is known, then show how many movies remain
has_genre = preprocessed_movies_data["Movie genres"].notna()
preprocessed_movies_data = preprocessed_movies_data[has_genre]
preprocessed_movies_data.shape[0]

79447

We don't lose much data by dropping the rows without a genre (81,741 → 79,447 movies, about 2.8%), so the analysis remains relevant.

### Time Process

We keep only the release year of each film and drop the movies without a release date (79,447 → 73,077 movies, i.e. 6,370 rows).

In [240]:
# Work on an independent copy that only contains movies with a release date
has_release_date = preprocessed_movies_data["Movie release date"].notna()
timeprocess_movies_data = preprocessed_movies_data.loc[has_release_date].copy()
print(timeprocess_movies_data.shape[0])


73077


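**TODO:** I don't understand why the years are displayed with a trailing `.0`. (Likely cause, to be confirmed: a missing value in an otherwise integer column makes pandas store the whole column as floats.)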

In [241]:
def extract_year(date_str):
    """Return the release year of a date value as an ``int``.

    Parameters
    ----------
    date_str : object
        Usually a string: either a bare four-digit year (``"1988"``) or a
        fuller date (``"2001-08-24"``). Non-string values are handed to
        ``pd.to_datetime`` as they are.

    Returns
    -------
    int or pandas.NA
        The year as an integer, or ``pd.NA`` when the value is missing or
        cannot be parsed as a date.
    """
    if isinstance(date_str, str):
        text = date_str.strip()
        # A bare year needs no date parsing; return it as an int so every
        # year has the same type whatever the input format was.
        if len(text) == 4 and text.isascii() and text.isdigit():
            return int(text)

    # Any other format: let pandas parse it; unparseable values become NaT/None
    parsed = pd.to_datetime(date_str, errors='coerce')
    if pd.isna(parsed):
        # pd.NA instead of a float NaN: ints mixed with NaN are inferred as a
        # float column, which is what makes years show up as e.g. 2001.0.
        return pd.NA
    return int(parsed.year)

In [242]:
# Swap the full release date for a 'Year' column (appended last), then preview the result
release_dates = timeprocess_movies_data.pop("Movie release date")
timeprocess_movies_data['Year'] = release_dates.apply(extract_year)
timeprocess_movies_data.head(2)

Unnamed: 0,Movie name,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Year
0,Ghosts of Mars,14010832.0,98.0,English Language,United States of America,"Thriller, Science Fiction, Horror, Adventure, ...",2001
1,Getting Away with Murder: The JonBenét Ramsey ...,,95.0,English Language,United States of America,"Mystery, Biographical film, Drama, Crime Drama",2000


In [243]:
processed_data_path = '../../data/PreprocessedMovie/preprocessed_movie_metadata.csv'

# Create the output folder if it does not exist yet: to_csv does not create
# missing directories, so the save would otherwise fail on a fresh checkout.
Path(processed_data_path).parent.mkdir(parents=True, exist_ok=True)

# Save the transformed DataFrame.
# NOTE(review): errors='ignore' silently drops any character that cannot be
# encoded as UTF-8 instead of raising; confirm this data loss is intended.
timeprocess_movies_data.to_csv(processed_data_path, encoding='utf-8', errors='ignore', index=False)