In [1]:
import numpy as np
import pandas as pd
import itertools
from collections import Counter
import json
import matplotlib.pyplot as plt
import imdb
from price_parser import Price
%matplotlib inline

#### Step 1: Preprocessing, data scraping

In [2]:
DATA_COLUMNS = ["index","wikipedia_id", "imdb_id", "title", "cast", "genres", "runtimes", "countries", "languages", "box_office", "rating", "votes", "plot", "synopsis"]

df = pd.read_csv("./Data/fetched_data.csv", names=DATA_COLUMNS, header=0, index_col=0)

In [3]:
from ast import literal_eval

df["title"] = df["title"].astype("string")
df.cast = df.cast.apply(lambda x: literal_eval(x))
df.genres = df.genres.apply(lambda x: [] if pd.isna(x) else literal_eval(x))
df["runtimes"] = pd.to_numeric(df["runtimes"].apply(lambda x: "0" if pd.isna(x) else x.replace("[", "").replace("]", "").replace("'", "")))
df.countries = df.countries.apply(lambda x: [] if pd.isna(x) else literal_eval(x))
df.languages = df.languages.apply(lambda x: [] if pd.isna(x) else literal_eval(x))
df["synopsis"] = df["synopsis"].apply(lambda x: "" if pd.isna(x) else literal_eval(x)[0])
df["plot"] = df["plot"].apply(lambda x: "" if pd.isna(x) else literal_eval(x)[0])

In [4]:
pd.to_numeric(df["box_office"][213].split(" ")[1].replace("$", "").replace(",", "").replace("'", ""))

293000

In [132]:
# TODO question assistant => comment avoir la bonne valeur de box office en dollar 
# sachant que le cours a evolue au cours du temps

def convert_currency(x):
    formated = Price.fromstring(x)
    amount = formated.amount_float
    match formated.currency:
        case "$":
            return amount
        case "EUR":
            return amount * 1.18
        case "GBP":
            return amount * 1.36
        case "SEK":
            return amount * 0.11
        case "CAD":
            return amount * 0.79
        case "INR":
            return amount * 0.013
        case "CZK":
            return amount * 0.04
        case "JPY":
            return amount * 0.009
        case _:
            return None
    

In [143]:
# lets format the box office column

import pickle
import ast

def superfunction(box_office):
    if pd.isna(box_office):
        return None, None, None
    film = ast.literal_eval(box_office)
    budget = convert_currency(film["Budget"]) if "Budget" in film else None
    bo_usa = Price.fromstring(film["Opening Weekend United States"]).amount if "Opening Weekend United States" in film else None
    bo_world = Price.fromstring(film["Cumulative Worldwide Gross"]).amount if "Cumulative Worldwide Gross" in film else None
    return budget, bo_usa, bo_world
    
formated_bo = df["box_office"].apply(lambda x: superfunction(x))
df["budget"], df["box_office_usa"], df["box_office_world"] = zip(*formated_bo)


In [159]:
years_df = pd.read_csv("./Data/movie.metadata.tsv", sep='\t', header=None)

In [167]:
years_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
59985,10933245,/m/02qvdxn,36 Deadly Styles,1980,,90.0,"{""/m/0653m"": ""Standard Mandarin"", ""/m/012w70"":...","{""/m/03h64"": ""Hong Kong"", ""/m/06f32"": ""Taiwan""}","{""/m/03q4nz"": ""World cinema"", ""/m/0hj3l_y"": ""A..."
35195,15732863,/m/03nrhj1,The Terrornauts,1967,,75.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/06n90"": ""Science Fiction""}"
39640,31156949,/m/0gh80h5,Out of Syllabus,2006,,,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/07s9rl0"": ""Drama""}"
2302,4605877,/m/0cc84p,7G Rainbow Colony,2005-10-15,,185.0,"{""/m/07c9s"": ""Tamil Language"", ""/m/09s02"": ""Te...","{""/m/03rk0"": ""India""}","{""/m/07s9rl0"": ""Drama"", ""/m/0fztc6"": ""Tamil ci..."
5894,15382421,/m/03m6pg5,Song of Ceylon,1934,,38.0,"{""/m/02h40lc"": ""English Language""}","{""/m/06m_5"": ""Sri Lanka"", ""/m/07ssc"": ""United ...","{""/m/02hmvc"": ""Short Film"", ""/m/0jtdp"": ""Docum..."


In [168]:
import dateparser

dateparser.parse("2005-10-15").year

2005

In [234]:
# for each movies we have in df, find by id the one in years df
def parse_date(x):
    timestamped_fetched = years_df[years_df[0] == x][3].values
    year = dateparser.parse(timestamped_fetched[0]).year
    print(year)
    return year

df["release_year"] = df.wikipedia_id.apply(lambda x: parse_date(x))

2001
1988
1983
1938
1930
1974
1989
1964
1967
1930
1931
1935
1979
2001
1984
1989
1958
1984
1986
1940
1999
1955
1990
1997
2001
1980
1918
2008
1989
1996
1989
1993
1939
2007
1984
2008
2003
1976
1975
2007
1923
2003
1957
2001
1951
1968
2007
1968
2003
2008
2010
2005
1972
1986
2011
2004
2005
2009
2010
1979
1954
2012
1985
1999
1995
1997
1978
1986
1934
1984
1993
1998
1938
1991
1950
1936
2006
1971
1966
1966
1940
1991
2010
1974
2006
1971
2003
2009
1987
2003
1990
1978
2010
1950
1967
1977
1927
1940
1984
1945
1973
2007
1995
2007
1993
1997
1995
2010
1932
1978
1964
1993
2004
1998
1948
1914
1971
1970
2008
1979
2009
1933
1978
2004
1999
1952
1989
1980
1991
1938
1959
2004
1967
1991
1981
1935
2007
2000
1951
1969
1992
1997
2009
1960
1984
1961
1938
2008
2005
1998
1991
1997
1977
1944
1993
2010
1987
1999
1965
2002
1952
1994
2011
2009
1990
1989
1947
1994
1958
1952
1993
1987
1915
1965
2007
1967
1965
1957
2008
1947
1922
1965
2011
1996
1962
1942
1990
2010
2010
1935
2007
1989
1943
2007
1954
2004
1998
1927
1999
1938


In [None]:
df.to_csv("./Data/cleaned_data.csv")

#### Step 2: Investigating the evolution of movie production over time and finding main character

In [None]:
# 2.1 movie production

In [None]:
# main character


#### Step 3: Box office feature analysis

#### Step 4: Finding correlation

#### Step 5: Predicting future production and box office

#### Step 6: Final visualisation of the datastory