In [1]:
import pandas as pd
import tarfile
from tqdm import tqdm 

In [2]:
import cpi

 The idea is to find the biggest differences between men and women roles and careers in cinema. Some of the questions to be answered could be the following.
 
-Are feminine characters **described differently** as the ones played by men? This question could be answered by using natural language processing and sentiment analysis on movie summaries.

-Are women **younger than men** in cinema (by movie type/period, movie revenue)? One hypothesis is typically that men characters are older than women in romantic movies.

-Is there a **difference in the career path taken men and women** actors? For example, do women get to play in high revenue movies earlier than men?
These questions are important because society impact movies and movies have an impact on society too. Seeing how male and female characters are depicted could get us some interesting intel on the state of society. As we have data for movies that came out at different times, we could also track how women roles in movies have evolved through time.


In [2]:
#tar = tarfile.open("MovieSummaries.tar.gz")
#tar.extractall()
#tar.close()

In [3]:
df = pd.read_table("./MovieSummaries/movie.metadata.tsv", header=None)
df.columns=["Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date", 
            "Movie box office revenue", "Movie runtime", "Movie languages", "Movie countries", "Movie genres"]

In [4]:
df.sample(10)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres
44125,36745727,/m/0l8h5rw,Beverly Hills Chihuahua 3,2012-09-18,8666941.0,89.0,{},{},{}
18736,26798531,/m/0bmkhqd,"Job, czyli ostatnia szara komórka",2006-11-03,,92.0,"{""/m/05qqm"": ""Polish Language""}","{""/m/05qhw"": ""Poland""}","{""/m/01z4y"": ""Comedy""}"
55176,8957765,/m/027r6bl,Apasionadamente,1944,,,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0jgd"": ""Argentina""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""..."
60786,12558524,/m/02wvtzt,Some Like It Hot,1939,,65.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01z4y"": ""Comedy"", ""/m/01g6gs"": ""Black-and..."
17695,13731934,/m/03cgjr7,Buffalo Bill,1944-04-13,,90.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/03bxz7""..."
27508,29016960,/m/0dgslc5,Mark of the Phoenix,1958-11,,64.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/0lsxr"": ""Crime F..."
28356,9039938,/m/027vh_1,Las Aventuras de Dios,2000-09-01,,84.0,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0jgd"": ""Argentina""}","{""/m/07s9rl0"": ""Drama"", ""/m/073_6"": ""Surrealis..."
43099,19268816,/m/04lhqgw,The Hellions,1961-11-02,,80.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa"", ""/m/07ssc"": ""Unit...","{""/m/0hfjk"": ""Western"", ""/m/07s9rl0"": ""Drama"",..."
32044,18098163,/m/04lq8j0,Donkey Punch,2008-07-18,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0219x_"": ""Indie""..."
24712,4672611,/m/0cgkjf,Teen Witch,1989-04-28,27843.0,90.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02l7c8"": ""Romance Film"", ""/m/0hj3n2s"": ""F..."


In [5]:
df1 = pd.read_table("./MovieSummaries/tvtropes.clusters.txt", header=None)

In [6]:
df1.sample(10)

Unnamed: 0,0,1
216,eccentric_mentor,"{""char"": ""Mr. Kesuke Miyagi"", ""movie"": ""The Ka..."
246,father_to_his_men,"{""char"": ""Captain Benjamin L. Willard"", ""movie..."
376,prima_donna,"{""char"": ""Lina Lamont"", ""movie"": ""Singin' in t..."
326,jerk_jock,"{""char"": ""Mike Dexter"", ""movie"": ""Can't Hardly..."
370,ophelia,"{""char"": ""Jane Olsen"", ""movie"": ""The Cabinet o..."
467,the_editor,"{""char"": ""Walter Burns"", ""movie"": ""His Girl Fr..."
259,final_girl,"{""char"": ""Ellen Ripley"", ""movie"": ""Alien"", ""id..."
7,adventurer_archaeologist,"{""char"": ""Indiana Jones"", ""movie"": ""Indiana Jo..."
114,corrupt_corporate_executive,"{""char"": ""Kingpin"", ""movie"": ""Daredevil"", ""id""..."
56,bromantic_foil,"{""char"": ""Ed"", ""movie"": ""Shaun of the Dead"", ""..."


In [7]:
df2 = pd.read_table("./MovieSummaries/character.metadata.tsv", header=None)
df2.columns=["Wikipedia movie ID", "Freebase movie ID", "Movie release date", "character name", "Actor date of birth", "Actor gender", 
            "Actor height (in meters)", "Actor ethnicity (Freebase ID)", "Actor name", "Actor age at movie release", "Freebase character/actor map ID", 
            "Freebase character ID", "Freebase actor ID"]

In [8]:
df2.sort_values(by=["Actor name"]).head(20)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
443911,22643388,/m/05znq3z,1922-08-04,,1883-12-20,F,,,Cesarine Prinz\t,,/m/0n1qm4g,,/m/0m_f7mw
300061,22643470,/m/05zqsh4,1926-10-29,,1883-12-20,F,,,Cesarine Prinz\t,,/m/0n1df4k,,/m/0m_f7mw
113180,20116200,/m/04yby6g,1998-11-05,Elias Riddler,1976-05-12,M,,,Daniel Newman,22.0,/m/09j1tjk,/m/0n4pjk3,/m/0n8vzn2
380192,15056534,/m/03h64m2,1911-05-25,,,M,,,'Big Bill' Giddinger,,/m/0lt7r5k,,/m/0lt7r5n
112577,23010798,/m/064j_6f,1973-07-25,Junkie in Casino,,M,,,'Big' Lee,,/m/0n1tc87,/m/0n1tcn4,/m/0n1tc8b
378588,1873017,/m/062rrj,1972-06-14,,,,,,'Chicken' Holleman,,/m/0gcg1gr,,/m/0gc4cy_
276570,10050502,/m/02q05nf,1975,,,M,,,'Crazy George' Henderson,,/m/0jsc3dw,,/m/0jsc3dz
346471,3866200,/m/0b40yx,1991-03-18,Ronnie,,,,,'Evil' Ted Smith,,/m/0h6dfjq,/m/0h6dfj3,/m/0h6dfj0
356950,31935934,/m/0dc2ctx,2010-08-07,Ovidiu,,M,,,'Kecske' Molnár Levente,,/m/0fq5m94,/m/0g4tsxd,/m/0fq5m86
81615,24051101,/m/07kd398,1943-05-28,Paj Mab's Guard #2,1903-11-28,M,,,'King Kong' Kashey,39.0,/m/0n637w5,/m/0n638tg,/m/0n637w8


In [9]:
df2.iloc[450664, 5]

'F'

In [10]:
df_madame = df2[df2["Actor gender"] == "F"]
df_madame = df_madame.drop_duplicates(subset=["Freebase actor ID"], keep = False)
print("Mean age of female actor at release date: " + str(df_madame["Actor age at movie release"].mean()))

Mean age of female actor at release date: 29.379171030974113


In [11]:
df_monsieur = df2[df2["Actor gender"] == "M"]
df_monsieur = df_monsieur.drop_duplicates(subset=["Freebase actor ID"], keep = False)
print("Mean age of male actor at release date: " + str(df_monsieur["Actor age at movie release"].mean()))

Mean age of male actor at release date: 36.539201118783325


In [12]:
#Création du df utile pour les calculs
df_plus = pd.DataFrame(df2["Actor gender"])
df_plus["Actor name"] = df2["Actor name"]
df_plus = df_plus.dropna(subset=['Actor gender'])

df_plus["Actor age at release date"] = df2["Actor age at movie release"]
df_plus = df_plus.dropna(subset=['Actor age at release date'])

df_plus["Freebase movie ID"] = df2["Freebase movie ID"]
df_plus = pd.merge(df_plus, df[["Freebase movie ID", "Movie release date", "Movie box office revenue", "Movie countries", "Movie name"]], on="Freebase movie ID")
df_plus = df_plus.dropna(subset=['Movie box office revenue'])

#keep only US films
df_plus = df_plus[df_plus["Movie countries"].str.contains("United States of America")]
df_plus["Movie release date"] = pd.to_datetime(df_plus["Movie release date"])
df_plus["Movie release date"] = df_plus["Movie release date"].dt.year

In [15]:
cpi.update()

#compute the box office revenue takning into account the inflation (converted to 2012)
df_plus["Real box office revenue (2012)"] = df_plus.apply(lambda x: cpi.inflate(x["Movie box office revenue"], x["Movie release date"], to=2012), axis=1)

In [16]:
df_plus.sample(10)

Unnamed: 0,Actor gender,Actor name,Actor age at release date,Freebase movie ID,Movie release date,Movie box office revenue,Movie countries,Movie name,Real box office revenue (2012)
24031,M,Burt Young,50.0,/m/0140g4,1990,119946358.0,"{""/m/09c7w0"": ""United States of America""}",Rocky V,210703600.0
242698,M,Vincent Regan,44.0,/m/05szq8z,2010,493214993.0,"{""/m/09c7w0"": ""United States of America"", ""/m/...",Clash of the Titans,519312500.0
154543,M,Anthony Edwards,41.0,/m/02d6rv,2004,28283637.0,"{""/m/0f8l9c"": ""France"", ""/m/09c7w0"": ""United S...",Thunderbirds,34376670.0
79128,M,Kevin McCarthy,70.0,/m/02qr3k8,1984,10004817.0,"{""/m/09c7w0"": ""United States of America""}",Terror in the Aisles,22108240.0
152426,M,Patton Oswalt,37.0,/m/0b9qwr,2006,128406887.0,"{""/m/09c7w0"": ""United States of America""}",Failure to Launch,146237400.0
138497,M,Everett McGill,49.0,/m/0hqs9,1995,104324083.0,"{""/m/09c7w0"": ""United States of America""}",Under Siege 2: Dark Territory,157166600.0
154154,M,Philip Baker Hall,68.0,/m/01hqhm,1999,48451803.0,"{""/m/09c7w0"": ""United States of America""}",Magnolia,66772170.0
205906,M,Nicholas Hoult,21.0,/m/0cd2vh9,2011,353624124.0,"{""/m/09c7w0"": ""United States of America""}",X-Men: First Class,360942200.0
244399,M,Basil Hoffman,42.0,/m/0qm98,1980,54766923.0,"{""/m/09c7w0"": ""United States of America""}",Ordinary People,152599000.0
264883,M,Adam Pascal,27.0,/m/01915h,1998,24299569.0,"{""/m/09c7w0"": ""United States of America""}",SLC Punk!,34227210.0


In [52]:
pd.to_datetime(df_plus.iloc[1, 7]).year

2001

In [20]:
pd.to_datetime(df_plus.iloc[10,7])

Timestamp('2001-08-24 00:00:00')

In [None]:
print("Moyenne d'age: " +str(df_plus[df_plus["Actor gender"] == "F"]["Actor age at release date"].mean()) + "\n" + 
      "Nombre de personnes: " + str(len(df_plus[df_plus["Actor gender"] == "F"])))

In [None]:
print("Moyenne d'age: " +str(df_plus[df_plus["Actor gender"] == "M"]["Actor age at release date"].mean()) + "\n" + 
      "Nombre de personnes: " + str(len(df_plus[df_plus["Actor gender"] == "M"])))

In [None]:
df_plus['Movie countries'].value_counts()