In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# loading imdb actor information
actors_raw = pd.read_csv('data/name.basics.tsv/data.tsv', sep='\t')

# filter out rows with missing data cruical for analysis
actors_raw = actors_raw[actors_raw["birthYear"] != r"\N"]

# loading imdb's principals
principals_raw = pd.read_csv('data/title.principals.tsv/data.tsv', sep='\t')

In [12]:
print("Number of actors:", actors_raw.size)
actors_raw

Number of actors: 3411282


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0072308,tt0045537"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0077975,tt0078723,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0060827,tt0083922,tt0050976,tt0050986"
...,...,...,...,...,...,...
12148488,nm9993436,Frank J. Gaily,1915,2008,sound_department,tt0189339
12148570,nm9993526,Ben Ray Lujan,1972,\N,,"tt0476038,tt7516996,tt15385660,tt4209386"
12148578,nm9993535,Henry Lawfull,2006,\N,actor,"tt10187208,tt5900600"
12148699,nm9993675,Ebrahim Alkazi,1925,2020,,\N


In [13]:
#print("Number of movies considered in 'principals' data set", len(list(dict.fromkeys(principals["tconst"]))))
principals_raw

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N
...,...,...,...,...,...,...
53573728,tt9916880,4,nm10535738,actress,\N,"[""Horrid Henry""]"
53573729,tt9916880,5,nm0996406,director,principal director,\N
53573730,tt9916880,6,nm1482639,writer,\N,\N
53573731,tt9916880,7,nm2586970,writer,books,\N


## Filter for leading actors and add column for gender
Filter for leading actors based on imdb's principals data which contains for each movie the most important people (i. e. actors, actress, directors etc.). The importance is given by imdb and stored in the 'ordering' column

In [14]:
# filter for actors and actress in principals
principals = principals_raw[(principals_raw["category"] == "actress") | (principals_raw["category"] == "actor")] 

# add gender column (0=male, 1=female)
principals["gender"] = 0
principals.loc[principals["category"] == "actress", "gender"] = 1

# select ony important columns of principal and actors data set and join them left
principal_actors = principals[["tconst", "ordering", "nconst", "gender"]].merge(actors_raw[["nconst",	"primaryName",	"birthYear", "deathYear"]], how="left", on="nconst")

principal_actors

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  principals["gender"] = 0


Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear
0,tt0000005,1,nm0443482,0,Charles Kayser,1878,1966
1,tt0000005,2,nm0653042,0,John Ott,1850,1931
2,tt0000007,1,nm0179163,0,James J. Corbett,1866,1933
3,tt0000007,2,nm0183947,0,Peter Courtney,1867,1896
4,tt0000008,1,nm0653028,0,Fred Ott,1860,1936
...,...,...,...,...,...,...,...
20986815,tt9916856,4,nm10538647,1,,,
20986816,tt9916880,1,nm1483166,0,,,
20986817,tt9916880,2,nm0254176,1,,,
20986818,tt9916880,3,nm0286175,0,,,


In [15]:
# Find total number of movies played by every leading actor
principal_actors.drop(columns=["deathYear"], inplace=True)
unique_actors = principal_actors["nconst"].unique()

male_actors = principal_actors[principal_actors["gender"] == 0]["nconst"].unique()
female_actors = principal_actors[principal_actors["gender"] == 1]["nconst"].unique()
print("There's a total of", len(unique_actors), "unique actors.")

print("There's a total of", len(male_actors), "male actors.")
print("There's a total of", len(female_actors), "female actors.")

There's a total of 2307790 unique actors.
There's a total of 1435557 male actors.
There's a total of 872781 female actors.


In [16]:
total_actor_experience = principal_actors.groupby(["nconst"]).count()

total_actor_experience

Unnamed: 0_level_0,tconst,ordering,gender,primaryName,birthYear
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,56,56,56,56,56
nm0000002,64,64,64,64,64
nm0000003,36,36,36,36,36
nm0000004,50,50,50,50,50
nm0000005,6,6,6,6,6
...,...,...,...,...,...
nm9993699,1,1,1,0,0
nm9993700,1,1,1,0,0
nm9993703,1,1,1,0,0
nm9993708,5,5,5,0,0


In [21]:
actor_to_experience = {}
for row in total_actor_experience.iterrows():
    actor_to_experience[row[0]] = row[1]["tconst"]

In [19]:
principal_actors.dropna(inplace=True)

In [20]:
principal_actors

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear
0,tt0000005,1,nm0443482,0,Charles Kayser,1878
1,tt0000005,2,nm0653042,0,John Ott,1850
2,tt0000007,1,nm0179163,0,James J. Corbett,1866
3,tt0000007,2,nm0183947,0,Peter Courtney,1867
4,tt0000008,1,nm0653028,0,Fred Ott,1860
...,...,...,...,...,...,...
20986809,tt9916852,2,nm8825009,0,Anil Altan,1990
20986810,tt9916852,3,nm5262613,1,Pelin Akil,1986
20986811,tt9916852,4,nm8690065,1,Zeynep Alkan,1998
20986812,tt9916856,1,nm3394271,1,Leni Wesselman,1985


## Add column for age of actor during start of movie

In [24]:
# load in movie data and select relevent columns: tconst and startYear
movies_raw = pd.read_csv("data/title.basics.tsv/data.tsv", sep="\t")[["tconst", "startYear", "genres"]]

# filter out rows with missing data cruical for analysis
movies_raw = movies_raw[movies_raw["startYear"] != r"\N"]

  movies_raw = pd.read_csv("data/title.basics.tsv/data.tsv", sep="\t")[["tconst", "startYear", "genres"]]


In [25]:
movies_raw

Unnamed: 0,tconst,startYear,genres
0,tt0000001,1894,"Documentary,Short"
1,tt0000002,1892,"Animation,Short"
2,tt0000003,1892,"Animation,Comedy,Romance"
3,tt0000004,1892,"Animation,Short"
4,tt0000005,1893,"Comedy,Short"
...,...,...,...
9446368,tt9916848,2010,"Action,Drama,Family"
9446369,tt9916850,2010,"Action,Drama,Family"
9446370,tt9916852,2010,"Action,Drama,Family"
9446371,tt9916856,2015,Short


In [26]:
# join movie data with actors
actor_movie_combi = principal_actors.merge(movies_raw, how="left", on="tconst")

# cast year columns to numeric in order to calculate
actor_movie_combi["startYear"] = pd.to_numeric(actor_movie_combi["startYear"]) # errors parameter for the case when date not given
actor_movie_combi["birthYear"] = pd.to_numeric(actor_movie_combi["birthYear"]) # errors parameter for the case when date not given
#actor_movie_combi["deatYear"] = pd.to_numeric(actor_movie_combi["deathYear"], errors="coerce") # errors parameter for the case when actor is not dead, then NaN

# add age_at_movie_start column
actor_movie_combi["age_at_movie_start"] = actor_movie_combi["startYear"] - actor_movie_combi["birthYear"]
print("Number of movie actor combinations:", actor_movie_combi.size)

actor_movie_combi

Number of movie actor combinations: 102268413


Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start
0,tt0000005,1,nm0443482,0,Charles Kayser,1878,1893.0,"Comedy,Short",15.0
1,tt0000005,2,nm0653042,0,John Ott,1850,1893.0,"Comedy,Short",43.0
2,tt0000007,1,nm0179163,0,James J. Corbett,1866,1894.0,"Short,Sport",28.0
3,tt0000007,2,nm0183947,0,Peter Courtney,1867,1894.0,"Short,Sport",27.0
4,tt0000008,1,nm0653028,0,Fred Ott,1860,1894.0,"Documentary,Short",34.0
...,...,...,...,...,...,...,...,...,...
11363152,tt9916852,2,nm8825009,0,Anil Altan,1990,2010.0,"Action,Drama,Family",20.0
11363153,tt9916852,3,nm5262613,1,Pelin Akil,1986,2010.0,"Action,Drama,Family",24.0
11363154,tt9916852,4,nm8690065,1,Zeynep Alkan,1998,2010.0,"Action,Drama,Family",12.0
11363155,tt9916856,1,nm3394271,1,Leni Wesselman,1985,2015.0,Short,30.0


## Add column for imdb ratings

In [28]:
# load in imdb ratings
ratings_raw = pd.read_csv('data/title.ratings.tsv/data.tsv', sep='\t')

In [29]:
# add ratings to overall data set
actor_movie_combi = actor_movie_combi.merge(ratings_raw, how="left", on="tconst")

actor_movie_combi

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start,averageRating,numVotes
0,tt0000005,1,nm0443482,0,Charles Kayser,1878,1893.0,"Comedy,Short",15.0,6.2,2555.0
1,tt0000005,2,nm0653042,0,John Ott,1850,1893.0,"Comedy,Short",43.0,6.2,2555.0
2,tt0000007,1,nm0179163,0,James J. Corbett,1866,1894.0,"Short,Sport",28.0,5.4,797.0
3,tt0000007,2,nm0183947,0,Peter Courtney,1867,1894.0,"Short,Sport",27.0,5.4,797.0
4,tt0000008,1,nm0653028,0,Fred Ott,1860,1894.0,"Documentary,Short",34.0,5.4,2070.0
...,...,...,...,...,...,...,...,...,...,...,...
11363152,tt9916852,2,nm8825009,0,Anil Altan,1990,2010.0,"Action,Drama,Family",20.0,,
11363153,tt9916852,3,nm5262613,1,Pelin Akil,1986,2010.0,"Action,Drama,Family",24.0,,
11363154,tt9916852,4,nm8690065,1,Zeynep Alkan,1998,2010.0,"Action,Drama,Family",12.0,,
11363155,tt9916856,1,nm3394271,1,Leni Wesselman,1985,2015.0,Short,30.0,,


## Add column for number of movies an actor has been in before start of respective movie

In [40]:
actor_movie_combi["exp_so_far"] = 0
actor_movie_combi.sort_values(by="startYear", inplace=True, ascending=True)
from collections import defaultdict
exp_so_far = defaultdict(int)
for row in actor_movie_combi.iterrows():
    actor = row[1]["nconst"]
    actor_movie_combi.loc[row[0], "exp_so_far"] = exp_so_far[actor]
    exp_so_far[actor] += 1

# # Drop rows with NA values and save to CSV
actor_movie_combi.dropna(inplace=True) 
#actor_movie_combi.drop(columns="exp_so_far", inplace=True)
actor_movie_combi.to_csv("data/actor_movie_combi.csv" , index=None)

In [42]:
actor_movie_combi

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start,averageRating,numVotes,exp_so_far
9953543,tt7816420,1,nm1155956,0,Eadweard Muybridge,1830,1881.0,"Documentary,Short",51.0,5.2,464.0,0
6122800,tt1758563,1,nm1796515,0,Adolphe Le Prince,1872,1888.0,"Documentary,Short",16.0,5.5,1333.0,0
985638,tt0361921,1,nm1362928,0,Giuseppe Sacco Albanese,1872,1890.0,"Documentary,Short",18.0,5.0,1509.0,0
1098711,tt0416047,1,nm1362928,0,Giuseppe Sacco Albanese,1872,1890.0,Short,18.0,4.2,425.0,1
1098710,tt0416046,1,nm1362928,0,Giuseppe Sacco Albanese,1872,1890.0,Short,18.0,4.8,1107.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
5641272,tt15747584,1,nm0314358,1,Claudia Gerini,1971,2022.0,Drama,51.0,5.8,50.0,51
4770259,tt13760918,1,nm1335387,0,Prithviraj Sukumaran,1982,2022.0,"Drama,Thriller",40.0,5.5,790.0,92
4770260,tt13760918,2,nm1011348,0,Indrajith Sukumaran,1979,2022.0,"Drama,Thriller",43.0,5.5,790.0,53
4801103,tt13835548,2,nm3142672,0,Malachi Kirby,1989,2022.0,"Drama,Thriller",33.0,7.9,32.0,32


In [43]:
movie_df = pd.read_csv("data/actor_movie_combi.csv")

len(movie_df[movie_df["startYear"]>=2013])

868654

## Add column for number of movies per genre an actor has been in before start of respective movie

In [26]:
# Will do major genres
genres_major = ['Drama', "Action", "Romance","Comedy"]
from collections import defaultdict
actor_movie_combi["drama_exp_so_far"] = 0
actor_movie_combi["action_exp_so_far"] = 0
actor_movie_combi["romance_exp_so_far"] = 0
actor_movie_combi["comedy_exp_so_far"] = 0
drama_exp_so_far = defaultdict(int)
action_exp_so_far = defaultdict(int)
romance_exp_so_far = defaultdict(int)
comedy_exp_so_far = defaultdict(int)

for row in actor_movie_combi.iterrows():
    actor = row[1]["nconst"]
    genres = row[1]["genres"].split(",")

    if "Drama" in genres:
        drama_exp_so_far[actor] += 1
    if "Action" in genres:
        action_exp_so_far[actor] += 1
    if "Romance" in genres:
        romance_exp_so_far[actor] += 1
    if "Comedy" in genres:
        comedy_exp_so_far[actor] += 1

    actor_movie_combi.loc[row[0], "drama_exp_so_far"] = drama_exp_so_far[actor]
    actor_movie_combi.loc[row[0], "action_exp_so_far"] = action_exp_so_far[actor]
    actor_movie_combi.loc[row[0], "romance_exp_so_far"] = romance_exp_so_far[actor]
    actor_movie_combi.loc[row[0], "comedy_exp_so_far"] = comedy_exp_so_far[actor]
    
actor_movie_combi.to_csv("data/actor_movie_combi.csv", index=None)

In [33]:
actor_movie_combi = pd.read_csv("data/actor_movie_combi.csv")

In [67]:
movies_genx = pd.read_csv("./data/Task3_final.csv")

In [70]:
df= movies_genx.groupby("primaryName")[["averageRating","numVotes"]].mean().nlargest(20, columns="numVotes" )

df

Unnamed: 0_level_0,averageRating,numVotes
primaryName,Unnamed: 1_level_1,Unnamed: 2_level_1
Orto Ignatiussen,6.9,414008.5
Philip Seymour Hoffman,6.8,251652.0
Carrie Fisher,7.6,210897.666667
Chadwick Boseman,6.566667,181574.166667
Orson Bean,6.7,161472.0
Paul Walker,6.133333,147841.0
Michael Nyqvist,6.414286,101961.571429
Peggy Lipton,7.2,73277.0
James Gandolfini,6.533333,72432.0
Joe Stapleton,6.6,63779.0


array([1904., 1901., 1943., 1935., 1910., 1931., 1966., 1936., 1940.,
       1928., 1951., 1914., 1919., 1933., 1896., 1925., 1954., 1927.,
       1905., 1917., 1938., 1915., 1956., 1930., 1911., 1926., 1920.,
       1918., 1955., 1906., 1934., 1921., 1913., 1968., 1969., 1941.,
       1970., 1961., 1908., 1942., 1923., 1962., 1959., 1932., 1958.,
       1964., 1975., 1937., 1976., 1952., 1903., 1945., 1971., 1909.,
       1916., 1953., 1981., 1950., 1947., 1949., 1963., 1944., 1967.,
       1979., 1946., 1924., 1972., 1986., 1948., 1997., 1922., 1985.,
       1973., 1980., 1960., 1939., 1957., 1965., 1912., 1929., 1991.,
       1977., 1984., 1983., 1974., 1989., 1987., 1995., 1978., 1982.,
       1993., 1988., 2000., 1990., 2001., 2004., 1996., 1998., 2009.,
       1994., 1992., 2010., 2003., 2002., 2013., 2006., 2019., 2018.,
       2020., 2005., 2012., 1999., 2015., 2011., 2008., 2014., 2007.,
       2017., 2022., 2016., 2021.])