# Getting Started

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load data and dataset modification

In [5]:
DATA_FOLDER = "../../data/"

PREPROCESSED_CHARACTER_DATASET = DATA_FOLDER+"preprocessed_character_metadata.tsv"
MOVIE_DATASET = DATA_FOLDER+"movie.metadata.tsv"

character_metadata = pd.read_csv(PREPROCESSED_CHARACTER_DATASET, sep='\t')
movies_column_names = ['Wikipedia_Movie_ID','Freebase_movie_ID','Movie_name','Movie_release_date','Movie_box_office_revenue','Movie_runtime','Movie_languages','Movie_countries','Movie_genres']
movies = pd.read_csv(MOVIE_DATASET, sep='\t', header=None,names=movies_column_names, decimal='.' )
movies.head()

Unnamed: 0,Wikipedia_Movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [6]:
len(movies)

81741

In [15]:
# find the line with Wikipedia_Movie_ID = 2957178 in movies

movies[movies['Wikipedia_Movie_ID'] == 2957178]

Unnamed: 0,Wikipedia_Movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
30910,2957178,/m/08g295,About Adam,2001-05-09,,97.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/06cvj"": ""Romantic comedy"", ""/m/0219x_"": ""..."


In [7]:
unique_values_movies = movies['Wikipedia_Movie_ID'].unique()
len(unique_values_movies)

81741

In [8]:
len(character_metadata)

50980

We want to look at the character ethnicity representation depending on the country. Therefore, we can merge the character dataset with the column from the movie dataset containing the country of each film.  

In [14]:
character_countries = pd.merge(character_metadata,movies[['Wikipedia_Movie_ID', 'Movie_countries']], on = 'Wikipedia_Movie_ID')
character_countries.sample(10)

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Release_Date,Character_Name,Actor_DOB,Actor_Gender,Actor_Height,Actor_Ethnicity,Actor_Name,Actor_Age_At_Movie_Release,Freebase_Character_Actor_Map_ID,Freebase_character_ID,Freebase_Actor_ID,Movie_Release_Year,Ethnicity_Label,Movie_countries
49162,422428,/m/026m09,2004-03-31,Mack Johnson,1957-01-17,M,1.88,/m/0x67,Steve Harvey,47.0,/m/040rv44,/m/0gyb75w,/m/02lhtf,2004.0,African Americans,"{""/m/09c7w0"": ""United States of America""}"
41449,7223382,/m/0kv0yx,1926-01-01,,1887-09-25,M,,/m/0x67,Spencer Bell,,/m/0h6hjym,,/m/0fq2mqb,1926.0,African Americans,"{""/m/09c7w0"": ""United States of America""}"
36489,2583472,/m/07pnlt,2005-09-23,Troy,1982-07-23,M,1.8,/m/022dp5,Paul Wesley,23.0,/m/03jt1tq,/m/0gyxtjs,/m/0fg8vz,2005.0,Polish Americans,"{""/m/09c7w0"": ""United States of America""}"
11989,594105,/m/02tgz4,2002-06-28,Murph,1968-12-16,M,1.88,/m/06v41q,Peter Dante,33.0,/m/0gb26t3,/m/0gb26t6,/m/0fx4xc,2002.0,French Americans,"{""/m/09c7w0"": ""United States of America""}"
19214,31990882,/m/0gyd7ks,2012-05-28,Finn,1992-02-09,M,1.78,/m/03bkbh,Avan Jogia,20.0,/m/0gyd7kg,/m/0gyd7kj,/m/04ldxgp,2012.0,Irish people,"{""/m/09c7w0"": ""United States of America""}"
13222,1235504,/m/04kwwr,1994-06-17,Timmy Gleason,1980-08-26,M,1.7,/m/033tf_,Macaulay Culkin,13.0,/m/02vbhqx,/m/0gxwfj5,/m/01f5q5,1994.0,Irish Americans,"{""/m/09c7w0"": ""United States of America""}"
23779,13408866,/m/047vp1n,2009-09-11,Jerry,1969-10-01,M,1.73,/m/038723,Zach Galifianakis,39.0,/m/07r2ksw,/m/07ybr9m,/m/02_0d2,2009.0,Greek Americans,"{""/m/09c7w0"": ""United States of America""}"
40868,1352821,/m/04w1p1,1996-01-01,Bob Collier,1940-08-03,M,1.7,/m/042gtr,Martin Sheen,55.0,/m/02vbz4q,/m/0h0hw48,/m/0hvb2,1996.0,Spanish Americans,"{""/m/09c7w0"": ""United States of America""}"
35653,2957178,/m/08g295,2001-05-09,Simon,1969-06-16,M,,/m/03bkbh,Tommy Tiernan,31.0,/m/0gdsj3g,/m/0gdsj1w,/m/03gs9q,2001.0,Irish people,"{""/m/09c7w0"": ""United States of America"", ""/m/..."
43397,90156,/m/0mf0h,1988-01-01,,1941-06-22,M,1.7,/m/041rx,Michael Lerner,46.0,/m/0cg98ml,,/m/04z542,1988.0,Jewish people,"{""/m/09c7w0"": ""United States of America""}"


In [10]:
len(character_countries)

50980

In [11]:
character_countries.sample(10)

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Release_Date,Character_Name,Actor_DOB,Actor_Gender,Actor_Height,Actor_Ethnicity,Actor_Name,Actor_Age_At_Movie_Release,Freebase_Character_Actor_Map_ID,Freebase_character_ID,Freebase_Actor_ID,Movie_Release_Year,Ethnicity_Label,Movie_countries
16257,1122030,/m/047ytl,1988-07-11,Eddie Moscone,1951-09-12,M,1.76,/m/0xnvg,Joe Pantoliano,36.0,/m/02vckrd,/m/0h6f8c6,/m/01b9z4,1988.0,Italian Americans,"{""/m/09c7w0"": ""United States of America""}"
25943,2501824,/m/07j9t1,1994-09-30,,1968-12-03,M,1.905,/m/0h2138,Brendan Fraser,25.0,/m/0k6d2v,,/m/0227tr,1994.0,Czech Americans,"{""/m/09c7w0"": ""United States of America""}"
20473,8711666,/m/027fwmt,1978-10-24,Glinda the Good,1917-06-30,F,,/m/0x67,Lena Horne,61.0,/m/02tbcq6,/m/0hh3lg3,/m/01n44c,1978.0,African Americans,"{""/m/09c7w0"": ""United States of America""}"
24348,2271177,/m/06_sc3,2007-09-20,Alice,1975-12-17,F,1.74,/m/0268d21,Milla Jovovich,31.0,/m/0k2gn_,/m/02wtdcf,/m/0151ns,2007.0,Serbian Americans,"{""/m/09c7w0"": ""United States of America"", ""/m/..."
5318,24987997,/m/09gldf6,1958-05-31,Big Bad Wolf,1908-05-30,M,,/m/041rx,Mel Blanc,50.0,/m/0g99lch,/m/0hyp2gn,/m/0c5vh,1958.0,Jewish people,"{""/m/09c7w0"": ""United States of America""}"
27637,286850,/m/01q40v,1999-09-04,Chidaatma Baba,1945-05-30,M,,/m/0dryh9k,Dhritiman Chaterji,54.0,/m/0h8gxkd,/m/0h8gxkg,/m/04_lhj,1999.0,Indian,"{""/m/09c7w0"": ""United States of America"", ""/m/..."
43730,17070479,/m/0417bqy,2009-01-01,Su Dongpo,1970-07-12,M,1.77,/m/01xhh5,Lee Byung-Hun,38.0,/m/04q4ghy,/m/0h36rfv,/m/04t5vf,2009.0,Koreans,"{""/m/09c7w0"": ""United States of America"", ""/m/..."
32558,17385651,/m/047q6l5,1994-01-01,Jimmy,1970-11-06,M,1.79,/m/07hwkr,Ethan Hawke,23.0,/m/04djrbp,/m/0h28nq4,/m/015v3r,1994.0,White Americans,"{""/m/09c7w0"": ""United States of America""}"
1728,1599839,/m/05fmsx,2004-12-29,Ann Foremann,1958-11-16,F,1.68,/m/033tf_,Marg Helgenberger,46.0,/m/02h71bx,/m/02nwggx,/m/02rmfm,2004.0,Irish Americans,"{""/m/09c7w0"": ""United States of America""}"
46182,12449377,/m/02w6fd_,1994-03-18,,1928-02-29,M,1.829,/m/02w7gg,Joss Ackland,66.0,/m/0cf_ts6,,/m/03xk1_,1994.0,English people,"{""/m/09c7w0"": ""United States of America""}"


In [59]:
# count unique values in the column Movie_countries

unique_values = character_countries['Movie_countries'].unique()
print("Unique values in the column Movie_countries: ", len(unique_values))

print(unique_values)

Unique values in the column Movie_countries:  413
['{"/m/09c7w0": "United States of America"}'
 '{"/m/09c7w0": "United States of America", "/m/07ssc": "United Kingdom"}'
 '{"/m/09c7w0": "United States of America", "/m/0345h": "Germany"}'
 '{"/m/09c7w0": "United States of America", "/m/03rk0": "India", "/m/07ssc": "United Kingdom"}'
 '{"/m/09c7w0": "United States of America", "/m/0d060g": "Canada", "/m/07ssc": "United Kingdom"}'
 '{"/m/09c7w0": "United States of America", "/m/0hzlz": "South Africa"}'
 '{"/m/09c7w0": "United States of America", "/m/01pj7": "Croatia"}'
 '{"/m/09c7w0": "United States of America", "/m/0chghy": "Australia"}'
 '{"/m/09c7w0": "United States of America", "/m/0f8l9c": "France", "/m/0d060g": "Canada", "/m/07ssc": "United Kingdom", "/m/0345h": "Germany"}'
 '{"/m/09c7w0": "United States of America", "/m/07ssc": "United Kingdom", "/m/0345h": "Germany"}'
 '{"/m/09c7w0": "United States of America", "/m/0f8l9c": "France"}'
 '{"/m/0f8l9c": "France", "/m/09c7w0": "United

## Actor ethnicity in the US vs Rest of the world

We want to analyze the impact of 9/11 on the actor ethnicty within each film. In this purpose we will analyze the actor ethnicity changement befrore and after 9/11. Moreover, we expect for 9/11 to have a higher impact in the US compared to the rest of the world. Therefore, we will compare the data on those levels.

In [33]:
character = character_countries.copy()

print("length of the dataset: ", len(character))

#Separation into 2 datasets, one with only movies form the US and the other that contaisn all the other movies
character_ROW = character.loc[~character['Movie_countries'].str.contains("United States of America")]

print("length of the ROW dataset: ", len(character_ROW))

character_US = character.loc[character['Movie_countries'].str.contains("United States of America")]

print("length of the US dataset: ", len(character_US))

# #reset the index
# character_ROW = character_ROW.reset_index(drop = True)
# character_US = character_US.reset_index(drop = True)

length of the dataset:  50980
length of the ROW dataset:  0
length of the US dataset:  50980


In [32]:
# keep only the rows of character that do no contain the string "United States of America" in the Movie_countries column

test1 = character[~character['Movie_countries'].str.contains("United States of America")]

test1.head()

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Release_Date,Character_Name,Actor_DOB,Actor_Gender,Actor_Height,Actor_Ethnicity,Actor_Name,Actor_Age_At_Movie_Release,Freebase_Character_Actor_Map_ID,Freebase_character_ID,Freebase_Actor_ID,Movie_countries,Movie_Release_Year,Ethnicity_Label


In [23]:
character_ROW.head()

Unnamed: 0,Wikipedia_Movie_ID,Freebase_Movie_ID,Movie_Release_Date,Character_Name,Actor_DOB,Actor_Gender,Actor_Height,Actor_Ethnicity,Actor_Name,Actor_Age_At_Movie_Release,Freebase_Character_Actor_Map_ID,Freebase_character_ID,Freebase_Actor_ID,Movie_countries,Movie_Release_Year,Ethnicity_Label


In [None]:
# Count the occurrences of each ethnicity
ethnicity_top10_ROW = character_ROW['Ethnicity_Label'].value_counts().nlargest(12)
ethnicity_top10_US = character_US['Ethnicity_Label'].value_counts().nlargest(12)

# print the top 10 for each dataset

print('Top ten for ethnicity_top10_ROW is', )

# # Plot the distribution of each ethnicity
# plt.figure(figsize=(15,5))
# plt.subplot(1,2,1)
# ethnicity_top10_US.plot(kind='bar', color='skyblue')
# plt.title("Top 10 Ethnicity in the US")
# plt.xlabel("Ethnicity")
# plt.ylabel("Number of Actors")
# plt.xticks(rotation=45, ha='right')

# plt.subplot(1,2,2)
# ethnicity_top10_ROW.plot(kind='bar', color='skyblue')
# plt.title("Top 10 Ethnicity in the rest of the world")
# plt.xlabel("Ethnicity")
# plt.ylabel("Number of Actors")
# plt.xticks(rotation=45, ha='right')
# # plt.subplots_adjust(wspace=0.5)
# plt.show()


Series([], Name: count, dtype: int64)


We observe the top 10 ethnicty in the rest of the world differ from the top 10 ethnicty within the US. The African Americains is the most represented ethnicity in the US whereas in the rest of the world this is the indian. 

### The impact of 9/11

We will now deep into a reduce period of time, between 1993 and 2010 and observe the difference between ethnicities representation in the US compared to the rest of the world. 

In [None]:
#For the movies in the US we construct a dataframe with the actor ethnicity before and the actor ethnicity after 9/11
#  Filter data for movies released between 1990 and 2001
data_1993_2001_US = character_US[(character_US['Movie_Release_Year'] >= 1993) & 
                                    (character_US['Movie_Release_Year'] <= 2001)]
data_2002_2010_US = character_US[(character_US['Movie_Release_Year'] >= 2002) & 
                                    (character_US['Movie_Release_Year'] <= 2010)]

# Count the top 10 ethnicities overall
top_ethnicities_US = character_US['Actor_Ethnicity'].value_counts().nlargest(12).index

# Filter for only these top ethnicities in each subset
data_1993_2001_top_US = data_1993_2001_US[data_1993_2001_US['Actor_Ethnicity'].isin(top_ethnicities_US)]
data_2002_2010_top_US = data_2002_2010_US[data_2002_2010_US['Actor_Ethnicity'].isin(top_ethnicities_US)]

# Count occurrences for each ethnicity in each time period
ethnicity_counts_1993_2001_US = data_1993_2001_top_US['Actor_Ethnicity'].value_counts(normalize=True)
ethnicity_counts_2002_2010_US = data_2002_2010_top_US['Actor_Ethnicity'].value_counts(normalize=True)

ethnicity_new_1993_US = []
ethnicity_new_2001_US = []
for i in range(0,len(ethnicity_counts_1993_2001_US)):
    ethnicity_new_1993_US.append(freebase2label(ethnicity_counts_1993_2001_US.index[i],freebase_mapping_df))
    ethnicity_new_2001_US.append(freebase2label(ethnicity_counts_2002_2010_US.index[i],freebase_mapping_df))

ethnicity_counts_1993_2001_US.index = ethnicity_new_1993_US
ethnicity_counts_2002_2010_US.index = ethnicity_new_2001_US

ethnicity_counts_1993_2001_US = ethnicity_counts_1993_2001_US.drop('None')
ethnicity_counts_2002_2010_US = ethnicity_counts_2002_2010_US.drop('None')

# Combine into a DataFrame for plotting
ethnicity_proportions_US = pd.DataFrame({
    '1993-2001': ethnicity_counts_1993_2001_US,
    '2002-2010': ethnicity_counts_2002_2010_US
})


In [None]:
#For the movies in the rest of the world we construct a dataframe with the actor ethnicity before and the actor ethnicity after 9/11
#  Filter data for movies released between 1990 and 2001
data_1993_2001_ROW = character_ROW[(character_ROW['Movie_Release_Year'] >= 1993) & 
                                    (character_ROW['Movie_Release_Year'] <= 2001)]
data_2002_2010_ROW = character_ROW[(character_ROW['Movie_Release_Year'] >= 2002) & 
                                    (character_ROW['Movie_Release_Year'] <= 2010)]

# Count the top 10 ethnicities overall
top_ethnicities_ROW = character_ROW['Actor_Ethnicity'].value_counts().nlargest(12).index

# Filter for only these top ethnicities in each subset
data_1993_2001_top_ROW = data_1993_2001_ROW[data_1993_2001_ROW['Actor_Ethnicity'].isin(top_ethnicities_ROW)]
data_2002_2010_top_ROW = data_2002_2010_ROW[data_2002_2010_ROW['Actor_Ethnicity'].isin(top_ethnicities_ROW)]

# Count occurrences for each ethnicity in each time period
ethnicity_counts_1993_2001_ROW = data_1993_2001_top_ROW['Actor_Ethnicity'].value_counts(normalize=True)
ethnicity_counts_2002_2010_ROW = data_2002_2010_top_ROW['Actor_Ethnicity'].value_counts(normalize=True)

ethnicity_new_1993_ROW = []
ethnicity_new_2001_ROW = []
for i in range(0,len(ethnicity_counts_1993_2001_ROW)):
    ethnicity_new_1993_ROW.append(freebase2label(ethnicity_counts_1993_2001_ROW.index[i],freebase_mapping_df))
    ethnicity_new_2001_ROW.append(freebase2label(ethnicity_counts_2002_2010_ROW.index[i],freebase_mapping_df))

ethnicity_counts_1993_2001_ROW.index = ethnicity_new_1993_ROW
ethnicity_counts_2002_2010_ROW.index = ethnicity_new_2001_ROW

ethnicity_counts_1993_2001_ROW = ethnicity_counts_1993_2001_ROW.drop('None')
ethnicity_counts_2002_2010_ROW= ethnicity_counts_2002_2010_ROW.drop('None')

# Combine into a DataFrame for plotting
ethnicity_proportions_ROW = pd.DataFrame({
    '1993-2001': ethnicity_counts_1993_2001_ROW,
    '2002-2010': ethnicity_counts_2002_2010_ROW
})


In [None]:
# Plotting the rest of the world and the US 
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6))  # Adjust figsize as needed

ethnicity_proportions_US.plot(kind='bar', figsize=(12, 6), color=['skyblue', 'salmon'], ax =ax1)
ax1.set_title("Top 10 Ethnicities in Movies (1993-2001 vs 2002-2010) in US")
ax1.set_xlabel("Ethnicity")
ax1.set_ylabel("Proportion")
ax1.tick_params(axis='x', rotation=45)

ethnicity_proportions_ROW.plot(kind='bar', figsize=(12, 6), color=['skyblue', 'salmon'], ax =ax2)
ax2.set_title("Top 10 Ethnicities in Movies (1993-2001 vs 2002-2010) in the rest of the world")
ax2.set_xlabel("Ethnicity")
ax2.set_ylabel("Proportion")
ax2.tick_params(axis='x', rotation=45)

plt.subplots_adjust(hspace=1)
plt.show()

We observe there is no major changes in the top 10 actor ethnciites representation in the movies between 1993 and 2010 neither in the rest of the world or the United States.

Finally, we want to isolate the actor ethnicities that exhibit major changes before and after 9/11. Therefore, we will look at the top 10  actor ethnicites that exxhibit larger relative changes ratio. 

In [None]:
# Count the occurrences of each ethnicity in each period for rest of the world and US 
ethnicity_counts_1993_2001_US = data_1993_2001_US['Actor_Ethnicity'].value_counts()
ethnicity_counts_2002_2010_US = data_2002_2010_US['Actor_Ethnicity'].value_counts()

ethnicity_counts_1993_2001_ROW = data_1993_2001_ROW['Actor_Ethnicity'].value_counts()
ethnicity_counts_2002_2010_ROW = data_2002_2010_ROW['Actor_Ethnicity'].value_counts()

# We compute the relative changes for each ethnicites between 1991 and 2011
#Use absolute value to avoid the negative value and for subsequnet isolation of smaller and largest relative difference 
ratio_US = np.abs(ethnicity_counts_1993_2001_US - ethnicity_counts_2002_2010_US) / (ethnicity_counts_1993_2001_US)
ratio_ROW = np.abs(ethnicity_counts_1993_2001_ROW - ethnicity_counts_2002_2010_ROW) / (ethnicity_counts_1993_2001_ROW)


# Sort by the ratio to find the largest 
largest_ratios_US = ratio_US.nlargest(10)
largest_ratios_ROW = ratio_ROW.nlargest(10)

ratio_new_US = []
ratio_new_ROW = []
for i in range(0, len(largest_ratios_US)):
    ratio_new_US.append(freebase2label(largest_ratios_US.index[i],freebase_mapping_df))
    ratio_new_ROW.append(freebase2label(largest_ratios_ROW.index[i],freebase_mapping_df))
largest_ratios_ROW.index = ratio_new_ROW
largest_ratios_US.index = ratio_new_US
# Print the ethnicities with the largest relative difference
print("US: 5 Largest Ratios:\n", largest_ratios_US)
print("Rest of the world: 5 Largest Ratios:\n", largest_ratios_ROW)




We observe ratios larger than 1 meaning they exhibit significant differences between the 2 observed periods, i.e before and after 9/11. From this first analysis, we do not observe any tendency of impact of 9/11 on racism. 