In [1]:
from tqdm import tqdm
import pickle as pkl
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
tqdm.pandas()
nltk.download('vader_lexicon')
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats as sts
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sts
from sklearn.feature_selection import mutual_info_regression
import dash
from dash import dcc, html
import plotly.express as px
from dash.dependencies import Input, Output


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
DATA_FOLDER = 'data/MovieSummaries/'
ADDITIONAL_FOLDER = 'data/AdditionalData/'

In [3]:
# reading a txt file and convert it to a dataframe
def read_txt(path):
    df = pd.read_csv(path, sep='\t', header=None)
    return df

In [4]:
# importing the data
plots = pd.read_csv(DATA_FOLDER + 'plot_summaries.txt', header=None, sep="\t")
movies = pd.read_csv(DATA_FOLDER + 'movie.metadata.tsv', header=None, sep="\t")
names = pd.read_csv(DATA_FOLDER + 'name.clusters.txt', header=None, sep="\t")
tvtropes = pd.read_csv(DATA_FOLDER + 'tvtropes.clusters.txt', header=None, sep="\t")

In [5]:
col_names = ['wikipedia_movie_id', 'freebase_movie_id', 'release_date', 'character_name', 'date_of_birth', 'gender', 'height', 'ethnicity_id', 'name', 'age_at_release', 'freebase_character_map_id', 'freebase_character_id', 'freebase_actor_id']

characters = pd.read_csv(DATA_FOLDER + 'character.metadata.tsv', sep='\t', header=None,  names=col_names)
characters.head()

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,release_date,character_name,date_of_birth,gender,height,ethnicity_id,name,age_at_release,freebase_character_map_id,freebase_character_id,freebase_actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [6]:
with open(ADDITIONAL_FOLDER+"ethnicity_tag_category_map", 'rb') as f:
    category_tag_map = pkl.load(f)
with open(ADDITIONAL_FOLDER+"actor_movie_ethnicity_map", 'rb') as f:
    actor_movie_ethnicity_map = pkl.load(f)

In [7]:
category_tag_map

Unnamed: 0,wiki_ids,category
/m/0dryh9k,Q862086,[indian]
/m/0x67,Q49085,[african american]
/m/041rx,Q7325,[jews]
/m/02w7gg,Q42406,[english]
/m/033tf_,Q1075293,[irish american]
...,...,...
/m/01gby2,Q36692,[zhuang]
/m/03x_fq7,Q12060728,[aboriginal australians]
/m/01hphz,Q925034,[indian diaspora]
/m/033fjj,Q1537653,[ho chunk]


In [8]:

actor_movie_ethnicity_map

Unnamed: 0,freebase_actor_id,wikipedia_movie_id,ethnicity_id
2,/m/01vw26l,975900,/m/0x67
5,/m/0418ft,975900,/m/0x67
11,/m/03ydsb,975900,/m/064b9n
27,/m/01lntp,3196793,/m/0x67
55,/m/0gz5hs,2314463,/m/041rx
...,...,...,...
450627,/m/0428bc,23687925,/m/0xnvg
450631,/m/03c6v3,30553937,/m/033tf_
450643,/m/02pn4z4,12476867,/m/041rx
450644,/m/03swmf,12476867,/m/041rx


In [40]:
actor_movie_ethnicity_map['ethnic_category'] = np.nan
actor_movie_ethnicity_map['ethnic_category'] = actor_movie_ethnicity_map['ethnic_category'].astype('object')
for idx in category_tag_map.index:
    category = category_tag_map.loc[idx]['category']
    idx_locations = actor_movie_ethnicity_map['ethnicity_id'] == idx
    
    for location in actor_movie_ethnicity_map[idx_locations].index:
        actor_movie_ethnicity_map.at[location, 'ethnic_category'] = category


In [41]:
actor_movie_ethnicity_map

Unnamed: 0,freebase_actor_id,wikipedia_movie_id,ethnicity_id,ethnic_category
2,/m/01vw26l,975900,/m/0x67,[african american]
5,/m/0418ft,975900,/m/0x67,[african american]
11,/m/03ydsb,975900,/m/064b9n,[omaha ( native american )]
27,/m/01lntp,3196793,/m/0x67,[african american]
55,/m/0gz5hs,2314463,/m/041rx,[jews]
...,...,...,...,...
450627,/m/0428bc,23687925,/m/0xnvg,[italian american]
450631,/m/03c6v3,30553937,/m/033tf_,[irish american]
450643,/m/02pn4z4,12476867,/m/041rx,[jews]
450644,/m/03swmf,12476867,/m/041rx,[jews]


In [46]:
actor_id_gender_map = characters[['freebase_actor_id', 'gender']].drop_duplicates()
actor_movie_inclusivity_map = pd.merge(actor_id_gender_map, actor_movie_ethnicity_map, how='right', on='freebase_actor_id')
actor_movie_inclusivity_map

Unnamed: 0,freebase_actor_id,gender,wikipedia_movie_id,ethnicity_id,ethnic_category
0,/m/01vw26l,M,975900,/m/0x67,[african american]
1,/m/0418ft,F,975900,/m/0x67,[african american]
2,/m/03ydsb,M,975900,/m/064b9n,[omaha ( native american )]
3,/m/01lntp,M,3196793,/m/0x67,[african american]
4,/m/0gz5hs,M,2314463,/m/041rx,[jews]
...,...,...,...,...,...
99839,/m/0428bc,M,23687925,/m/0xnvg,[italian american]
99840,/m/03c6v3,F,30553937,/m/033tf_,[irish american]
99841,/m/02pn4z4,F,12476867,/m/041rx,[jews]
99842,/m/03swmf,M,12476867,/m/041rx,[jews]


In [62]:
percentage = actor_movie_inclusivity_map.isna().sum() /actor_movie_inclusivity_map.count() * 100
percentage.name = 'Percentage of NaNs'
percentage

freebase_actor_id     0.000000
gender                0.045091
wikipedia_movie_id    0.000000
ethnicity_id          0.000000
ethnic_category       0.000000
Name: Percentage of NaNs, dtype: float64

In [90]:
len_actor_ethnic = actor_movie_inclusivity_map.ethnic_category.apply(len)
strange_categories = actor_movie_inclusivity_map.ethnic_category[len_actor_ethnic[  (len_actor_ethnic > 2)].index].drop_duplicates().values
actor_movie_inclusivity_map = actor_movie_inclusivity_map[~actor_movie_inclusivity_map['ethnic_category'].isin(strange_categories)]
strange_categories

array([], dtype=object)

In [107]:
import pandas as pd
import plotly.express as px

expanded_df = actor_movie_inclusivity_map.explode('ethnic_category')

# Count the number of actors in each ethnicity
ethnicity_counts = expanded_df['ethnic_category'].value_counts().apply(np.log).reset_index()
ethnicity_counts.columns = ['ethnic_category', 'Count']

# Create a bar chart
fig = px.bar(ethnicity_counts, x='ethnic_category', y='Count', 
             title='Representation of Different Ethnicities in the Film Industry',
             labels={'Count': 'Log Number of Actors', 'Ethnicity': 'Ethnicity'})
fig.show()

In [94]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go

# Assuming df is your DataFrame and it has a column 'Ethnicity' with lists of ethnicities

# Flatten the list of ethnicities and create a DataFrame
all_ethnicities = pd.DataFrame([(actor, ethnicity) for actor, ethnicities in actor_movie_inclusivity_map['ethnic_category'].items() for ethnicity in ethnicities], columns=['Actor', 'Ethnicity'])
all_ethnicities

Unnamed: 0,Actor,Ethnicity
0,0,african american
1,1,african american
2,2,omaha ( native american )
3,3,african american
4,4,jews
...,...,...
105416,99839,italian american
105417,99840,irish american
105418,99841,jews
105419,99842,jews


In [95]:
# Create a network graph
G = nx.Graph()

# Add nodes and edges
for _, row in all_ethnicities.iterrows():
    G.add_node(row['Ethnicity'])

    # Add edges if there is similarity between ethnicities (this part needs customization based on your data)
    for other_ethnicity in G.nodes:
        if other_ethnicity in row['Ethnicity'] or row['Ethnicity'] in other_ethnicity:
            G.add_edge(row['Ethnicity'], other_ethnicity)

In [101]:
pos = nx.spring_layout(G)

# Create edges for the plot
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Create nodes for the plot
node_x = []
node_y = []
for node in pos:
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

# Create Plotly Trace for edges
edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines')

# Create Plotly Trace for nodes
node_trace = go.Scatter(
    x=node_x, 
    y=node_y, 
    mode='markers', 
    hoverinfo='text', 
    marker=dict(
        showscale=True, 
        colorscale='YlGnBu', 
        size=10, 
        color=list(range(len(G))), 
        colorbar=dict(
            thickness=15, 
            title='Number of Actors with this direct ethnicity', 
            xanchor='left', 
            titleside='right'
        ), 
        line_width=2
    ),
    text=[node for node in G.nodes],  # Node names for hover text
    hovertext=[node for node in G.nodes]  # Alternate way to set hover text
)

# Create the figure
fig = go.Figure(data=[edge_trace, node_trace], layout=go.Layout(
    showlegend=False, 
    hovermode='closest', 
    margin=dict(b=20,l=5,r=5,t=40),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
))

# Set title
fig.update_layout(title='Network Graph of Ethnicity Similarities in Film Industry')

# Show the figure
fig.show()


In [None]:
with open(f'{ADDITIONAL_FOLDER}movie_scored_thresh_75.pkl',"rb") as f:
    movies = pkl.load(f)