In [1]:
import re

import numpy as np
import pandas as pd
import plotly.io as pio

# Set the default template to plotly_dark
pio.templates.default = "plotly_dark"


In [2]:
# Load the actor-level regression dataset; the first CSV column becomes the row index.
actor_df = pd.read_csv(
    'data/actor_data_for_regression.csv',
    index_col=0,
)
actor_df



Unnamed: 0_level_0,Gender,Height,Ethnicity,Age at First Release,Success Score,University,Theater,Sports,Birth City,Citizenship,...,Birth Year,Birth Month,QS University Rank,Ranked Uni,Usable Uni Rank,Specialised Drama School,Specialised Acting School,Specialised Dance School,Specialised Arts School,Birth Region
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Denzel Washington,M,1.840,African Americans,26.0,10.000000,Specialised Drama School,True,Basketball,Mount Vernon New York US,,...,1954.0,December,Not Ranked,0,3000,1,0,0,0,USA
Matt Damon,M,1.780,Scandinavian Americans,18.0,9.912690,Harvard University,True,No Sports,Cambridge Massachusetts US,,...,1970.0,October,4,1,4,0,0,0,0,USA
Tom Hanks,M,1.830,Portuguese Americans,27.0,9.741555,sub 1500 school,True,No Sports,Concord California US,"United States, Greece1",...,1956.0,July,Not Ranked,0,3000,0,0,0,0,USA
Eddie Murphy,M,1.750,African Americans,21.0,9.415060,Did not go,False,No Sports,New York City US,,...,1961.0,April,Not Ranked,0,3000,0,0,0,0,USA
Tom Cruise,M,1.700,White people,18.0,9.362255,Did not go,False,Football,Syracuse New York US,,...,1962.0,July,Not Ranked,0,3000,0,0,0,0,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lindsay Hollister,F,1.750,,30.0,1.103153,Miami University,False,No Sports,Columbus Ohio United States,,...,1977.0,June,1201,1,1201,0,0,0,0,USA
Terrence Howard,M,1.840,African Americans,26.0,0.915104,Did not go,False,Track,Chicago Illinois US,,...,1969.0,March,Not Ranked,0,3000,0,0,0,0,USA
Madonna,F,1.613,French Canadians,26.0,0.838557,Did not go,False,No Sports,Bay City Michigan US,,...,1958.0,August,Not Ranked,0,3000,0,0,0,0,USA
Evelyn Keyes,F,1.630,,72.0,0.700949,Did not go,False,No Sports,Port Arthur Texas US,,...,1916.0,November,Not Ranked,0,3000,0,0,0,0,USA


In [3]:
import plotly.express as px

# Overlaid, semi-transparent histograms of age at first release, one trace per gender.
fig = px.histogram(
    data_frame=actor_df,
    x='Age at First Release',
    color='Gender',
    nbins=50,
    opacity=0.7,
    barmode='overlay',
    labels={'Age at First Release': 'Age'},
    title='Age at First Movie Appearance by Gender',
    width=800,
    height=600,
)
# The explicit axis titles take precedence over the short 'Age' label set above.
fig.update_xaxes(title_text='Age at First Release')
fig.update_yaxes(title_text='Count')

fig.show()
fig.write_html("age_first_release_vs_gender.html")


In [4]:
# Overlaid, semi-transparent height histograms, one trace per gender.
fig = px.histogram(
    data_frame=actor_df,
    x='Height',
    color='Gender',
    nbins=50,
    opacity=0.7,
    barmode='overlay',
    labels={'Height': 'Height'},
    title='Height by Gender',
    width=800,
    height=600,
)
fig.update_xaxes(title_text='Height')
fig.update_yaxes(title_text='Count')

fig.show()
fig.write_html("height_vs_gender.html")


In [5]:
# Violin per gender with an embedded box plot and every individual point drawn.
fig = px.violin(
    actor_df,
    x='Gender',
    y='Success Score',
    color='Gender',
    box=True,
    points='all',
    title='Success Score Distribution by Gender',
)
fig.update_xaxes(title_text='Gender')
fig.update_yaxes(title_text='Success Score')
fig.show()
fig.write_html("success_score_vs_gender.html")

In [6]:
# Success score against the numeric university rank.
# The frame's index values are passed as extra hover data under the key "index".
fig = px.scatter(
    data_frame=actor_df,
    x="Usable Uni Rank",
    y="Success Score",
    hover_data={"index": actor_df.index},
    title="Usable Uni Rank vs. Success Score",
)

fig.show()
fig.write_html("uni_rank_vs_score.html")

In [7]:
# Success score against age at first release, coloured by gender,
# with plotly express's "ols" (linear regression) trendline enabled.
fig = px.scatter(
    data_frame=actor_df,
    x="Age at First Release",
    y="Success Score",
    color="Gender",
    hover_data={"index": actor_df.index},
    trendline="ols",
    title="Age at First Release vs. Success Score",
)

fig.show()
fig.write_html("age_first_release_vs_score.html")

In [8]:


# Reshape the four 0/1 "Specialised ... School" flag columns into long form:
# one row per (actor row, school type), with the flag value in 'Attendance'.
school_columns = [
    'Specialised Drama School',
    'Specialised Acting School',
    'Specialised Dance School',
    'Specialised Arts School',
]
df_melted = pd.melt(
    actor_df,
    id_vars=['Success Score'],
    value_vars=school_columns,
    var_name='School Type',
    value_name='Attendance',
)

# Keep only the rows whose flag is 1, i.e. the school type was attended.
attended_mask = df_melted['Attendance'].eq(1)
df_filtered = df_melted.loc[attended_mask]

# One violin per school type, with an inner box plot and all points shown.
fig = px.violin(
    df_filtered,
    x='School Type',
    y='Success Score',
    color='School Type',
    box=True,
    points='all',
    labels={'School Type': 'Specialized School Type'},
    title='Success Score Distribution by Specialized School Type',
)
fig.show()

fig.write_html("school_vs_score.html")

In [9]:

# Box plot of success score for each distinct 'Number of Children' value.
fig = px.box(
    data_frame=actor_df,
    x='Number of Children',
    y='Success Score',
    title='Success Score by Number of Children',
)
fig.update_xaxes(title_text='Number of Children')
fig.update_yaxes(title_text='Success Score')
fig.show()
fig.write_html("children_vs_score.html")

In [10]:
# Bucket each actor's numeric university rank and compare the mean success score per bucket.
# pd.cut uses right-closed intervals here: (0, 10], (10, 50], (50, 100], (100, 3000].
# NOTE: rows displayed as 'Not Ranked' in the table above carry a 'Usable Uni Rank' of 3000,
# so they fall into 'Others' together with ranked universities outside the top 100.
bins = [0, 10, 50, 100, 3000]
labels = ['Top 10', 'Top 50', 'Top 100', 'Others']
actor_df['Uni_Rank_Bucket'] = pd.cut(actor_df['Usable Uni Rank'], bins=bins, labels=labels)

# pd.cut returns a categorical column. Pass `observed` explicitly so that every bucket
# (even an empty one) keeps its row regardless of the pandas version's default, and so
# recent pandas versions do not emit a FutureWarning about the changing default.
rank_success = (
    actor_df
    .groupby('Uni_Rank_Bucket', as_index=False, observed=False)['Success Score']
    .mean()
)

fig = px.bar(
    rank_success,
    x='Uni_Rank_Bucket',
    y='Success Score',
    color='Uni_Rank_Bucket',
    title='Average Success Score by University Rank Bucket'
)
fig.update_layout(xaxis_title='University Rank Bucket', yaxis_title='Avg Success Score')
fig.show()





In [11]:
# Height against success score; marker colour encodes ethnicity and
# marker size encodes the success score itself.
fig = px.scatter(
    data_frame=actor_df,
    x='Height',
    y='Success Score',
    color='Ethnicity',
    size='Success Score',
    title='Height vs Success Score by Ethnicity',
    hover_data={"index": actor_df.index},
)
fig.update_xaxes(title_text='Height (m)')
fig.update_yaxes(title_text='Success Score')
fig.show()

USA state analysis.



In [12]:
# Count actors per birth month and shape the result into a two-column frame
# ('Birth Month', 'Count'), ordered by descending count as value_counts returns it.
month_counts = (
    actor_df['Birth Month']
    .value_counts()
    .rename_axis('Birth Month')
    .reset_index(name='Count')
)

# One bar per month, coloured by month, with the count printed on each bar.
fig = px.bar(
    data_frame=month_counts,
    x='Birth Month',
    y='Count',
    color='Birth Month',
    text='Count',
    title='Number of Actors by Birth Month',
)
fig.update_yaxes(title_text='Count')
fig.show()

In [13]:
# Pairwise scatter plots of the key numeric features.
key_features = [
    'Success Score',
    'Age at First Release',
    'Usable Uni Rank',
    'Height',
]
# Only rows with a value for every selected feature are plotted.
df_subset = actor_df.loc[:, key_features].dropna()

fig = px.scatter_matrix(
    data_frame=df_subset,
    dimensions=key_features,
    title='Scatter Plot Matrix of Key Features',
)
fig.update_layout(height=800)
fig.show()

In [14]:
# Mean success score of all actors sharing a birth year (one row per year).
grouped_data = actor_df.groupby('Birth Year', as_index=False).agg({'Success Score': 'mean'})

# Line chart of the yearly mean, with a marker on every year.
fig = px.line(
    grouped_data,
    x='Birth Year',
    y='Success Score',
    title='Average Success Score by Birth Year',
    markers=True
)

fig.update_layout(
    xaxis_title='Birth Year',
    yaxis_title='Average Success Score',
)

# A range slider lets the reader narrow the birth-year window interactively.
# It replaces a dropdown whose two buttons applied the same `visible: [True]`
# setting, so its "Custom Year Filter" entry filtered nothing and only swapped
# in a misleading "Filtered ..." title.
fig.update_xaxes(rangeslider_visible=True)

fig.show()
fig.write_html("birth_year_vs_score.html")


In [15]:
def groupby_states(actor_df):
    """Derive each actor's US birth state from the free-text 'Birth City' column.

    State names are matched as whole words, case-insensitively. When several
    state names occur in one value, the last one wins, because the state
    follows the city in the data (e.g. 'Mount Vernon New York US').

    This fixes three mislabelling cases of plain substring matching:
    'Arkansas' being overwritten by 'Kansas', 'West Virginia' collapsing to
    'Virginia', and 'Kansas City Missouri' being assigned to 'Kansas'.

    NOTE: values that use a state's name for something else (for example
    'Washington D.C.' or the country Georgia) are still matched to the state.

    Parameters
    ----------
    actor_df : pandas.DataFrame
        Frame with a 'Birth City' column; values are converted to ``str``
        before matching, so missing values are tolerated.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``actor_df`` and a single 'Birth State'
        column holding the full state name, or ``None`` when no state name
        is found.
    """
    states = [
        'Alabama',
        'Alaska',
        'Arizona',
        'Arkansas',
        'California',
        'Colorado',
        'Connecticut',
        'Delaware',
        'Florida',
        'Georgia',
        'Hawaii',
        'Idaho',
        'Illinois',
        'Indiana',
        'Iowa',
        'Kansas',
        'Kentucky',
        'Louisiana',
        'Maine',
        'Maryland',
        'Massachusetts',
        'Michigan',
        'Minnesota',
        'Mississippi',
        'Missouri',
        'Montana',
        'Nebraska',
        'Nevada',
        'New Hampshire',
        'New Jersey',
        'New Mexico',
        'New York',
        'North Carolina',
        'North Dakota',
        'Ohio',
        'Oklahoma',
        'Oregon',
        'Pennsylvania',
        'Rhode Island',
        'South Carolina',
        'South Dakota',
        'Tennessee',
        'Texas',
        'Utah',
        'Vermont',
        'Virginia',
        'Washington',
        'West Virginia',
        'Wisconsin',
        'Wyoming'
        ]

    # Longest names first so a multi-word name is always preferred over a
    # shorter name; \b keeps 'Kansas' from matching inside 'Arkansas'.
    alternatives = '|'.join(
        re.escape(name) for name in sorted(states, key=len, reverse=True)
    )
    state_pattern = re.compile(r'\b(?:' + alternatives + r')\b', flags=re.IGNORECASE)

    # Maps the lower-cased matched text back to the canonical spelling.
    canonical_name = {name.lower(): name for name in states}

    def _last_state_mentioned(birth_place):
        """Return the canonical name of the last state found in the text, else None."""
        found = state_pattern.findall(birth_place)
        return canonical_name[found[-1].lower()] if found else None

    states_df = pd.DataFrame(index=actor_df.index, columns=['Birth State'])
    states_df['Birth State'] = (
        actor_df['Birth City'].astype(str).map(_last_state_mentioned)
    )

    return states_df

In [16]:
# Pair each actor's derived birth state with their success score and keep
# only the rows where both values are present.
states_df = (
    groupby_states(actor_df)
    .assign(**{'Success Score': actor_df['Success Score']})
    .dropna()
)
states_df

Unnamed: 0_level_0,Birth State,Success Score
Actor name,Unnamed: 1_level_1,Unnamed: 2_level_1
Denzel Washington,New York,10.000000
Matt Damon,Massachusetts,9.912690
Tom Hanks,California,9.741555
Eddie Murphy,New York,9.415060
Tom Cruise,New York,9.362255
...,...,...
Lindsay Hollister,Ohio,1.103153
Terrence Howard,Illinois,0.915104
Madonna,Michigan,0.838557
Evelyn Keyes,Texas,0.700949


In [17]:


# Full state name -> two-letter code, as required by plotly's "USA-states" location mode.
state_abbreviations = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Replace full names with their codes in place. Values that are not keys of the
# dictionary (in particular codes produced by an earlier run of this cell) are
# kept as they are, so re-running the cell does not turn the column into NaN
# and empty the map.
states_df["Birth State"] = states_df["Birth State"].map(
    lambda name: state_abbreviations.get(name, name)
)

# Mean success score per state code.
state_data = states_df.groupby("Birth State", as_index=False).agg({"Success Score": "mean"})


# Create a choropleth map
fig = px.choropleth(
    state_data,
    locations="Birth State",
    locationmode="USA-states",  # Interpret locations as two-letter state codes
    color="Success Score",
    scope="usa",  # Restrict map to USA
    title="Average Success Score by Birth State",
    color_continuous_scale="Reds",
    labels={"Success Score": "Avg Success Score"}
)

# Show the map
fig.show()
fig.write_html("state_vs_score.html")