In [1]:
import pandas as pd

# Load the raw TV cast export (path is relative to the notebook's working directory).
df = pd.read_csv('../data/tv_export_data.csv')

# Keep only acting credits whose gender is not 'Undetermined'.
# Both filters are combined into one mask, and .copy() makes the result an
# independent frame so the column assignment below does not raise pandas'
# SettingWithCopyWarning (assigning onto a filtered slice).
is_determined = df['profile_gender'] != 'Undetermined'
is_acting = df['known_for_department'] == 'Acting'
df = df[is_determined & is_acting].copy()

# Combine race and gender into one label, e.g. "asian_Female".
# NOTE(review): the recorded output shows a "_total_" value in profile_race;
# it looks like a pre-aggregated roll-up row rather than a race -- confirm
# before comparing it with the per-race groups.
df['race_gender'] = df['profile_race'] + "_" + df['profile_gender']

# Total episode count for each race/gender combination.
total_episode_count = (
    df.groupby('race_gender')['episode_count']
    .sum()
    .reset_index(name='total_episode_count')
)

# Number of distinct cast members (by cast_id) for each race/gender combination.
total_distinct_individuals = (
    df.groupby('race_gender')['cast_id']
    .nunique()
    .reset_index(name='total_distinct_individuals')
)

# Join the two summaries. Each side has exactly one row per race_gender
# (both come from a groupby on that key), which validate= asserts.
trends = pd.merge(
    total_episode_count,
    total_distinct_individuals,
    on='race_gender',
    validate='one_to_one',
)

# Show the trends
print(trends)


               race_gender  total_episode_count  total_distinct_individuals
0           _total__Female               981164                       19755
1             _total__Male              1988178                       36534
2             asian_Female                48132                        1705
3               asian_Male                94787                        2715
4             black_Female                47874                        1197
5               black_Male               188648                        3929
6            indian_Female                 5893                         104
7              indian_Male                10667                         274
8   latino hispanic_Female                71761                        2001
9     latino hispanic_Male               234245                        3847
10   middle eastern_Female                 5156                         193
11     middle eastern_Male               132649                        3179
12          

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the raw TV cast export (path is relative to the notebook's working directory).
df = pd.read_csv('../data/tv_export_data.csv')

# Keep only acting credits whose gender is not 'Undetermined'.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
is_determined = df['profile_gender'] != 'Undetermined'
is_acting = df['known_for_department'] == 'Acting'
df = df[is_determined & is_acting].copy()

# Parse the season air date and extract the calendar year used as the regressor.
df['season_air_date'] = pd.to_datetime(df['season_air_date'])
df['year'] = df['season_air_date'].dt.year

# Combine race and gender into one label, e.g. "asian_Female".
df['race_gender'] = df['profile_race'] + "_" + df['profile_gender']

# Per (year, race_gender): total episode count and number of distinct cast members.
# The rename is chained instead of using inplace=True so the cell builds
# grouped_df in a single expression.
grouped_df = (
    df.groupby(['year', 'race_gender'])
    .agg({'episode_count': 'sum', 'cast_id': 'nunique'})
    .reset_index()
    .rename(columns={'cast_id': 'distinct_individuals'})
)

# Fit one simple linear regression (episode_count ~ year) per race/gender group.
# groupby(sort=False) yields the groups in order of first appearance -- the same
# order as iterating over .unique() -- without re-scanning the frame per label.
for race_gender, subset in grouped_df.groupby('race_gender', sort=False):
    model = smf.ols(formula='episode_count ~ year', data=subset).fit()
    print(f"Regression results for {race_gender}:")
    print(model.summary())
    print("\n")

# Note: the regression results are printed separately for each race/gender combination.
# Each summary includes the R-squared value (the proportion of variance in the dependent
# variable explained by the independent variable), the coefficients, and other statistics.


Regression results for _total__Female:
                            OLS Regression Results                            
Dep. Variable:          episode_count   R-squared:                       0.548
Model:                            OLS   Adj. R-squared:                  0.540
Method:                 Least Squares   F-statistic:                     75.10
Date:                Mon, 31 Jul 2023   Prob (F-statistic):           2.79e-12
Time:                        11:46:35   Log-Likelihood:                -649.80
No. Observations:                  64   AIC:                             1304.
Df Residuals:                      62   BIC:                             1308.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -7

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the raw TV cast export (path is relative to the notebook's working directory).
df = pd.read_csv('../data/tv_export_data.csv')

# Keep only acting credits whose gender is not 'Undetermined'.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
is_determined = df['profile_gender'] != 'Undetermined'
is_acting = df['known_for_department'] == 'Acting'
df = df[is_determined & is_acting].copy()

# Parse the season air date and extract the calendar year used as the regressor.
df['season_air_date'] = pd.to_datetime(df['season_air_date'])
df['year'] = df['season_air_date'].dt.year

# Total episode count per (year, race, gender).
grouped_df = (
    df.groupby(['year', 'profile_race', 'profile_gender'])
    .agg({'episode_count': 'sum'})
    .reset_index()
)

# Total episode count per (year, race), i.e. summed over genders.
yearly_race_totals = (
    grouped_df.groupby(['year', 'profile_race'])
    .agg({'episode_count': 'sum'})
    .reset_index()
)

# Attach each row's yearly race total. Because both frames have an
# 'episode_count' column, the merge suffixes them:
#   episode_count_x -> count for this (year, race, gender)
#   episode_count_y -> total for this (year, race)
# validate= asserts the totals frame has one row per (year, profile_race).
grouped_df = pd.merge(
    grouped_df,
    yearly_race_totals,
    how='left',
    on=['year', 'profile_race'],
    validate='many_to_one',
)

# Share (in percent) of a race's yearly episode count held by each gender.
grouped_df['percentage'] = grouped_df['episode_count_x'] / grouped_df['episode_count_y'] * 100

# Label each subdemographic group (race + gender), e.g. "asian_Female".
grouped_df['race_gender'] = grouped_df['profile_race'] + "_" + grouped_df['profile_gender']

# Fit one simple linear regression (percentage ~ year) per race/gender group;
# the slope on `year` is the yearly rate of change of that group's share.
# groupby(sort=False) yields the groups in order of first appearance -- the same
# order as iterating over .unique() -- without re-scanning the frame per label.
for race_gender, subset in grouped_df.groupby('race_gender', sort=False):
    model = smf.ols(formula='percentage ~ year', data=subset).fit()
    print(f"Regression results for {race_gender}:")
    print(model.summary())
    print("\n")


Regression results for _total__Female:
                            OLS Regression Results                            
Dep. Variable:             percentage   R-squared:                       0.419
Model:                            OLS   Adj. R-squared:                  0.409
Method:                 Least Squares   F-statistic:                     44.62
Date:                Mon, 31 Jul 2023   Prob (F-statistic):           7.67e-09
Time:                        11:50:52   Log-Likelihood:                -172.59
No. Observations:                  64   AIC:                             349.2
Df Residuals:                      62   BIC:                             353.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -