In [26]:
# Import functions needed for this notebook
import pandas as pd
import statsmodels.api as sm
from statsmodels.genmod.families import NegativeBinomial
from statsmodels.discrete.discrete_model import NegativeBinomial as CountNB
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [27]:
# Load the dataset
df = pd.read_excel("thesis_dataset.xlsx")
df.head()

Unnamed: 0,player_name,highest_ranking,country,mother_occupation,father_occupation,sibling_info,sibling_player,titles_won,association,year_turned_pro,father_occupation_std,mother_occupation_std,father_ISEI,mother_ISEI,family_ISEI
0,Roger Federer,1,Switzerland,employee at pharmaceutical firm,executive at pharmaceutical firm,True,False,103,ATP,1998,Managing directors and chief executives,Pharmaceutical technicians and assistants,70,40,55
1,Taylor Fritz,4,USA,professional tennis player,professional tennis player,True,False,8,ATP,2015,Athletes and sports players,Athletes and sports players,46,46,46
2,Novak Djokovic,1,Serbia,entrepreneur,professional skier,True,True,99,ATP,2003,Athletes and sports players,Managing directors and chief executives,46,70,58
3,Jessica Pegula,3,USA,CEO,business owner,True,False,8,WTA,2009,Managing directors and chief executives,Managing directors and chief executives,70,70,70
4,Grigor Dimitrov,3,Bulgaria,volleyball player,tennis coach,False,False,9,ATP,2008,Athletes and sports players,Athletes and sports players,46,46,46


In [28]:
# Convert boolean columns to integers
df['sibling_info'] = df['sibling_info'].astype(int)
df['sibling_player'] = df['sibling_player'].astype(int)

df.head()

Unnamed: 0,player_name,highest_ranking,country,mother_occupation,father_occupation,sibling_info,sibling_player,titles_won,association,year_turned_pro,father_occupation_std,mother_occupation_std,father_ISEI,mother_ISEI,family_ISEI
0,Roger Federer,1,Switzerland,employee at pharmaceutical firm,executive at pharmaceutical firm,1,0,103,ATP,1998,Managing directors and chief executives,Pharmaceutical technicians and assistants,70,40,55
1,Taylor Fritz,4,USA,professional tennis player,professional tennis player,1,0,8,ATP,2015,Athletes and sports players,Athletes and sports players,46,46,46
2,Novak Djokovic,1,Serbia,entrepreneur,professional skier,1,1,99,ATP,2003,Athletes and sports players,Managing directors and chief executives,46,70,58
3,Jessica Pegula,3,USA,CEO,business owner,1,0,8,WTA,2009,Managing directors and chief executives,Managing directors and chief executives,70,70,70
4,Grigor Dimitrov,3,Bulgaria,volleyball player,tennis coach,0,0,9,ATP,2008,Athletes and sports players,Athletes and sports players,46,46,46


In [29]:
# Create the association dummy (ATP baseline, keep WTA)
df['association_WTA'] = (df['association'] == 'WTA').astype(int)

In [30]:
# Get the top 5 countries by the number of players and use them as dummy variables
top_countries = df['country'].value_counts().head(5).index.tolist()
for c in top_countries:
    df[f'country_{c.replace(" ","_")}'] = (df['country']==c).astype(int)

In [31]:
df.head()

Unnamed: 0,player_name,highest_ranking,country,mother_occupation,father_occupation,sibling_info,sibling_player,titles_won,association,year_turned_pro,...,mother_occupation_std,father_ISEI,mother_ISEI,family_ISEI,association_WTA,country_USA,country_Russia,country_Australia,country_France,country_Spain
0,Roger Federer,1,Switzerland,employee at pharmaceutical firm,executive at pharmaceutical firm,1,0,103,ATP,1998,...,Pharmaceutical technicians and assistants,70,40,55,0,0,0,0,0,0
1,Taylor Fritz,4,USA,professional tennis player,professional tennis player,1,0,8,ATP,2015,...,Athletes and sports players,46,46,46,0,1,0,0,0,0
2,Novak Djokovic,1,Serbia,entrepreneur,professional skier,1,1,99,ATP,2003,...,Managing directors and chief executives,46,70,58,0,0,0,0,0,0
3,Jessica Pegula,3,USA,CEO,business owner,1,0,8,WTA,2009,...,Managing directors and chief executives,70,70,70,1,1,0,0,0,0
4,Grigor Dimitrov,3,Bulgaria,volleyball player,tennis coach,0,0,9,ATP,2008,...,Athletes and sports players,46,46,46,0,0,0,0,0,0


#### Regression models having `titles_won` as the dependent variable

In [32]:
# Create and fit a regression model where the dependent variable is 'titles_won' and the independent variable is 'family_ISEI'
model_family_ISEI = sm.GLM.from_formula("titles_won ~ family_ISEI", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_family_ISEI.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -435.26
Date:                Sat, 07 Jun 2025   Deviance:                       125.78
Time:                        19:06:30   Pearson chi2:                     138.
No. Iterations:                     9   Pseudo R-squ. (CS):            0.03888
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       4.2489      0.502      8.458      



In [33]:
# Create and fit a regression model where the dependent variable is 'titles_won' and the independent variable is 'sibling_info'
model_sibling_info = sm.GLM.from_formula("titles_won ~ sibling_info", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_sibling_info.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -434.83
Date:                Sat, 07 Jun 2025   Deviance:                       124.92
Time:                        19:06:30   Pearson chi2:                     154.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.04713
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        3.9416      0.319     12.345   



In [34]:
# Create and fit a regression model where the dependent variable is 'titles_won' and the independent variable is 'sibling_player'
model_sibling_player = sm.GLM.from_formula("titles_won ~ sibling_player", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_sibling_player.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -436.95
Date:                Sat, 07 Jun 2025   Deviance:                       129.17
Time:                        19:06:30   Pearson chi2:                     150.
No. Iterations:                     5   Pseudo R-squ. (CS):           0.005704
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          3.4307      0.145     23.



In [35]:
# Create and fit a regression model where the dependent variable is 'titles_won' and the independent variable is 'association_WTA'
model_association_WTA = sm.GLM.from_formula("titles_won ~ association_WTA", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_association_WTA.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -436.55
Date:                Sat, 07 Jun 2025   Deviance:                       128.37
Time:                        19:06:30   Pearson chi2:                     146.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.01365
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           3.2405      0.137     



In [36]:
# Create and fit a regression model where the dependent variable is `titles_won` and the independent variable is each of the top 5 countries one at a time
for country in top_countries:
    model = sm.GLM.from_formula(f"titles_won ~ country_{country.replace(' ', '_')}", data=df, family=NegativeBinomial()).fit()
    print(f"\nModel for {country}:")
    print(model.summary())


Model for USA:
                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -436.31
Date:                Sat, 07 Jun 2025   Deviance:                       127.89
Time:                        19:06:30   Pearson chi2:                     155.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.01839
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       3.2653      0.118 



In [37]:
# Sequentially add predictors to the model, one at a time
model_step1 = sm.GLM.from_formula("titles_won ~ family_ISEI", data=df, family=NegativeBinomial()).fit()



In [38]:
# Calculate the VIF for model_step1 (without constant)
print("VIF for model_step1:")
vif_step1 = pd.DataFrame({
    'Variable': model_step1.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(model_step1.model.exog, i) for i in range(1, model_step1.model.exog.shape[1])]
})
print(vif_step1)

VIF for model_step1:
      Variable  VIF
0  family_ISEI  1.0


In [39]:
# Add sibling_info to the model by refitting with an updated formula
model_step2 = sm.GLM.from_formula("titles_won ~ family_ISEI + sibling_info", data=df, family=NegativeBinomial()).fit()
# Print the summary of the updated model
print("\nUpdated model with sibling_info:")
print(model_step2.summary())


Updated model with sibling_info:
                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       97
Model Family:        NegativeBinomial   Df Model:                            2
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -433.10
Date:                Sat, 07 Jun 2025   Deviance:                       121.46
Time:                        19:06:30   Pearson chi2:                     135.
No. Iterations:                     9   Pseudo R-squ. (CS):            0.07948
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     



In [40]:
# Calculate the VIF for model_step2 (without constant)
print("VIF for model_step2:")
vif_step2 = pd.DataFrame({
    'Variable': model_step2.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(model_step2.model.exog, i) for i in range(1, model_step2.model.exog.shape[1])]
})
print(vif_step2)

VIF for model_step2:
       Variable       VIF
0   family_ISEI  1.012723
1  sibling_info  1.012723


In [41]:
# Add sibling_player to the model by refitting with an updated formula
model_step3 = sm.GLM.from_formula("titles_won ~ family_ISEI + sibling_info + sibling_player", data=df, family=NegativeBinomial()).fit()
# Print the summary of the updated model
print("\nUpdated model with sibling_player:\n")
print(model_step3.summary())


Updated model with sibling_player:

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       96
Model Family:        NegativeBinomial   Df Model:                            3
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -433.09
Date:                Sat, 07 Jun 2025   Deviance:                       121.44
Time:                        19:06:30   Pearson chi2:                     136.
No. Iterations:                    11   Pseudo R-squ. (CS):            0.07972
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Interce



In [42]:
# Calculate the VIF for model_step3 (without constant)
print("VIF for model_step3:")
vif_step3 = pd.DataFrame({
    'Variable': model_step3.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(model_step3.model.exog, i) for i in range(1, model_step3.model.exog.shape[1])]
})
print(vif_step3)

VIF for model_step3:
         Variable       VIF
0     family_ISEI  1.021879
1    sibling_info  1.152581
2  sibling_player  1.140992


In [43]:
# Add association_WTA to the model by refitting with an updated formula
model_step4 = sm.GLM.from_formula("titles_won ~ family_ISEI + sibling_info + sibling_player + association_WTA", data=df, family=NegativeBinomial()).fit()
# Print the summary of the updated model
print("\nUpdated model with association_WTA:\n")
print(model_step4.summary())


Updated model with association_WTA:

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       95
Model Family:        NegativeBinomial   Df Model:                            4
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -432.75
Date:                Sat, 07 Jun 2025   Deviance:                       120.77
Time:                        19:06:30   Pearson chi2:                     136.
No. Iterations:                    14   Pseudo R-squ. (CS):            0.08586
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Inte



In [44]:
# Add country dummies to the model by refitting with an updated formula
country_dummies = " + ".join([f"country_{c.replace(' ', '_')}" for c in top_countries])
model_step5 = sm.GLM.from_formula(f"titles_won ~ family_ISEI + sibling_info + sibling_player + association_WTA + {country_dummies}", data=df, family=NegativeBinomial()).fit()
# Print the summary of the final model
print("\nFinal model with country dummies:\n")
print(model_step5.summary())




Final model with country dummies:

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       90
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -428.77
Date:                Sat, 07 Jun 2025   Deviance:                       112.81
Time:                        19:06:30   Pearson chi2:                     110.
No. Iterations:                    14   Pseudo R-squ. (CS):             0.1558
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
In

In [45]:
# Create the full model with all predictors excluding `sibling_player` from the independent variables
full_model = sm.GLM.from_formula(f"titles_won ~ family_ISEI + sibling_info + association_WTA + {country_dummies}", data=df, family=NegativeBinomial()).fit()

# Print the summary of the full model
print("\nFull model excluding sibling_player:\n")
print(full_model.summary())




Full model excluding sibling_player:

                 Generalized Linear Model Regression Results                  
Dep. Variable:             titles_won   No. Observations:                  100
Model:                            GLM   Df Residuals:                       91
Model Family:        NegativeBinomial   Df Model:                            8
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -428.81
Date:                Sat, 07 Jun 2025   Deviance:                       112.87
Time:                        19:06:30   Pearson chi2:                     111.
No. Iterations:                    14   Pseudo R-squ. (CS):             0.1552
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [46]:
# Calculate the VIF for full_model (excluding constant)
vif_full = pd.DataFrame({
    'Variable': full_model.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(full_model.model.exog, i) for i in range(1, full_model.model.exog.shape[1])]
})
print(vif_full)

            Variable       VIF
0        family_ISEI  1.037877
1       sibling_info  1.063542
2    association_WTA  1.031353
3        country_USA  1.156924
4     country_Russia  1.063385
5  country_Australia  1.073168
6     country_France  1.072167
7      country_Spain  1.056629


In [47]:
# Define a list of formulas, building up predictors step by step
formulas = [
    "titles_won ~ family_ISEI",
    "titles_won ~ family_ISEI + sibling_info",
    "titles_won ~ family_ISEI + sibling_info + association_WTA",
    f"titles_won ~ family_ISEI + sibling_info + association_WTA + {country_dummies}"
]

In [48]:
# Create a dictionary to collect results (model name, formula, AIC, BIC, the coefficients and standard errors for each variable)
results_summary = {
    'Model': [],
    'Formula': [],
    'AIC': [],
    'BIC': [],
    'Coef_family_ISEI': [],
    'SE_family_ISEI': [],
    'Coef_sibling_info': [],
    'SE_sibling_info': [],
    'Coef_sibling_player': [],
    'SE_sibling_player': []
}

In [49]:
# Fit each model and collect results
for idx, formula in enumerate(formulas, start=1):
    model = sm.GLM.from_formula(formula, data=df, family=NegativeBinomial()).fit()
    
    results_summary['Model'].append(f"Model {idx}")
    results_summary['Formula'].append(formula)
    results_summary['AIC'].append(model.aic)
    results_summary['BIC'].append(model.bic)
    
    # For each key variable, record coef and SE if present; otherwise NaN
    for var in ['family_ISEI', 'sibling_info', 'sibling_player']:
        if var in model.params.index:
            results_summary[f'Coef_{var}'].append(model.params[var])
            results_summary[f'SE_{var}'].append(model.bse[var])
        else:
            results_summary[f'Coef_{var}'].append(float('nan'))
            results_summary[f'SE_{var}'].append(float('nan'))



In [50]:
# Convert the results summary to a DataFrame
results_df = pd.DataFrame(results_summary)

# Display the results DataFrame
print("\nResults Summary DataFrame:")
print(results_df)


Results Summary DataFrame:
     Model                                            Formula         AIC  \
0  Model 1                           titles_won ~ family_ISEI  874.515832   
1  Model 2            titles_won ~ family_ISEI + sibling_info  872.199130   
2  Model 3  titles_won ~ family_ISEI + sibling_info + asso...  873.602354   
3  Model 4  titles_won ~ family_ISEI + sibling_info + asso...  875.611514   

          BIC  Coef_family_ISEI  SE_family_ISEI  Coef_sibling_info  \
0 -325.528282         -0.017503        0.009431                NaN   
1 -325.239815         -0.016188        0.009499          -0.646807   
2 -321.231420         -0.014516        0.009497          -0.652505   
3 -306.196409         -0.013388        0.009611          -0.743521   

   SE_sibling_info  Coef_sibling_player  SE_sibling_player  
0              NaN                  NaN                NaN  
1         0.338999                  NaN                NaN  
2         0.339153                  NaN             

#### Regression models having `highest_ranking` as dependent variable

In [52]:
# Create and fit a regression model where the dependent variable is 'highest_ranking' and the independent variable is 'family_ISEI'
model_family_ISEI_2 = sm.GLM.from_formula("highest_ranking ~ family_ISEI", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_family_ISEI_2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -277.31
Date:                Sat, 07 Jun 2025   Deviance:                       109.21
Time:                        19:07:17   Pearson chi2:                     233.
No. Iterations:                     8   Pseudo R-squ. (CS):            0.01148
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       1.0258      0.536      1.914      



In [53]:
# Create and fit a regression model where the dependent variable is 'highest_ranking' and the independent variable is 'sibling_info'
model_sibling_info_2 = sm.GLM.from_formula("highest_ranking ~ sibling_info", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_sibling_info_2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -275.27
Date:                Sat, 07 Jun 2025   Deviance:                       105.13
Time:                        19:07:54   Pearson chi2:                     230.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.05092
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.7885      0.381      2.067   



In [54]:
# Create and fit a regression model where the dependent variable is 'highest_ranking' and the independent variable is 'sibling_player'
model_sibling_player_2 = sm.GLM.from_formula("highest_ranking ~ sibling_player", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_sibling_player_2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -277.46
Date:                Sat, 07 Jun 2025   Deviance:                       109.52
Time:                        19:08:38   Pearson chi2:                     229.
No. Iterations:                     6   Pseudo R-squ. (CS):           0.008419
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          1.5846      0.157     10.



In [55]:
# Create and fit a regression model where the dependent variable is 'highest_ranking' and the independent variable is 'association_WTA'
model_association_WTA_2 = sm.GLM.from_formula("highest_ranking ~ association_WTA", data=df, family=NegativeBinomial()).fit()

# Print the summary of the model
print(model_association_WTA_2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -270.31
Date:                Sat, 07 Jun 2025   Deviance:                       95.206
Time:                        19:09:06   Pearson chi2:                     175.
No. Iterations:                     6   Pseudo R-squ. (CS):             0.1406
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           1.9990      0.144     



In [56]:
# Create and fit a regression model where the dependent variable is `highest_ranking` and the independent variable is each of the top 5 countries one at a time
for country in top_countries:
    model = sm.GLM.from_formula(f"highest_ranking ~ country_{country.replace(' ', '_')}", data=df, family=NegativeBinomial()).fit()
    print(f"\nModel for {country}:")
    print(model.summary())


Model for USA:
                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:        NegativeBinomial   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -277.88
Date:                Sat, 07 Jun 2025   Deviance:                       110.35
Time:                        19:10:20   Pearson chi2:                     252.
No. Iterations:                     5   Pseudo R-squ. (CS):          7.118e-05
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       1.6973      0.126 



In [57]:
# Sequentially add predictors to the model, one at a time
model2_step1 = sm.GLM.from_formula("highest_ranking ~ family_ISEI", data=df, family=NegativeBinomial()).fit()



In [58]:
# Calculate the VIF for model2_step1 (without constant)
print("VIF for model2_step1:")
vif_step1 = pd.DataFrame({
    'Variable': model2_step1.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(model2_step1.model.exog, i) for i in range(1, model2_step1.model.exog.shape[1])]
})
print(vif_step1)

VIF for model2_step1:
      Variable  VIF
0  family_ISEI  1.0


In [59]:
# Add sibling_info to the model by refitting with an updated formula
model2_step2 = sm.GLM.from_formula("highest_ranking ~ family_ISEI + sibling_info", data=df, family=NegativeBinomial()).fit()
# Print the summary of the updated model
print("\nUpdated model with sibling_info:")
print(model2_step2.summary())


Updated model with sibling_info:
                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       97
Model Family:        NegativeBinomial   Df Model:                            2
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -274.80
Date:                Sat, 07 Jun 2025   Deviance:                       104.20
Time:                        19:12:55   Pearson chi2:                     213.
No. Iterations:                     8   Pseudo R-squ. (CS):            0.05979
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     



In [60]:
# Calculate the VIF for model2_step2 (without constant)
print("VIF for model2_step2:")
vif_step2 = pd.DataFrame({
    'Variable': model2_step2.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(model2_step2.model.exog, i) for i in range(1, model2_step2.model.exog.shape[1])]
})
print(vif_step2)

VIF for model2_step2:
       Variable       VIF
0   family_ISEI  1.012723
1  sibling_info  1.012723


In [61]:
# Add sibling_player to the model by refitting with an updated formula
model2_step3 = sm.GLM.from_formula("highest_ranking ~ family_ISEI + sibling_info + sibling_player", data=df, family=NegativeBinomial()).fit()
# Print the summary of the updated model
print("\nUpdated model with sibling_player:\n")
print(model2_step3.summary())


Updated model with sibling_player:

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       96
Model Family:        NegativeBinomial   Df Model:                            3
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -274.76
Date:                Sat, 07 Jun 2025   Deviance:                       104.12
Time:                        19:14:12   Pearson chi2:                     207.
No. Iterations:                     9   Pseudo R-squ. (CS):            0.06051
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Interce



In [62]:
# Calculate the VIF for model2_step3 (without constant)
print("VIF for model2_step3:")
vif_step3 = pd.DataFrame({
    'Variable': model2_step3.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(model2_step3.model.exog, i) for i in range(1, model2_step3.model.exog.shape[1])]
})
print(vif_step3)

VIF for model2_step3:
         Variable       VIF
0     family_ISEI  1.021879
1    sibling_info  1.152581
2  sibling_player  1.140992


In [63]:
# Add association_WTA to the model by refitting with an updated formula
model2_step4 = sm.GLM.from_formula("highest_ranking ~ family_ISEI + sibling_info + sibling_player + association_WTA", data=df, family=NegativeBinomial()).fit()
# Print the summary of the updated model
print("\nUpdated model with association_WTA:\n")
print(model2_step4.summary())


Updated model with association_WTA:

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       95
Model Family:        NegativeBinomial   Df Model:                            4
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -268.51
Date:                Sat, 07 Jun 2025   Deviance:                       91.621
Time:                        19:15:24   Pearson chi2:                     165.
No. Iterations:                     8   Pseudo R-squ. (CS):             0.1709
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Inte



In [64]:
# Add country dummies to the model by refitting with an updated formula
country_dummies = " + ".join([f"country_{c.replace(' ', '_')}" for c in top_countries])
model2_step5 = sm.GLM.from_formula(f"highest_ranking ~ family_ISEI + sibling_info + sibling_player + association_WTA + {country_dummies}", data=df, family=NegativeBinomial()).fit()
# Print the summary of the final model
print("\nFinal model with country dummies:\n")
print(model2_step5.summary())


Final model with country dummies:

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       90
Model Family:        NegativeBinomial   Df Model:                            9
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -265.14
Date:                Sat, 07 Jun 2025   Deviance:                       84.867
Time:                        19:16:06   Pearson chi2:                     145.
No. Iterations:                     9   Pseudo R-squ. (CS):             0.2250
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
In



In [65]:
# Create the full model with all predictors excluding `sibling_player` from the independent variables
full_model2 = sm.GLM.from_formula(f"highest_ranking ~ family_ISEI + sibling_info + association_WTA + {country_dummies}", data=df, family=NegativeBinomial()).fit()

# Print the summary of the full model
print("\nFull model excluding sibling_player:\n")
print(full_model2.summary())


Full model excluding sibling_player:

                 Generalized Linear Model Regression Results                  
Dep. Variable:        highest_ranking   No. Observations:                  100
Model:                            GLM   Df Residuals:                       91
Model Family:        NegativeBinomial   Df Model:                            8
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -265.17
Date:                Sat, 07 Jun 2025   Deviance:                       84.943
Time:                        19:16:53   Pearson chi2:                     147.
No. Iterations:                     9   Pseudo R-squ. (CS):             0.2245
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------



In [66]:
# Calculate the VIF for full_model (excluding constant)
vif_full = pd.DataFrame({
    'Variable': full_model2.model.exog_names[1:],  # Exclude the constant
    'VIF': [variance_inflation_factor(full_model2.model.exog, i) for i in range(1, full_model2.model.exog.shape[1])]
})
print(vif_full)

            Variable       VIF
0        family_ISEI  1.037877
1       sibling_info  1.063542
2    association_WTA  1.031353
3        country_USA  1.156924
4     country_Russia  1.063385
5  country_Australia  1.073168
6     country_France  1.072167
7      country_Spain  1.056629


In [67]:
# Define a list of formulas, building up predictors step by step
formulas2 = [
    "highest_ranking ~ family_ISEI",
    "highest_ranking ~ family_ISEI + sibling_info",
    "highest_ranking ~ family_ISEI + sibling_info + association_WTA",
    f"highest_ranking ~ family_ISEI + sibling_info + association_WTA + {country_dummies}"
]

In [68]:
# Create a dictionary to collect results (model name, formula, AIC, BIC, the coefficients and standard errors for each variable)
results_summary2 = {
    'Model': [],
    'Formula': [],
    'AIC': [],
    'BIC': [],
    'Coef_family_ISEI': [],
    'SE_family_ISEI': [],
    'Coef_sibling_info': [],
    'SE_sibling_info': [],
    'Coef_sibling_player': [],
    'SE_sibling_player': []
}

In [69]:
# Fit each model and collect results
for idx, formula in enumerate(formulas2, start=1):
    model = sm.GLM.from_formula(formula, data=df, family=NegativeBinomial()).fit()

    results_summary2['Model'].append(f"Model {idx}")
    results_summary2['Formula'].append(formula)
    results_summary2['AIC'].append(model.aic)
    results_summary2['BIC'].append(model.bic)

    # For each key variable, record coef and SE if present; otherwise NaN
    for var in ['family_ISEI', 'sibling_info', 'sibling_player']:
        if var in model.params.index:
            results_summary2[f'Coef_{var}'].append(model.params[var])
            results_summary2[f'SE_{var}'].append(model.bse[var])
        else:
            results_summary2[f'Coef_{var}'].append(float('nan'))
            results_summary2[f'SE_{var}'].append(float('nan'))



In [70]:
# Convert the results summary to a DataFrame
results_df2 = pd.DataFrame(results_summary2)

# Display the results DataFrame
print("\nResults Summary DataFrame:")
print(results_df2)


Results Summary DataFrame:
     Model                                            Formula         AIC  \
0  Model 1                      highest_ranking ~ family_ISEI  558.612350   
1  Model 2       highest_ranking ~ family_ISEI + sibling_info  555.602342   
2  Model 3  highest_ranking ~ family_ISEI + sibling_info +...  545.045810   
3  Model 4  highest_ranking ~ family_ISEI + sibling_info +...  548.348862   

          BIC  Coef_family_ISEI  SE_family_ISEI  Coef_sibling_info  \
0 -342.100341          0.012621        0.010000                NaN   
1 -342.505178          0.011480        0.010073           0.947346   
2 -350.456540          0.011392        0.010203           0.674215   
3 -334.127637          0.011676        0.010433           0.703267   

   SE_sibling_info  Coef_sibling_player  SE_sibling_player  
0              NaN                  NaN                NaN  
1         0.401949                  NaN                NaN  
2         0.397316                  NaN             