# Individual and gender inequality in computer science: A career study of cohorts from 1970 to 2000

## Part 4: Prediction

In this notebook, we run regularized regression models using the engineered cohort, gender, early achievement, and social support features. First, in table 2, we predict whether or not an author will have dropped out at career age 15 (logistic regression). Second, in table 3, we predict the success of an author at career age 15 (elastic net), once for all authors and once with dropouts removed.

---

### 1. Imports

Many of the custom functions we need are stored in a utilities file.

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, KFold
from utils import *

### 2. Load data

Load feature dataframe from the 'data' directory:

In [None]:
# Read the gzip-compressed feature table; the path is relative to the
# notebook's working directory.
features = pd.read_csv('../data/features.csv.gz')

### 3. Preprocess data

Reduce observations to authors from cohorts 1970 to 2000:

In [None]:
# Cohort start years are obtained from the `utils` helper before any rows are removed.
COHORT_START_YEARS = get_start_years(1970, 2000, features)
# Keep rows with career_length >= 1 whose cohort is one of the selected start years.
in_scope = (features.career_length >= 1) & features.cohort.isin(COHORT_START_YEARS)
features = features[in_scope]
print('Number of authors:', len(features))

Construct dataframe with dropouts removed:

In [None]:
# Independent copy restricted to the rows whose `dropout` flag equals False.
stayed_mask = features['dropout'].eq(False)
features_stayed = features.loc[stayed_mask].copy()
print('Number of authors (dropouts removed):', len(features_stayed))

Get fraction of dropouts:

In [None]:
# Per-cohort share of dropouts, laid out as a single-row frame (one column per cohort).
dropout_by_cohort = features.groupby('cohort')['dropout']
dropped_percent = (dropout_by_cohort.sum() / dropout_by_cohort.count()).to_frame().T

# Share of dropouts with all cohorts pooled together.
dropout_flags = features['dropout']
dropped_percent_agg = dropout_flags.sum() / dropout_flags.count()

### 4. Linear regression

#### 4.1. Provide functions

The inner workings of the predictions are implemented in these functions:

In [None]:
def make_cols_lists(INCLUDE_BASELINE, INCLUDE_GENDER, INCLUDE_ACHIEVEMENT, INCLUDE_SOCIAL, REMOVE_NONE_AUTHORS, dep_var):
    """Assemble the column names used by one model variant.

    Each truthy ``INCLUDE_*`` flag switches on one group of feature columns.
    The dependent variable is appended last: to the categorical list when it
    is ``'dropout'``, otherwise to the list of columns to be scaled.
    ``REMOVE_NONE_AUTHORS`` is part of the signature but is not used here.

    Returns:
        tuple: ``(cols_std, categorical_cols)``, two lists of column names.
    """
    achievement_cols = ['productivity', 'productivity_first', 'impact', 'top_source']
    social_cols = ['collaboration_network', 'team_size', 'senior_support']

    cols_std = ['cohort'] if INCLUDE_BASELINE else []
    categorical_cols = ['gender'] if INCLUDE_GENDER else []
    if INCLUDE_ACHIEVEMENT:
        cols_std.extend(achievement_cols)
    if INCLUDE_SOCIAL:
        cols_std.extend(social_cols)

    # The target travels with whichever list matches its type.
    target_list = categorical_cols if dep_var == 'dropout' else cols_std
    target_list.append(dep_var)
    return cols_std, categorical_cols

def prepare_data(features, cols_std, categorical_cols, REMOVE_NONE_AUTHORS, aggr=False):
    """Build the model matrix (features plus dependent variable).

    Args:
        features: Author-level feature frame; must contain a ``cohort`` column
            and every column named in ``cols_std`` / ``categorical_cols``.
        cols_std: Columns passed through ``scale_columns`` (a ``utils``
            helper; presumably standardization -- confirm there).
        categorical_cols: Columns passed through ``pd.get_dummies``.
        REMOVE_NONE_AUTHORS: If truthy, the ``gender_none`` dummy column is
            removed from the result.
        aggr: If true, scale over all rows at once; otherwise scale each
            cohort in ``COHORT_START_YEARS`` separately.

    Returns:
        pandas.DataFrame: The scaled and dummy-encoded columns, plus a
        ``cohort`` column holding the raw (unscaled) cohort values.
    """
    X = features[features.cohort.isin(COHORT_START_YEARS)].copy()
    if not aggr:
        for year in COHORT_START_YEARS:
            in_cohort = X.cohort == year
            X.loc[in_cohort, cols_std] = scale_columns(X.loc[in_cohort, cols_std])
    else:
        X[cols_std] = scale_columns(X[cols_std])
    if categorical_cols:
        cat_cols = pd.get_dummies(X[categorical_cols])
        X = X[cols_std].join(cat_cols)
    else:
        X = X[cols_std]
    if REMOVE_NONE_AUTHORS:
        # DataFrame.drop returns a new frame, so the result must be kept.
        # errors='ignore' covers model variants without gender dummies.
        X = X.drop('gender_none', axis=1, errors='ignore')
    # Overwrite any scaled cohort with the raw value (aligned on the index).
    X['cohort'] = features['cohort']
    return X

def run_elastic_net_aggr(features, cols_std, categorical_cols, INCLUDE_YEAR, REMOVE_NONE_AUTHORS, dep_var):
    """Fit one model on all cohorts pooled together and return its summary.

    Args:
        features: Author-level feature frame.
        cols_std: Columns to scale (see ``prepare_data``).
        categorical_cols: Columns to dummy-encode (see ``prepare_data``).
        INCLUDE_YEAR: If falsy, the ``cohort`` column is removed from the
            predictors before fitting.
        REMOVE_NONE_AUTHORS: Forwarded to ``prepare_data``.
        dep_var: Name of the dependent-variable column.

    Returns:
        pandas.DataFrame: Output of ``run_elastic_net`` indexed by row label
        (single value column ``1``). For ``dep_var == 'dropout'`` an extra
        ``drop_percentage`` row holding the module-level
        ``dropped_percent_agg`` is added.
    """
    X = prepare_data(features, cols_std, categorical_cols, REMOVE_NONE_AUTHORS, aggr=True)
    Y = X[dep_var].copy()
    X = X.drop(dep_var, axis=1)
    if not INCLUDE_YEAR:
        X = X.drop('cohort', axis=1)
    feat_table = run_elastic_net(X, Y)
    feat_table = feat_table.set_index(0)
    if dep_var == 'dropout':
        drop_row = pd.DataFrame(index=['drop_percentage'], data=[dropped_percent_agg], columns=[1])
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        feat_table = pd.concat([feat_table, drop_row])
    return feat_table

def run_elastic_net(X, y):
    """Cross-validate a regularized linear model and summarize it as a table.

    A target with exactly two distinct values is fitted with
    ``LogisticRegressionCV`` (L2 penalty); any other target is fitted with
    ``ElasticNetCV``. Both are evaluated on a shuffled 10-fold split
    (seed 42), and the fitted estimator of every fold is kept.

    Args:
        X: Predictor frame. If empty, a single constant column named
            ``'dummy'`` is substituted.
        y: Target series.

    Returns:
        pandas.DataFrame: Two columns (``0``: row label, ``1``: value). One
        row per predictor formatted as ``mean(std)`` of its coefficient over
        the folds, followed by the mean intercept, the mean test scores and
        the number of observations (``cohort_size``).
    """
    if X.empty:
        X = pd.DataFrame(1, index=np.arange(len(y)), columns=['dummy'])
    folds = KFold(10, shuffle=True, random_state=42)
    decimals = 2

    if y.nunique() == 2:
        y = y.astype(int)
        cv_out = cross_validate(
            LogisticRegressionCV(cv=10, penalty='l2', max_iter=200),
            X,
            y,
            scoring=['f1_micro', 'f1_macro', 'f1_weighted', 'average_precision'],
            cv=folds,
            return_estimator=True,
            return_train_score=False,
        )
        # For a binary target the coefficient matrix has a single row.
        coef_rows = [est.coef_[0] for est in cv_out['estimator']]
        summary = [
            ('f1_micro', np.mean(cv_out['test_f1_micro'])),
            ('f1_macro', np.mean(cv_out['test_f1_macro'])),
            ('f1_weighted', np.mean(cv_out['test_f1_weighted'])),
            ('avg_precision', np.mean(cv_out['test_average_precision'])),
        ]
    else:
        scorers = {
            'r2': make_scorer(r2_score),
            'neg_mean_squared_error': make_scorer(mean_squared_error),
            'adj_r2': make_scorer(adjusted_r2, num_feat=X.shape[1]),
        }
        cv_out = cross_validate(
            ElasticNetCV(cv=10),
            X,
            y,
            scoring=scorers,
            cv=folds,
            return_estimator=True,
            return_train_score=False,
        )
        coef_rows = [est.coef_ for est in cv_out['estimator']]
        summary = [
            ('r2', np.mean(cv_out['test_r2'])),
            ('adj_r2', np.mean(cv_out['test_adj_r2'])),
            ('neg_mean_squared_error', abs(np.mean(cv_out['test_neg_mean_squared_error']))),
        ]

    fold_coefs = pd.DataFrame(coef_rows, columns=X.columns)
    mean_intercept = np.mean([est.intercept_ for est in cv_out['estimator']])
    coef_means = np.round(fold_coefs.mean().values, decimals)
    coef_stds = np.round(fold_coefs.std().values, decimals)

    # Coefficient rows first, then intercept, scores and sample size.
    values = [f'{mean}({std})' for mean, std in zip(coef_means, coef_stds)]
    values.append(np.round(mean_intercept, decimals))
    values.extend(np.round(score, decimals) for _, score in summary)
    values.append(len(y))

    labels = list(X.columns) + ['intercept'] + [label for label, _ in summary] + ['cohort_size']
    return pd.DataFrame(list(zip(labels, values)))

def get_baseline_vars():
    """Return the flag tuple of the baseline model: only the baseline flag is set.

    Order: (INCLUDE_BASELINE, INCLUDE_GENDER, INCLUDE_ACHIEVEMENT,
    INCLUDE_SOCIAL, REMOVE_NONE_AUTHORS).
    """
    return 1, 0, 0, 0, 0

def get_gender_vars():
    """Return the flag tuple of the gender model: baseline and gender flags set.

    Order: (INCLUDE_BASELINE, INCLUDE_GENDER, INCLUDE_ACHIEVEMENT,
    INCLUDE_SOCIAL, REMOVE_NONE_AUTHORS).
    """
    return 1, 1, 0, 0, 0

def get_achievement_vars():
    """Return the flag tuple of the early-achievement model.

    Baseline, gender and achievement flags are set.
    Order: (INCLUDE_BASELINE, INCLUDE_GENDER, INCLUDE_ACHIEVEMENT,
    INCLUDE_SOCIAL, REMOVE_NONE_AUTHORS).
    """
    return 1, 1, 1, 0, 0

def get_social_vars():
    """Return the flag tuple of the social-support model: all feature groups on.

    Order: (INCLUDE_BASELINE, INCLUDE_GENDER, INCLUDE_ACHIEVEMENT,
    INCLUDE_SOCIAL, REMOVE_NONE_AUTHORS).
    """
    return 1, 1, 1, 1, 0

def elastic_agg(features, params_func, DV):
    """Fit the pooled model for one model variant.

    Args:
        features: Author-level feature frame.
        params_func: Zero-argument callable returning the five model flags
            (one of the ``get_*_vars`` functions).
        DV: Name of the dependent-variable column.

    Returns:
        The summary table produced by ``run_elastic_net_aggr``.
    """
    flags = params_func()
    std_cols, cat_cols = make_cols_lists(*flags, DV)
    # flags[0] is INCLUDE_BASELINE (decides whether cohort stays a predictor),
    # flags[4] is REMOVE_NONE_AUTHORS.
    return run_elastic_net_aggr(features, std_cols, cat_cols, flags[0], flags[4], DV)

def elastic_agg_all(features, DV):
    """Fit all four nested model variants and combine them into one table.

    Args:
        features: Author-level feature frame.
        DV: Name of the dependent-variable column. ``'dropout'`` selects the
            classification layout of the table; anything else selects the
            regression layout.

    Returns:
        pandas.DataFrame: One column per model variant, rows in a fixed
        order with human-readable labels as the index (named ``names``).
        Cells a variant does not provide are empty strings.
    """
    variants = [
        ('Baseline', get_baseline_vars),
        ('Gender', get_gender_vars),
        ('Early Achievement', get_achievement_vars),
        ('Social Support', get_social_vars),
    ]
    fitted = [(title, elastic_agg(features, flags_func, DV)) for title, flags_func in variants]

    # The last (richest) variant supplies the initial row index.
    res_all_agg = pd.DataFrame(index=fitted[-1][1].index, data=[])
    for title, table in fitted:
        res_all_agg[title] = table

    # (row key, display label) pairs in output order.
    feature_rows = [
        ('cohort', 'Cohort'),
        ('gender_f', 'Female'),
        ('gender_m', 'Male'),
        ('gender_none', 'Undetected'),
        ('productivity', 'Productivity'),
        ('productivity_first', 'Productivity (1st)'),
        ('impact', 'Impact'),
        ('top_source', 'Top venue'),
        ('collaboration_network', 'Collaboration network'),
        ('senior_support', 'Senior support'),
        ('team_size', 'Median team size'),
    ]
    if DV == 'dropout':
        stat_rows = [
            ('cohort_size', 'Cohort size'),
            ('drop_percentage', '% dropouts'),
            ('intercept', 'Intercept'),
            ('avg_precision', 'Average precision'),
            ('f1_micro', 'F1 micro'),
            ('f1_macro', 'F1 macro'),
            ('f1_weighted', 'F1 weighted'),
        ]
    else:
        stat_rows = [
            ('cohort_size', 'Cohort size'),
            ('neg_mean_squared_error', 'MSE'),
            ('intercept', 'Intercept'),
            ('r2', 'R2'),
            ('adj_r2', 'Adjusted R2'),
        ]
    ordered_rows = feature_rows + stat_rows

    res_all_agg = res_all_agg.reindex([key for key, _ in ordered_rows]).fillna('')
    res_all_agg['names'] = [label for _, label in ordered_rows]
    return res_all_agg.set_index('names')

def results_to_latex(results, name):
    """Write a trimmed LaTeX rendering of ``results`` to ``../results``.

    The output of ``results.to_latex()`` is split into lines. Lines
    ``[5:-7]`` are written first, then a ``\\hline`` rule, then lines
    ``[-7:-3]``; the remaining leading and trailing lines are omitted.

    Args:
        results: Object with a ``to_latex()`` method returning a string
            (a pandas DataFrame in this notebook).
        name: Suffix of the output file ``../results/results_{name}.tex``.
    """
    # Render once; the original rendered the table twice.
    latex_lines = results.to_latex().split('\n')
    upper_rows = '\n'.join(latex_lines[5:-7])
    lower_rows = '\n'.join(latex_lines[-7:-3])
    # The context manager closes the file even if a write fails.
    with open(f'../results/results_{name}.tex', 'w') as ltx_file:
        ltx_file.write(upper_rows)
        # Escaped backslash: '\h' is an invalid escape sequence in a plain literal.
        ltx_file.write('\\hline \n')
        ltx_file.write(lower_rows)

#### 4.2. Table 2: Dropout prediction

Here, we predict whether or not an author will have dropped out at career age 15:

In [None]:
# Dependent variable: the `dropout` column.
dv_dropout = 'dropout'
# Fit the four model variants (baseline, gender, early achievement, social support).
reg_dropout = elastic_agg_all(features, dv_dropout)
# Written to ../results/results_reg_dropout.tex
results_to_latex(reg_dropout, 'reg_dropout')
# Bare last expression: the notebook displays the table.
reg_dropout

#### 4.3. Table 3: Success prediction

Here, we predict the success of an author at career age 15, once for all authors and once with dropouts removed. In the paper, the two resulting tables are combined into one:

In [None]:
# All authors: dropouts are kept in the sample.
# Dependent variable: the `success` column.
dv_success = 'success'
# Fit the four model variants (baseline, gender, early achievement, social support).
reg_success = elastic_agg_all(features, dv_success)
# Written to ../results/results_reg_success.tex
results_to_latex(reg_success, 'reg_success')
# Bare last expression: the notebook displays the table.
reg_success

In [None]:
# Dropouts removed: same models, fitted on `features_stayed` only.
# Reuses `dv_success` defined in the previous cell.
reg_success_stayed = elastic_agg_all(features_stayed, dv_success)
# Written to ../results/results_reg_success_stayed.tex
results_to_latex(reg_success_stayed, 'reg_success_stayed')
# Bare last expression: the notebook displays the table.
reg_success_stayed