In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from IPython.display import display

import numpy as np

import plot

In [2]:
from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV, Lasso, LogisticRegressionCV, SGDClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, f1_score

from sklearn.model_selection import cross_val_score

from sklearn.utils import shuffle

In [3]:
# Authors whose career_length is below this value are filtered out of credible_authors.
CAREER_LENGTH = 5

# Early-career window lengths and recognition cut-offs iterated over by run_linear / run_cv.
#EARLY_CAREER_LEN_LIST = [1, 2, 3, 4, 5]
EARLY_CAREER_LEN_LIST = [3]
# Suffix selecting the early_career_*_{EARLY_CAREER} feature columns and the dependent variable.
EARLY_CAREER = 3
#RECOGNITION_CUT_OFF_LIST = [3, 4, 5, 6, 7, 8, 9]
RECOGNITION_CUT_OFF_LIST = [3]
# Suffix selecting the early_career_recognition_EC*_RC{RECOGNITION_CUT} column.
RECOGNITION_CUT = 3

# Only cohorts with start_year < END_YEAR - MAX_CAREER_LEN are modelled cohort by cohort.
MAX_CAREER_LEN = 15
END_YEAR = 2018

# Feature-group switches handed to make_cols_lists (truthy = include the group).
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 0
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 1

# if true, all authors with gender=none are removed from cohort
REMOVE_NONE_AUTHORS = False

In [4]:
credible_authors = pd.read_csv('derived-data/authors-scientific-extended.csv')

In [5]:
credible_authors.columns
#print(credible_authors.groupby("start_year")['dropped_after_10'].agg('sum'))
#print(credible_authors.groupby("start_year")['author'].count())

Index(['author', 'start_year', 'end_year', 'total_num_pub', 'career_length',
       'max_absence-0-15', 'avg_absence-0-15', 'dropped_after_10', 'gender',
       'early_career_degree_3', 'early_career_degree_5',
       'early_career_degree_7', 'early_career_degree_9',
       'early_career_degree_11', 'early_career_degree_12',
       'early_career_qual_3', 'early_career_qual_5', 'early_career_qual_7',
       'early_career_qual_9', 'early_career_qual_11', 'early_career_qual_12',
       'early_career_recognition_EC3_RC3', 'early_career_recognition_EC5_RC5',
       'early_career_recognition_EC7_RC7', 'early_career_recognition_EC9_RC9',
       'early_career_recognition_EC11_RC11',
       'early_career_recognition_EC12_RC12', 'succ_after_15y', 'h-index_3',
       'h-index_5', 'h-index_7', 'h-index_9', 'h-index_11', 'h-index_12',
       'h-index_15', 'early_career_prod_3', 'early_career_prod_5',
       'early_career_prod_7', 'early_career_prod_9', 'early_career_prod_11',
       'early_career_p

In [6]:
credible_authors = credible_authors[credible_authors.career_length >= CAREER_LENGTH]

In [7]:
# credible_authors['citation_increase_15_3'] = credible_authors['succ_after_15y'] - credible_authors[
#     'early_career_recognition_EC3_RC3']
# credible_authors['h_index_increase_15_3'] = credible_authors['h-index_15'] - credible_authors['h-index_3']

In [8]:
EARLY_CAREER_LEN_LIST_EXT = [3, 5, 7, 9, 11, 12]
RECOGNITION_CUT_OFF_LIST_EXT = [3, 5, 7, 9, 11, 12]

# Dependent variables: growth between the end of each early-career window and year 15.
# The columns are collected first and attached with assign(), which returns a new frame.
# credible_authors is a filtered slice at this point, so assigning columns onto it directly
# triggers pandas' SettingWithCopyWarning; assign() also makes the cell safe to re-run.
increase_cols = {}
for ec_len in EARLY_CAREER_LEN_LIST_EXT:
    increase_cols[f'citation_increase_15_{ec_len}'] = (
        credible_authors['succ_after_15y']
        - credible_authors[f'early_career_recognition_EC{ec_len}_RC{ec_len}']
    )
    increase_cols[f'h_index_increase_15_{ec_len}'] = (
        credible_authors['h-index_15'] - credible_authors[f'h-index_{ec_len}']
    )

credible_authors = credible_authors.assign(**increase_cols)

## Correlations

In [None]:
# Column -> short tick label for the correlation heatmap. Keeping each label next to its column
# prevents the two lists from drifting apart: the coauthor max h-index and quality columns are
# the 12-year variants, so their labels say so (they were previously labelled "_3").
# NOTE(review): if the 3-year variants were intended, change the column names here instead.
corr_labels = {
    'succ_after_15y': 'succ',
    'h_index_increase_15_3': 'hindex_incr',
    'citation_increase_15_3': 'cit_incr',
    'max_absence-0-15': 'max_abs',
    'early_career_prod_3': 'prod_3',
    'early_career_degree_3': 'degree_3',
    'early_career_coauthor_max_hindex_12': 'maxh_12',
    'early_career_recognition_EC3_RC3': 'rec_3',
    'early_career_qual_12': 'qual_12',
}

cols = list(corr_labels)
col_names_short = list(corr_labels.values())


In [None]:
cor_qual = credible_authors[cols].corr(method='kendall')

In [None]:
cor_qual
#cor_qual['succ_after_15y'].sort_values()

In [None]:
# Heatmap of the correlation matrix; the colour scale is pinned to [-1, 1].
fig, ax = plt.subplots(figsize=(20, 10))
cax = ax.matshow(cor_qual, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per label, derived from the label list rather than a hard-coded count, so the
# axes stay correct when columns are added to or removed from the correlation set.
ticks = np.arange(len(col_names_short))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(col_names_short, rotation=45)
ax.set_yticklabels(col_names_short)
plt.show()


In [None]:
#cor = credible_authors.corr()

In [None]:
#cor_qual[f'h_index_increase_15_{EARLY_CAREER}'].sort_values()

In [None]:
# sns.heatmap(cor, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})

## Linear reg

### Test different predictors

In [None]:
# test different early career lengths

In [None]:
# Cohort (start_year) used for the predictor comparison in the cells below.
year = 1995

# NOTE(review): the variable is named *_1991 but holds the cohort selected by `year` (1995).
# The name is left unchanged because the next cell copies it into X.
credible_authors_1991 = credible_authors[credible_authors.start_year == year]

In [None]:
X = credible_authors_1991.copy()


In [None]:
categorical_cols = ['gender']

# A single get_dummies call encodes every categorical column at once. The previous per-column
# loop joined the same full set of dummies on each iteration, which only worked for one column.
# Columns that were already encoded and dropped are skipped, so re-running the cell is harmless.
pending_cols = [col for col in categorical_cols if col in X.columns]
if pending_cols:
    X = X.join(pd.get_dummies(X[pending_cols])).drop(columns=pending_cols)

In [None]:
def run_linear(func, name, y_col='succ_after_15y'):
    """Fit `func` on the full cohort frame X and report the in-sample score.

    One fit is made per (EARLY_CAREER, RECOGNITION_CUT) pair taken from the module-level
    EARLY_CAREER_LEN_LIST / RECOGNITION_CUT_OFF_LIST; pairs whose recognition cut-off is
    shorter than the early-career window are skipped.

    Parameters
    ----------
    func : estimator exposing fit(H, y) and score(H, y)
    name : str
        Suffix of the score column, ``r_squared_{name}``.
    y_col : str, default 'succ_after_15y'
        Column of X used as the target. The target used to be read from a global ``y``
        that no cell defines; it is now taken from X, matching run_cv.

    Returns
    -------
    pandas.DataFrame with columns ``params`` and ``r_squared_{name}``.
    """
    score_col = f'r_squared_{name}'
    y = X[y_col]
    # Rows are collected in a list and turned into a frame once; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used in a loop.
    rows = []
    for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
        for RECOGNITION_CUT in RECOGNITION_CUT_OFF_LIST:
            if RECOGNITION_CUT < EARLY_CAREER:
                continue
            H = X[[
                #'max_absence-0-3', 'avg_absence-0-3',
                'gender_f', 'gender_m', 'gender_none',
                f'early_career_degree_{EARLY_CAREER}',
                f'early_career_prod_{EARLY_CAREER}',
                f'early_career_qual_{EARLY_CAREER}',
                f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']]
            reg = func.fit(H, y)
            # the score is computed on the same data the model was fitted on
            rows.append({'params': f'EC:{EARLY_CAREER},REC:{RECOGNITION_CUT}',
                         score_col: reg.score(H, y)})
    return pd.DataFrame(rows, columns=['params', score_col])

In [None]:
def run_cv(func, name, cv, y_col='succ_after_15y'):
    """Cross-validate `func` on the cohort frame X and report the mean R2.

    One cross-validation is run per (EARLY_CAREER, RECOGNITION_CUT) pair taken from the
    module-level EARLY_CAREER_LEN_LIST / RECOGNITION_CUT_OFF_LIST; pairs whose recognition
    cut-off is shorter than the early-career window are skipped.

    Parameters
    ----------
    func : scikit-learn estimator passed to cross_val_score
    name : str
        Suffix of the score column, ``r_squared_{name}``.
    cv : passed through to cross_val_score
    y_col : str, default 'succ_after_15y'
        Column of X used as the target.

    Returns
    -------
    pandas.DataFrame with columns ``params`` and ``r_squared_{name}``.
    """
    score_col = f'r_squared_{name}'
    y = X[y_col]  # the target does not depend on the loop variables
    # Rows are collected in a list and turned into a frame once; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used in a loop.
    rows = []
    for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
        for RECOGNITION_CUT in RECOGNITION_CUT_OFF_LIST:
            if RECOGNITION_CUT < EARLY_CAREER:
                continue
            H = X[[
                #'max_absence-0-3', 'avg_absence-0-3',
                'gender_f', 'gender_m', 'gender_none',
                f'early_career_degree_{EARLY_CAREER}',
                f'early_career_prod_{EARLY_CAREER}',
                f'early_career_qual_{EARLY_CAREER}',
                f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']]
            score = np.mean(cross_val_score(func, H, y, cv=cv, scoring='r2'))
            rows.append({'params': f'EC:{EARLY_CAREER},REC:{RECOGNITION_CUT}',
                         score_col: score})
    return pd.DataFrame(rows, columns=['params', score_col])

In [None]:
df1 = run_cv(LinearRegression(), 'linear', cv=3)

In [None]:
# df1_null = run_cv(LinearRegression(), 'linear_null', cv=3, y_col='succ_shuffled')

In [None]:
df2 = run_cv(ElasticNet(), 'elastic', cv=3)

In [None]:
df3 = run_cv(ElasticNetCV(cv=3), 'elastic_CV', cv=3)

In [None]:
df4 = run_cv(Lasso(alpha=0.1), 'lasso', cv=3)

In [None]:
# The decision tree overfits badly. TODO: try a hyper-parameter grid search.
df5 = run_cv(DecisionTreeRegressor(), 'tree', cv=3)

In [None]:
df6 = run_cv(RandomForestRegressor(), 'forest', cv=3)

In [None]:
# df6_null = run_cv(RandomForestRegressor(), 'forest_null', cv=3, y_col='succ_shuffled')

In [None]:
# set_index returns new frames, so df1..df6 keep their 'params' column and this cell can be
# re-run; the in-place version raised KeyError on the second run.
model_scores = [df1, df2, df3, df4, df5, df6]  # df1_null, df6_null
dfs = [scores.set_index('params') for scores in model_scores]

In [None]:
dfs[0].join(dfs[1:])

### Elastic net

In [9]:
# All cohort start years present in the data, ascending.
years = sorted(credible_authors.start_year.unique())
# Keep only cohorts that start before END_YEAR - MAX_CAREER_LEN.
cohort_start_years = [y for y in years if y < (END_YEAR - MAX_CAREER_LEN)]
# EARLY_CAREER = EARLY_CAREER_LEN_LIST[0]
# RECOGNITION_CUT = RECOGNITION_CUT_OFF_LIST[0]
# These rebind the lists set in the configuration cell; run_linear and run_cv read whichever
# value is current when they are called.
EARLY_CAREER_LEN_LIST = [3,5,7,9,11,12]
RECOGNITION_CUT_OFF_LIST = [3,5,7,9,11,12]

In [10]:
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [86]:
def make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, INCLUDE_GENDER, REMOVE_NONE_AUTHORS, 
                    EARLY_CAREER, RECOGNITION_CUT):
    """Build the column lists for one model configuration.

    Features come in 4 groups: productivity, social capital, quality/recognition and gender.
    Each INCLUDE_* flag (truthy = on) switches one group.

    Returns a tuple ``(cols_all, cols_std, categorical_cols)``:
      * cols_std  - the enabled numeric feature columns, in group order;
      * cols_all  - 'cohort_size', then cols_std, then the gender dummy names (when gender is
                    enabled; 'gender_none' only if REMOVE_NONE_AUTHORS is falsy), then
                    'intercept' and 'r2';
      * categorical_cols - always ['gender'].
    """
    # (switch, columns contributed) in the order they appear in the output
    numeric_groups = [
        (INCLUDE_PROD, [f'early_career_prod_{EARLY_CAREER}']),
        (INCLUDE_SOCIAL, [f'early_career_degree_{EARLY_CAREER}',
                          f'early_career_coauthor_max_hindex_{EARLY_CAREER}']),
        (INCLUDE_REC, [f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']),
        (INCLUDE_QUALITY, [f'early_career_qual_{EARLY_CAREER}']),
    ]
    cols_std = [col for enabled, group in numeric_groups if enabled for col in group]

    gender_cols = []
    if INCLUDE_GENDER:
        gender_cols = ['gender_m', 'gender_f']
        if not REMOVE_NONE_AUTHORS:
            gender_cols.append('gender_none')

    cols_all = ['cohort_size', *cols_std, *gender_cols, 'intercept', 'r2']
    categorical_cols = ['gender']
    return cols_all, cols_std, categorical_cols
num_splits = 5

In [87]:
dep_var = f'h_index_increase_15_{EARLY_CAREER}'

In [106]:
# Cohort-by-cohort elastic net with recognition included and quality excluded.
# NOTE(review): the six-argument run_elastic_net_cohort called here is defined in a cell
# further down the notebook, so that cell must be executed before this one.
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 0
REMOVE_NONE_AUTHORS = 0
# INCLUDE_YEAR is set but is not passed to make_cols_lists or run_elastic_net_cohort.
INCLUDE_YEAR = 0
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
run_elastic_net_cohort(credible_authors,cols_all, cols_std, categorical_cols, EARLY_CAREER, dep_var)

Unnamed: 0_level_0,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,...,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cohort_size,390,508,609,767,847,813,993,1073,1039,1291,...,8899,9814,10019,10893,11872,13594,14939,17593,19195,21220
early_career_prod_3,0.37(0.05),0.45(0.06),0.52(0.05),0.51(0.04),0.48(0.03),0.49(0.03),0.61(0.05),0.41(0.11),0.57(0.04),0.55(0.04),...,1.15(0.01),1.26(0.05),1.21(0.04),1.14(0.02),1.15(0.03),1.19(0.03),1.16(0.03),1.2(0.01),1.12(0.01),1.04(0.01)
early_career_degree_3,0.0(0.0),-0.0(0.02),-0.0(0.01),0.04(0.02),0.06(0.01),0.03(0.02),0.04(0.03),0.05(0.06),-0.02(0.05),0.0(0.0),...,-0.03(0.01),-0.04(0.01),-0.1(0.02),-0.03(0.01),-0.04(0.01),-0.04(0.03),-0.03(0.02),-0.02(0.01),-0.05(0.01),-0.0(0.0)
early_career_coauthor_max_hindex_3,0.0(0.0),0.0(0.0),0.48(0.12),0.08(0.19),0.47(0.08),0.26(0.04),0.18(0.08),0.13(0.05),0.39(0.03),0.26(0.05),...,0.48(0.02),0.46(0.02),0.46(0.02),0.46(0.01),0.59(0.02),0.61(0.04),0.59(0.03),0.63(0.03),0.81(0.02),0.71(0.03)
early_career_recognition_EC3_RC3,0.03(0.03),0.03(0.01),-0.03(0.03),0.07(0.02),0.03(0.01),0.03(0.03),0.05(0.02),0.08(0.01),0.01(0.02),0.02(0.01),...,0.11(0.01),0.08(0.02),0.09(0.02),0.1(0.02),0.09(0.01),0.08(0.01),0.11(0.01),0.11(0.01),0.07(0.01),0.07(0.01)
gender_m,0.0(0.0),0.0(0.0),0.0(0.01),0.0(0.0),0.17(0.1),0.07(0.08),0.0(0.0),0.07(0.15),0.0(0.0),0.0(0.0),...,0.06(0.03),0.02(0.03),0.07(0.05),0.0(0.0),0.03(0.02),-0.01(0.02),0.0(0.0),0.0(0.0),0.0(0.0),0.0(0.0)
gender_f,0.0(0.0),0.0(0.0),0.0(0.0),0.01(0.01),0.0(0.0),0.0(0.0),0.0(0.0),-0.01(0.02),-0.06(0.13),0.0(0.0),...,-0.0(0.0),0.01(0.01),0.0(0.0),0.02(0.04),0.0(0.0),0.0(0.0),-0.02(0.03),-0.15(0.03),-0.13(0.03),-0.16(0.08)
gender_none,0.0(0.0),0.0(0.0),-0.01(0.01),-0.0(0.0),-0.0(0.0),0.0(0.0),-0.02(0.05),0.0(0.0),0.01(0.03),0.0(0.0),...,-0.0(0.0),-0.16(0.05),-0.07(0.05),-0.02(0.04),-0.03(0.05),0.02(0.02),0.07(0.05),0.14(0.06),0.12(0.05),0.07(0.05)
intercept,0.93,0.81,0.87,0.83,0.73,0.77,0.86,1.16,0.9,1.19,...,2.02,2.38,2.33,2.34,2.42,2.46,2.48,2.5,2.5,2.58
r2,0.16,0.14,0.21,0.22,0.25,0.24,0.2,0.1,0.23,0.17,...,0.23,0.22,0.21,0.23,0.22,0.23,0.23,0.27,0.25,0.26


In [110]:
# Pooled (all cohorts) elastic net with every feature group enabled.
# NOTE(review): the six-argument run_elastic_net_aggr called here is defined in a cell
# further down the notebook, so that cell must be executed before this one.
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 1
REMOVE_NONE_AUTHORS = 0
# INCLUDE_YEAR is set but is not passed to make_cols_lists or run_elastic_net_aggr.
INCLUDE_YEAR = 1
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
run_elastic_net_aggr(credible_authors,cols_all, cols_std, categorical_cols, EARLY_CAREER, dep_var)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
early_career_prod_3,0.25(0.0)
early_career_degree_3,-0.01(0.0)
early_career_coauthor_max_hindex_3,0.02(0.0)
early_career_recognition_EC3_RC3,-0.02(0.0)
early_career_qual_3,0.01(0.0)
gender_f,0.0(0.0)
gender_m,0.0(0.0)
gender_none,0.0(0.0)
start_year,-0.61(0.01)
intercept,1.49


In [109]:
def scale_columns(X):
    """Return X with every column passed through a freshly fitted RobustScaler.

    The result keeps X's index and column labels. A frame without columns is returned
    as an equally empty frame, without fitting a scaler.
    """
    if len(X.columns) == 0:
        # nothing to scale
        return pd.DataFrame([], index=X.index, columns=X.columns)
    scaled_values = RobustScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

def prepare_data(credible_authors, std_cols=None, cat_cols=None, start_years=None):
    """Build the model matrix: per-cohort scaled numeric features plus categorical dummies.

    Parameters
    ----------
    credible_authors : pandas.DataFrame
        Must contain 'start_year' plus the requested feature and categorical columns.
        It is not modified.
    std_cols : list of str, optional
        Numeric columns to scale; defaults to the module-level ``cols_std``.
    cat_cols : list of str, optional
        Columns passed to pd.get_dummies; defaults to the module-level ``categorical_cols``.
    start_years : iterable, optional
        Cohorts whose rows are scaled (each cohort on its own); defaults to the module-level
        ``cohort_start_years``. Rows of any other cohort keep their unscaled values.

    Returns
    -------
    pandas.DataFrame with ``std_cols``, the dummy columns and 'start_year', in that order.
    """
    # Fall back to the notebook globals only when the caller did not pass a value, so
    # existing one-argument calls behave as before.
    std_cols = cols_std if std_cols is None else std_cols
    cat_cols = categorical_cols if cat_cols is None else cat_cols
    start_years = cohort_start_years if start_years is None else start_years

    X = credible_authors.copy()
    if len(std_cols) > 0:  # the null model has no numeric features to scale
        # Scaling produces floats; cast up front so the per-cohort write-back never has to
        # change the dtype of an integer column.
        X[std_cols] = X[std_cols].astype(float)
        for year in start_years:
            in_cohort = X.start_year == year
            X.loc[in_cohort, std_cols] = scale_columns(X.loc[in_cohort, std_cols])

    X = X[std_cols].join(pd.get_dummies(X[cat_cols]))
    X['start_year'] = credible_authors['start_year']
    return X
def run_elastic_net_aggr(credible_authors,cols_all, cols_std, categorical_cols, EARLY_CAREER, dep_var):
    """Fit one elastic net on all authors pooled together, with start_year as a feature.

    The feature matrix comes from prepare_data(credible_authors); its 'start_year' column is
    robust-scaled here before fitting. The target is credible_authors[dep_var].
    cols_all, cols_std, categorical_cols and EARLY_CAREER are accepted but not referenced in
    this body.

    Returns the table produced by run_elastic_net, indexed by its first (name) column.
    """
    features = prepare_data(credible_authors)
    target = credible_authors[dep_var]
    # start_year is a regular predictor in the pooled model, so it is scaled as well
    year_scaler = RobustScaler()
    features['start_year'] = year_scaler.fit_transform(features[['start_year']])

    return run_elastic_net(features, target).set_index(0)

def run_elastic_net_cohort(credible_authors,cols_all, cols_std, categorical_cols, EARLY_CAREER, dep_var):
    """Fit one elastic net per cohort and collect the results side by side.

    For every year in the module-level cohort_start_years, the rows of that cohort are
    taken from prepare_data(credible_authors) (without 'start_year') and fitted against
    credible_authors[dep_var]. cols_std, categorical_cols and EARLY_CAREER are accepted but
    not referenced in this body.

    Returns a frame indexed by the entries of cols_all ('feature') with one column per cohort.
    """
    table = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')
    prepared = prepare_data(credible_authors)
    for year in cohort_start_years:
        cohort_X = prepared[prepared.start_year == year].drop('start_year', axis=1)
        cohort_y = credible_authors.loc[credible_authors.start_year == year, dep_var]

        cohort_result = (
            run_elastic_net(cohort_X, cohort_y)
            .set_index(0)
            .rename(index=str, columns={1: year})
        )
        table = table.join(cohort_result)

    return table

def run_elastic_net(X, y):
    """Cross-validate an ElasticNetCV on (X, y) and summarise the fitted folds.

    Runs a 5-fold cross_validate (R2 scoring) around ElasticNetCV(cv=3) and keeps the
    estimator fitted on each fold.

    Returns a two-column frame (columns 0 and 1): one row per feature holding
    "mean(std)" of its coefficient over the folds (rounded to 2 decimals), followed by
    rows 'intercept' (mean over folds), 'r2' (mean test score) and 'cohort_size' (len(y)).
    """
    rounding = 2
    # train model and do cross validation
    cv_dict = cross_validate(ElasticNetCV(cv=3), X, y, scoring='r2', cv=5, return_estimator=True, return_train_score=False)
    fold_models = cv_dict['estimator']

    # one row of coefficients per fold, summarised column-wise
    coef_per_fold = pd.DataFrame([model.coef_ for model in fold_models], columns=X.columns)
    coef_mean = np.round(coef_per_fold.mean().values, rounding)
    coef_std = np.round(coef_per_fold.std().values, rounding)
    values = [f"{mean}({std})" for mean, std in zip(coef_mean, coef_std)]

    values.append(np.round(np.mean([model.intercept_ for model in fold_models]), rounding))
    values.append(np.round(np.mean(cv_dict['test_score']), rounding))
    values.append(len(y))

    labels = np.append(X.columns, ['intercept', 'r2', 'cohort_size'])
    return pd.DataFrame(list(zip(labels, values)))

In [None]:
# NOTE: this cell defines run_elastic_net_cohort, run_elastic_net_aggr and run_elastic_net with
# different signatures and return values (three tables: h-index, citations, dropout) than the
# single-target versions defined in another cell of this notebook. Whichever cell was executed
# last decides which versions later calls resolve to.

def run_elastic_net_cohort(credible_authors,cols_all, cols_std, categorical_cols, EARLY_CAREER):
    """Fit the three models once per cohort in the module-level cohort_start_years.

    Returns ``(hindex_table, citation_table, dropout_table)``; each is indexed by the entries
    of cols_all ('feature') and has one column per cohort year.
    """
    aggregated = False
    hindex_table = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')
    citation_table = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')
    dropout_table = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')
    for year in cohort_start_years:
        X = credible_authors[credible_authors.start_year == year].copy()

        hindex_data, citation_data, dropout_data = run_elastic_net(
            X, cols_std, categorical_cols, EARLY_CAREER, year=year, aggregated=aggregated)

        hindex_table = hindex_table.join(hindex_data)
        citation_table = citation_table.join(citation_data)
        dropout_table = dropout_table.join(dropout_data)

    return hindex_table, citation_table, dropout_table


def run_elastic_net_aggr(credible_authors, cols_std, categorical_cols, EARLY_CAREER):
    """Fit the three models once on all authors pooled, with start_year as an extra column.

    Returns ``(hindex_data, citation_data, dropout_data)`` as produced by run_elastic_net
    with year=0, i.e. two-column frames with default column labels.
    """
    aggregated = True
    # Add cohort start year as a variable. A new list is built so the caller's list is not
    # grown by one more "start_year" on every call.
    model_cat_cols = list(categorical_cols)
    if "start_year" not in model_cat_cols:
        model_cat_cols.append("start_year")

    X = credible_authors.copy()

    return run_elastic_net(X, cols_std, model_cat_cols, EARLY_CAREER, year=0, aggregated=aggregated)


def _cv_summary_table(cv_dict, coef_rows, feature_names, cohort_size, year, rounding=2):
    """Condense one cross_validate result into a (name, value) table.

    coef_rows holds one coefficient vector per fold estimator. Each feature gets a
    "mean(std)" string; rows 'intercept', 'r2' and 'cohort_size' are appended. With year > 0
    the table is indexed by name and its value column is labelled with the year.
    """
    coefs = pd.DataFrame(coef_rows, columns=feature_names)
    coef_mean = np.round(coefs.mean().values, rounding)
    coef_std = np.round(coefs.std().values, rounding)
    values = [f"{mean}({std})" for mean, std in zip(coef_mean, coef_std)]

    intercept = np.mean([es.intercept_ for es in cv_dict['estimator']])
    score = np.mean(cv_dict['test_score'])
    values.extend([np.round(intercept, rounding), np.round(score, rounding), cohort_size])

    rows = list(zip(np.append(feature_names, ['intercept', 'r2', 'cohort_size']), values))
    if year > 0:
        return pd.DataFrame(rows, columns=['year', year]).set_index('year')
    return pd.DataFrame(rows)


def run_elastic_net(X, cols_std, categorical_cols, EARLY_CAREER, year=0, aggregated=False):
    """Cross-validate the h-index, citation and dropout models on one data set.

    Parameters
    ----------
    X : pandas.DataFrame
        Author rows; must contain cols_std, categorical_cols, the two
        ``*_increase_15_{EARLY_CAREER}`` targets and 'dropped_after_10'.
    cols_std : list of str
        Numeric columns, robust-scaled over the rows of X.
    categorical_cols : list of str
        Columns passed to pd.get_dummies.
    EARLY_CAREER : suffix selecting the target columns.
    year : int, default 0
        When > 0, labels the value column of the returned tables.
    aggregated : bool, default False
        Accepted for the callers' benefit; not used by this body.

    Returns
    -------
    ``(hindex_data, citation_data, dropout_data)``. The first two come from
    ElasticNetCV scored with R2; the third from LogisticRegressionCV scored with F1,
    whose score is nevertheless stored in the row labelled 'r2'.
    """
    #remove non-gender rows
    if REMOVE_NONE_AUTHORS:
        X = X[X["gender"] != "none"]

    # Make dummy values for categorical columns
    cat_cols = pd.get_dummies(X[categorical_cols])

    #standardize cols_std
    if len(cols_std) > 0:
        standardized_cols = RobustScaler().fit_transform(X[cols_std])
        H = pd.DataFrame(standardized_cols, index=X.index, columns=cols_std)
    else:
        H = pd.DataFrame(index=X.index, columns=cols_std)

    H = H.join(cat_cols)
    if not INCLUDE_GENDER:
        # errors='ignore': a dummy column is absent when its category does not occur,
        # e.g. 'gender_none' after REMOVE_NONE_AUTHORS filtered those rows out
        H = H.drop(columns=['gender_f', 'gender_m', 'gender_none'], errors='ignore')

    y = X[f'h_index_increase_15_{EARLY_CAREER}']
    y2 = X[f'citation_increase_15_{EARLY_CAREER}']
    y3 = X['dropped_after_10'].astype(int)

    cv_dict = cross_validate(ElasticNetCV(cv=3), H, y, scoring='r2', cv=10, return_estimator=True, return_train_score=False)
    cv2_dict = cross_validate(ElasticNetCV(cv=3), H, y2, scoring='r2', cv=10, return_estimator=True, return_train_score=False)
    cv3_dict = cross_validate(LogisticRegressionCV(cv=3, penalty='l2'), H, y3, scoring="f1", cv=10, return_estimator=True, return_train_score=False)

    cohort_size = len(y2)
    #     num_nonzero_coefs = sum(net2.coef_ != 0)
    #     adj_score2 = 1 - (1-score2)*(cohort_size-1)/(cohort_size-num_nonzero_coefs-1)

    hindex_data = _cv_summary_table(
        cv_dict, [es.coef_ for es in cv_dict['estimator']], H.columns, cohort_size, year)
    citation_data = _cv_summary_table(
        cv2_dict, [es.coef_ for es in cv2_dict['estimator']], H.columns, cohort_size, year)
    # the logistic model stores its coefficients as a 2-D array; row 0 is used
    dropout_data = _cv_summary_table(
        cv3_dict, [es.coef_[0] for es in cv3_dict['estimator']], H.columns, cohort_size, year)

    return hindex_data, citation_data, dropout_data

#### Feature tables

In [None]:
def make_result_table(feature_table):
    """Transpose a feature table and shorten its column names.

    After the transpose, every 'early_career' in a column name becomes 'ec'; a few
    specific names are then shortened further. The input frame is left untouched.
    """
    by_cohort = feature_table.T
    #shorten column names
    by_cohort = by_cohort.rename(columns=lambda col: col.replace('early_career', 'ec'))
    extra_renames = {
        'feature': 'cohort',
        'ec_coauthor_max_cit_3': 'ec_coauth_max_cit_3',
        'ec_recognition_EC3_RC5': 'ec_recog_EC3_RC5',
    }
    return by_cohort.rename(columns=extra_renames)

In [None]:
def results_to_latex(results, name):
    """Write the LaTeX rendering of `results` to ``results_{name}.tex``.

    The first four and last three lines of ``results.to_latex()`` are dropped before
    writing, so only the middle part of the rendering is saved.
    """
    # Render first: if to_latex() fails, an existing file is not truncated.
    body_lines = results.to_latex().split('\n')[4:-3]
    # The context manager closes the handle even when the write raises.
    with open(f"results_{name}.tex", "w") as ltx_file:
        ltx_file.write('\n'.join(body_lines))

In [None]:
EARLY_CAREER = 3
RECOGNITION_CUT = 3

## Aggregated Elastic Net Models
First, we test the effect of different groups of features (human capital, social capital and gender) on success/dropout.

In [None]:
#MODEL 1: null model

INCLUDE_PROD = 0
INCLUDE_SOCIAL = 0
INCLUDE_REC = 0
INCLUDE_GENDER = 0
INCLUDE_QUALITY = 0
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)

#hindex_table, citation_table, dropout_table =  run_elastic_predictions(cols_all, cols_std, categorical_cols, EARLY_CAREER)


hindex_table, citation_table, dropout_table = run_elastic_net_aggr(credible_authors, cols_std, categorical_cols, EARLY_CAREER)



#hindex_table, citation_table, dropout_table =  run_elastic_net_cohort(credible_authors, cols_all, cols_std, categorical_cols, EARLY_CAREER)
#results_hindex = make_result_table(hindex_table)
#results_citation = make_result_table(citation_table)
#results_dropouts = make_result_table(dropout_table)


In [None]:
print("hindex_table")
print(hindex_table)
print("citation_table")
print(citation_table)
print("dropout_table")
print(dropout_table)

In [None]:
#MODEL 2: gender effect model

INCLUDE_PROD = 1
INCLUDE_SOCIAL = 0
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 0
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)

hindex_table, citation_table, dropout_table = run_elastic_net_aggr(credible_authors, cols_std, categorical_cols, EARLY_CAREER)
print("hindex_table")
print(hindex_table)
print("citation_table")
print(citation_table)
print("dropout_table")
print(dropout_table)

#feature_table, feature_table2, feature_table3 = run_elastic_net_cohort(cols_all, cols_std, categorical_cols, EARLY_CAREER)
#results = make_result_table(feature_table)
#results2 = make_result_table(feature_table2)
#results3 = make_result_table(feature_table3)



In [None]:
#MODEL 3: social capital effect model

INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 0
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)

#feature_table, feature_table2, feature_table3 = run_elastic_bet_cohort(cols_all, cols_std, categorical_cols, EARLY_CAREER)
#results = make_result_table(feature_table)
#results2 = make_result_table(feature_table2)
#results3 = make_result_table(feature_table3)

hindex_table, citation_table, dropout_table = run_elastic_net_aggr(credible_authors, cols_std, categorical_cols, EARLY_CAREER)
print("hindex_table")
print(hindex_table)
print("citation_table")
print(citation_table)
print("dropout_table")
print(dropout_table)

In [None]:
#MODEL 4: full model
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 1
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)

In [None]:
hindex_table, citation_table, dropout_table = run_elastic_net_aggr(credible_authors, cols_std, categorical_cols, EARLY_CAREER)
print("hindex_table")
print(hindex_table)
print("citation_table")
print(citation_table)
print("dropout_table")
print(dropout_table)

## Cohort Elastic Net Models
Second, we compare the predictive performance across cohorts. We should plot R2 and F1 over cohorts.
Is predictive performance stable?

In [None]:
#full model
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 1
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, 
                                                       INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
feature_table, feature_table2, feature_table3 = run_elastic_net_cohort(credible_authors, cols_all, cols_std, categorical_cols, EARLY_CAREER)

results = make_result_table(feature_table)
results2 = make_result_table(feature_table2)
results3 = make_result_table(feature_table3)



In [None]:
# One line plot per model: the score stored in the 'r2' column, over cohorts.
# (table, y-axis label, title); the dropout table keeps its score under 'r2' as well.
score_plots = [
    (results, 'R2', 'h-index increase prediction'),
    (results2, 'R2', 'citation increase prediction'),
    (results3, 'F1', 'dropout prediction'),
]

for table, score_label, plot_title in score_plots:
    fig, ax = plt.subplots()
    ax.plot(table.index, table[["r2"]])
    ax.set(xlabel='cohorts', ylabel=score_label,
           title=plot_title)
    plt.show()

In [None]:
# model without quality 
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 0
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, 
                                                       INCLUDE_QUALITY, INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
feature_table, feature_table2, feature_table3 = run_elastic_net_cohort(credible_authors, cols_all, cols_std, categorical_cols, EARLY_CAREER)
results_no_qual = make_result_table(feature_table)
results2_no_qual = make_result_table(feature_table2)
results3_no_qual = make_result_table(feature_table3)



In [None]:
results3_no_qual

In [None]:
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 0
INCLUDE_QUALITY = 1
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC,
                                                       INCLUDE_QUALITY, INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
feature_table, feature_table2, feature_table3 = run_elastic_net_cohort(credible_authors, cols_all, cols_std, categorical_cols, EARLY_CAREER)
results_no_gen = make_result_table(feature_table)
results2_no_gen = make_result_table(feature_table2)
results3_no_gen = make_result_table(feature_table3)



In [None]:
results3_no_gen

In [None]:
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 0
INCLUDE_QUALITY = 0
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC,
                                                       INCLUDE_QUALITY, INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
feature_table, feature_table2, feature_table3 = run_elastic_net_cohort(credible_authors, cols_all, cols_std, categorical_cols, EARLY_CAREER)
results_no_qual_no_gen = make_result_table(feature_table)
results2_no_qual_no_gen = make_result_table(feature_table2)
results3_no_qual_no_gen = make_result_table(feature_table3)



In [None]:
results3_no_qual_no_gen

In [None]:
results['r2_no_qual_no_gen'] = results_no_qual_no_gen['r2']
results2['r2_no_qual_no_gen'] = results2_no_qual_no_gen['r2']
results3['f1_no_qual_no_gen'] = results3_no_qual_no_gen['r2']

In [None]:
results['r2_no_qual'] = results_no_qual['r2']
results2['r2_no_qual'] = results2_no_qual['r2']
results3['f1_no_qual'] = results3_no_qual['r2']

In [None]:
results['r2_no_gen'] = results_no_gen['r2']
results2['r2_no_gen'] = results2_no_gen['r2']
results3['f1_no_gen'] = results3_no_gen['r2']

In [None]:
dropped_percent = credible_authors.groupby('start_year')['dropped_after_10'].sum() / credible_authors.groupby('start_year')['dropped_after_10'].count()

In [None]:
results3['drop_percentage'] = dropped_percent.round(2)

In [None]:
results3.columns

In [None]:
results3 = results3.reindex([results3.columns[0]] + [results3.columns[-1]] + list(results3.columns[1:-1]), axis=1)

In [None]:
results2.shape

In [None]:
results.tail()
#h_index_increase
# train_test results, compare with 'r2' col
# Year: 1999, r2: 0.29575585145072425
# Year: 2000, r2: 0.3288047075331689
# Year: 2001, r2: 0.32359617501275245
# Year: 2002, r2: 0.32367401156648834

In [None]:
results2.tail()
#citation_increase
# train_test results
# Year: 1999, r2: 0.26267137837811527
# Year: 2000, r2: 0.3229074740899605
# Year: 2001, r2: 0.2866603572613974
# Year: 2002, r2: 0.30948376141611045

In [None]:
results3.tail(4)
#dropouts
# train_test results
# Year: 1999, r2: 0.7871975797159264
# Year: 2000, r2: 0.7755906039107061
# Year: 2001, r2: 0.7776091854493229
# Year: 2002, r2: 0.7783668951214104

In [None]:
results_to_latex(results, 'hindex')
results_to_latex(results2, 'cit')
results_to_latex(results3, 'dropout')

In [None]:
results
# h index increase
#results.to_latex()

In [None]:
results2
# citation increase

In [None]:
results3
# coefficients are not exponentiated
# positive means a bigger chance to drop out

In [None]:
results3

#### Test train split 80-20

In [None]:
def run_elastic_predictions_test_train(cols_all, cols_std, categorical_cols, EARLY_CAREER,
                                       years=(1970, 1999, 2000, 2001, 2002),
                                       n_repeats=10, test_size=0.2):
    """
    Evaluate dropout and h-index-increase models on repeated random hold-out splits.

    For every cohort in ``years`` the authors of that start year are taken from the
    notebook-level ``credible_authors`` frame, ``cols_std`` is scaled with
    ``RobustScaler`` and (if ``INCLUDE_GENDER`` is set) the dummy-encoded
    ``categorical_cols`` are joined in. Then, ``n_repeats`` times, a
    ``LogisticRegressionCV`` is fitted on ``dropped_after_10`` and an
    ``ElasticNetCV`` on ``h_index_increase_15_{EARLY_CAREER}``, each on a fresh
    random train/test split. The mean test F1 and mean test R2 are printed per cohort.

    :param cols_all: not used by this function; kept so existing calls keep working
    :param cols_std: numeric feature columns to scale; may be empty
    :param categorical_cols: columns that are dummy-encoded (joined only if INCLUDE_GENDER)
    :param EARLY_CAREER: number of early-career years, selects the h-index target column
    :param years: cohort start years to evaluate
    :param n_repeats: number of random train/test splits per cohort
    :param test_size: fraction of each cohort held out for testing
    :returns: None, results are printed
    """
    for year in years:
        X = credible_authors[credible_authors.start_year == year].copy()

        # remove non-gender rows
        if REMOVE_NONE_AUTHORS:
            X = X[X["gender"] != "none"]

        # Build the design matrix. Start from an empty frame when there is nothing
        # to scale, so no undefined (or stale) scaled block is ever used.
        if len(cols_std) > 0:
            # NOTE(review): the scaler is fitted on the whole cohort before the
            # split, so test rows influence the scaling of the training rows.
            standardized_cols = RobustScaler().fit_transform(X[cols_std])
            H = pd.DataFrame(standardized_cols, index=X.index, columns=cols_std)
        else:
            H = pd.DataFrame(index=X.index)
        if INCLUDE_GENDER:
            # dummy values for the categorical columns
            H = H.join(pd.get_dummies(X[categorical_cols]))

        y = X[f'h_index_increase_15_{EARLY_CAREER}']
        y3 = X['dropped_after_10'].astype(int)

        f1_dropout_list = []
        r2_hindex_list = []
        for _ in range(n_repeats):
            # dropouts
            X_train, X_test, y_train, y_test = train_test_split(H, y3, test_size=test_size)
            rgs = LogisticRegressionCV(cv=3)
            rgs.fit(X_train, y_train)
            f1_dropout_list.append(f1_score(y_test, rgs.predict(X_test)))

            # h-index increase (independent split from the dropout model)
            X_train, X_test, y_train, y_test = train_test_split(H, y, test_size=test_size)
            rgs = ElasticNetCV(cv=3)
            rgs.fit(X_train, y_train)
            r2_hindex = r2_score(y_test, rgs.predict(X_test))
            # regularisation strength picked by the inner cross-validation
            print(rgs.alpha_)
            r2_hindex_list.append(r2_hindex)

        print(f"Year: {year}, f1_dropout: {np.mean(f1_dropout_list)}")
        print(f"Year: {year}, r2_hindex: {np.mean(r2_hindex_list)}")

In [None]:
# Switch every feature group on (this re-binds the notebook-level INCLUDE_* flags),
# rebuild the column lists from those flags and run the repeated train/test evaluation.
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 1
cols_all, cols_std, categorical_cols = make_cols_lists(INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC,
                                                       INCLUDE_QUALITY, INCLUDE_GENDER, REMOVE_NONE_AUTHORS, EARLY_CAREER, RECOGNITION_CUT)
run_elastic_predictions_test_train(cols_all, cols_std, categorical_cols, EARLY_CAREER)

#### Test predictive power over different number of observed years

In [None]:
INCLUDE_PROD = 1
INCLUDE_SOCIAL = 1
INCLUDE_REC = 1
INCLUDE_GENDER = 1
INCLUDE_QUALITY = 0

EARLY_CAREER_LEN_LIST_EXT = [3,5,7,9,11,12]
RECOGNITION_CUT_OFF_LIST_EXT = [3,5,7,9,11,12]

In [None]:
credible_authors.columns

In [None]:
# One row per cohort start year; three columns (h-index, citations, dropout)
# are added for every observation window.
r2_increase = pd.DataFrame(index=years)
# Lower-case loop names so the notebook-level EARLY_CAREER / RECOGNITION_CUT
# settings are not silently overwritten by the last window of the sweep.
for early_career_len, recognition_cut in zip(EARLY_CAREER_LEN_LIST_EXT, RECOGNITION_CUT_OFF_LIST_EXT):
    cols_all, cols_std, categorical_cols = make_cols_lists(
        INCLUDE_PROD, INCLUDE_SOCIAL, INCLUDE_REC, INCLUDE_QUALITY, INCLUDE_GENDER,
        REMOVE_NONE_AUTHORS, early_career_len, recognition_cut)
    feature_table, feature_table2, feature_table3 = run_elastic_net_cohort(
        credible_authors, cols_all, cols_std, categorical_cols, early_career_len)
    # NOTE: results/results2/results3 are re-bound on every pass; after the loop
    # they hold the tables of the last observation window.
    results = make_result_table(feature_table)
    results2 = make_result_table(feature_table2)
    results3 = make_result_table(feature_table3)
    r2_increase[f'h_ind_{early_career_len}'] = results['r2']
    r2_increase[f'cit_{early_career_len}'] = results2['r2']
    r2_increase[f'drop_{early_career_len}'] = results3['r2']
    print(f"Year: {early_career_len}")
    print(cols_all)
    print(r2_increase[f'h_ind_{early_career_len}'][2002])

In [None]:
# Split the sweep results per target. The columns were created from
# EARLY_CAREER_LEN_LIST_EXT, so the same list has to be used to select them.
r2_hind_increase = r2_increase[[f"h_ind_{x}" for x in EARLY_CAREER_LEN_LIST_EXT]]
r2_cit_increase = r2_increase[[f"cit_{x}" for x in EARLY_CAREER_LEN_LIST_EXT]]
r2_drop_increase = r2_increase[[f"drop_{x}" for x in EARLY_CAREER_LEN_LIST_EXT]]

In [None]:
# Label each column with its number of observed years ('h_ind_3' -> '3').
# Derived from the column names so the labels always match the selected columns.
r2_hind_increase.columns = [str(col).rsplit('_', 1)[-1] for col in r2_hind_increase.columns]

In [None]:
plt = plot.init_plotting()
fig2 = plt.figure()
fig2.patch.set_facecolor('white')
ax2 = fig2.add_subplot(1,1,1) #, axisbg="white"
colors = ('#DE4C2C', '#3BD64C', '#3B9ED6', '#B73BD6', '#F39C12', '#FFC0CB', '#27AE60', '#48C9B0', '#071019') #'#AAB7B8',
# All marker codes matplotlib knows, minus the blank placeholders.
markers = [m for m in Line2D.markers if m not in (' ', '')]

# Every fifth cohort is highlighted with its own colour/marker and a legend
# entry; all other cohorts are drawn as faint grey background lines.
highlighted = 0
for row in r2_hind_increase.index:
    cohort = r2_hind_increase.loc[row]
    # Same x values (the column labels) for both kinds of line, so highlighted
    # and background curves share one axis.
    if row % 5 == 0:
        # modulo keeps the lookup valid if there are more highlighted cohorts
        # than colours or markers
        ax2.plot(cohort.index, cohort.values, label=row,
                 color=colors[highlighted % len(colors)],
                 marker=markers[highlighted % len(markers)], markersize=10)
        highlighted += 1
    else:
        ax2.plot(cohort.index, cohort.values, label=None, color='grey', alpha=0.5)

plt.title("Predicting h-index increase")
ax2.set_ylabel("R squared", fontweight='bold')
ax2.set_xlabel('Number of years observed', fontweight='bold')

ax2.legend()
plt.show()

In [None]:
r2_hind_increase

for row in r2_hind_increase.index:
    plt.plot(r2_hind_increase.loc[row])

In [None]:
r2_cit_increase.T.plot()

In [None]:
r2_drop_increase.T.plot()

In [None]:
# NOTE(review): `stop` is not assigned anywhere in the visible part of the notebook,
# so this line presumably raises NameError on purpose to halt "Run All" before the
# exploratory cells below — confirm, and consider an explicit `raise` with a message.
stop

#### predictor diffs

In [None]:
h_index = feature_table.transpose().copy()
citations = feature_table2.transpose().copy()

In [None]:
# Compare the r2 of the two regression targets cohort by cohort.
r2_h_index = h_index['r2']
r2_citations = citations['r2']
plt.plot(r2_h_index, label='Increase H index')
plt.plot(r2_citations, label='Increase Citations')
print("Average difference in r squared", sum(r2_citations - r2_h_index) / len(r2_h_index))
# quality was used as a feature!
plt.legend()
plt.show()

#### gender diffs

In [None]:
# gender coefs
plt.plot(results3['gender_m'], label="Male")
plt.plot(results3['gender_f'], label="Female")
plt.legend()
plt.show()

In [None]:
# Difference between the male and female coefficients of the dropout model.
gender_gap = results3['gender_m'] - results3['gender_f']
plt.plot(gender_gap, label="Male-Female diff")
# Zero reference line across the whole axis; it no longer depends on the index
# of a different result table.
plt.axhline(0, color='C1')
plt.legend()
plt.show()

#### cohort size diffs

In [None]:
fig, ax1 = plt.subplots()
# ax1.plot(results['r2'], label='r2')
ax1.plot(results['adj_r2'], label='adjusted r2', color='C2')
ax1.set_ylim([0,1])
ax1.set_xlabel('Years')
ax1.set_ylabel('R squared', color='C2')
ax1.legend(loc=2)

ax2 = ax1.twinx()
ax2.plot(results['cohort_size'], label='Cohort size', color='C3')
ax2.set_ylabel('Cohort size', color='C3')
ax2.legend(loc=4)
plt.show()

#### cheating diffs

In [None]:
# Snapshot (transposed copy) of the current feature_table2.
# NOTE(review): cheat_RC5 and cheat_quality in the next cells are assigned from the
# very same expression, so the three snapshots only differ if feature_table2 is
# recomputed with other settings between running these cells.
no_cheating = feature_table2.transpose().copy()

In [None]:
cheat_RC5 = feature_table2.transpose().copy()

In [None]:
cheat_quality = feature_table2.transpose().copy()

In [None]:
# How does removing the quality affect the r squared?
# The snapshot taken with quality included is `cheat_quality`; it is used for
# both figures of this cell.
quality_gain = cheat_quality['adj_r2'] - no_cheating['adj_r2']

plt.plot(cheat_quality['adj_r2'], label='With quality')
plt.plot(cheat_RC5['adj_r2'], label='With recognition year 5')
plt.plot(no_cheating['adj_r2'], label='No cheating')
print("Average difference in r squared", sum(quality_gain)/len(cheat_quality))
plt.legend()
plt.show()

# Per-cohort difference together with its average as a flat reference line.
avg = sum(quality_gain)/len(no_cheating)
plt.plot(quality_gain, label='Difference')
plt.plot(no_cheating.index, [avg]*len(no_cheating), label='Average diff')
plt.title("Difference between quality(15y) and recognition(3y)")
plt.legend()
plt.show()

#### scaler diffs

In [None]:
std_scaler = feature_table2.transpose().copy()

In [None]:
rob_scaler = feature_table2.transpose().copy()

In [None]:
# How does changing the scaler affect the r squared?
adj_r2_std = std_scaler['adj_r2']
adj_r2_rob = rob_scaler['adj_r2']

plt.plot(adj_r2_std, label='Std')
plt.plot(adj_r2_rob, label='Rob')
print("Average difference in r squared", sum(adj_r2_std - adj_r2_rob) / len(rob_scaler))
plt.legend()
plt.show()
# almost no difference

# Per-cohort difference plus its average drawn as a flat line.
avg = sum(adj_r2_std - adj_r2_rob) / len(std_scaler)
plt.plot(adj_r2_std - adj_r2_rob, label='Difference')
plt.plot(std_scaler.index, [avg] * len(std_scaler), label='Average diff')
plt.legend()
plt.show()

In [None]:
# feature_table3.transpose()

### Best feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE, RFECV
from collections import defaultdict

In [None]:
def show_kbest(data_frame, target, linear_rel=True, k=4):
    """
    Score every feature with SelectKBest and report which k were selected
    :param data_frame: A pandas dataFrame with the feature columns
    :param target: target values the features are scored against
    :param linear_rel: True scores with f_regression, anything else with
    mutual_info_regression
    :param k: desired number of features from the data
    :returns feature_scores: pandas DataFrame with one row per feature: its
    score, the selector's pvalues_, the selection mask and the column name
    """
    if linear_rel == True:
        score_func, col_name = f_regression, "F Score"
    else:
        score_func, col_name = mutual_info_regression, "Mutual Information"

    fitted = SelectKBest(score_func, k=k).fit(data_frame, target)

    return pd.DataFrame({
        col_name: fitted.scores_,
        "P Value": fitted.pvalues_,
        "Support": fitted.get_support(),
        "Attribute": data_frame.columns,
    })

def get_features_rfe(data_frame, target, model,k=5):
    """
    Run recursive feature elimination (RFE) and report which features survive
    :param data_frame: A pandas dataFrame with the feature columns
    :param target: target values the model is fitted on
    :param model: estimator handed to RFE
    :param k: number of features to keep
    :returns df: pandas DataFrame with a "feature" column (the column names)
    and a boolean "support" column (True for the k selected features)
    """
    # n_features_to_select is passed by keyword: recent scikit-learn releases
    # no longer accept it as a positional argument.
    selector = RFE(model, n_features_to_select=k, step=1)
    selector = selector.fit(data_frame, target)
    return pd.DataFrame({
        "feature": data_frame.columns,
        "support": selector.support_
    })

def get_features_rfecv(data_frame, target, model, cv=3):
    """
    Run cross-validated recursive feature elimination (RFECV) and report
    which features it keeps
    :param data_frame: A pandas dataFrame with the feature columns
    :param target: target values the model is fitted on
    :param model: estimator handed to RFECV
    :param cv: cross-validation setting forwarded to RFECV
    :returns df: pandas DataFrame with a "feature" column (the column names)
    and a boolean "support" column (True for the selected features)
    """
    fitted = RFECV(model, step=1, cv=cv).fit(data_frame, target)
    return pd.DataFrame({"feature": data_frame.columns, "support": fitted.support_})

In [None]:
# Distinct cohort start years in ascending order.
years = sorted(credible_authors.start_year.unique())

In [None]:
df = credible_authors.copy()

In [None]:
categorical_cols = ['gender']

# Replace every categorical column by its dummy columns in one step. Each listed
# column is encoded exactly once and the result is a new frame rather than an
# in-place edit. Dummy names are '<column>_<value>'.
df = pd.get_dummies(df, columns=categorical_cols)

#### Show k best - F regression or mutual information

In [None]:
linear = True
# true - fregression
# false - mutual info

In [None]:
# The window settings and the feature list are the same for every cohort,
# so they are defined once instead of on every pass of the loop.
EARLY_CAREER = 3
RECOGNITION_CUT = 5
kbest_feature_cols = [
    'max_absence-0-3', 'avg_absence-0-3',
    'gender_f', 'gender_m', 'gender_none',
    f'early_career_degree_{EARLY_CAREER}',
    f'early_career_prod_{EARLY_CAREER}',
    f'early_career_qual_{EARLY_CAREER}',
    f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}',
]

# One SelectKBest score table per cohort, in the order of `years`.
params = []
for year in years:
    df_year = df[df.start_year == year]
    X_year = df_year[kbest_feature_cols]
    y_year = df_year['succ_after_15y']
    params.append(show_kbest(X_year, y_year, linear, 5))

In [None]:
params[0]

In [None]:
# Count in how many cohorts each feature was selected; 'total' is the number
# of cohorts looked at.
selected_f = defaultdict(int)
for param in params:
    chosen = param.loc[param.Support == True, 'Attribute'].values
    selected_f['total'] += 1
    for attribute in chosen:
        selected_f[attribute] += 1

In [None]:
selected_f

#### RFE

In [None]:
# The window settings and the feature list are the same for every cohort,
# so they are defined once instead of on every pass of the loop.
EARLY_CAREER = 3
RECOGNITION_CUT = 5
rfe_feature_cols = [
    # the absence features ('max_absence-0-3', 'avg_absence-0-3') are left out here
    'gender_f', 'gender_m', 'gender_none',
    f'early_career_degree_{EARLY_CAREER}',
    f'early_career_prod_{EARLY_CAREER}',
    f'early_career_qual_{EARLY_CAREER}',
    f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}',
]

# One RFE support table per cohort, in the order of `years`.
params_rfe = []
for year in years:
    df_year = df[df.start_year == year]
    X_year = df_year[rfe_feature_cols]
    y_year = df_year['succ_after_15y']
    params_rfe.append(get_features_rfe(X_year, y_year, LinearRegression(),k=5))

In [None]:
# Count in how many cohorts RFE kept each feature; 'total' is the number of
# cohorts looked at.
selected_f = defaultdict(int)
for param in params_rfe:
    chosen = param.loc[param.support == True, 'feature'].values
    selected_f['total'] += 1
    for feature in chosen:
        selected_f[feature] += 1

In [None]:
selected_f

#### RFE CV

In [None]:
# The window settings and the feature list are the same for every cohort,
# so they are defined once instead of on every pass of the loop.
EARLY_CAREER = 3
RECOGNITION_CUT = 5
rfecv_feature_cols = [
    'max_absence-0-3', 'avg_absence-0-3',
    'gender_f', 'gender_m', 'gender_none',
    f'early_career_degree_{EARLY_CAREER}',
    f'early_career_prod_{EARLY_CAREER}',
    f'early_career_qual_{EARLY_CAREER}',
    f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}',
]

# One RFECV support table per cohort, in the order of `years`.
params_rfecv = []
for year in years:
    df_year = df[df.start_year == year]
    X_year = df_year[rfecv_feature_cols]
    y_year = df_year['succ_after_15y']
    params_rfecv.append(get_features_rfecv(X_year, y_year, LinearRegression(),cv=3))

In [None]:
# Count in how many cohorts RFECV kept each feature; 'total' is the number of
# cohorts looked at.
selected_f = defaultdict(int)
for param in params_rfecv:
    chosen = param.loc[param.support == True, 'feature'].values
    selected_f['total'] += 1
    for feature in chosen:
        selected_f[feature] += 1

In [None]:
selected_f

### Null experiment

In [None]:
citations_per_year = pd.read_csv('derived-data/paper-citation-count.csv', header=None, names=['pub_id', 'cit_count'])

In [None]:
publications = pd.read_csv('derived-data/author-publications.csv')

In [None]:
# publications.sort_values(by='author').head()

In [None]:
# remove authors by career_len, and add start year
publications = publications.merge(credible_authors[['author', 'start_year']], on='author')

In [None]:
# Keep only publications from the first MAX_CAREER_LEN years after the author's
# start year. The bound must use start_year: comparing `year` with
# `year + MAX_CAREER_LEN` is true for every row and filters nothing.
publications = publications[publications.year <= publications.start_year + MAX_CAREER_LEN]

In [None]:
# citations_per_year.head()

In [None]:
# Null experiment: randomly permute the publication ids over the rows.
# The shuffled values are assigned as a plain array, i.e. by position. A
# re-indexed Series would be aligned on index labels and would misassign ids or
# leave NaN whenever `publications` does not carry a plain 0..n-1 index.
# NOTE: no random_state is passed, so every run gives a different permutation.
publications['pub_id'] = shuffle(publications['pub_id'].to_numpy())

In [None]:
# publications.sort_values(by='author').head()

In [None]:
# Attach the citation count of the (shuffled) publication id to each row.
# Left join keeps every row; all missing values in the result become 0.
publications = (
    publications
    .merge(citations_per_year, on='pub_id', how='left')
    .fillna(0)
)

In [None]:
publications.sort_values(by='author').head(20)

In [None]:
credible_authors[credible_authors.author == "a min tjoa"]['succ_after_15y']

In [None]:
# Index the author table by author name. Guarded so that running the cell a
# second time is a no-op instead of a KeyError on the already-moved column.
if credible_authors.index.name != 'author':
    credible_authors.set_index('author', inplace=True)

In [None]:
credible_authors['succ_shuffled'] = publications.groupby('author')['cit_count'].sum()

In [None]:
credible_authors[['succ_shuffled', 'succ_after_15y']].head()

In [None]:
credible_authors.columns