In [4]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import numpy as np

In [5]:
CAREER_LENGTH = 1

#EARLY_CAREER_LEN_LIST = [1, 2, 3, 4, 5]
EARLY_CAREER_LEN_LIST = [3]
#RECOGNITION_CUT_OFF_LIST = [3, 4, 5, 6, 7, 8, 9]
RECOGNITION_CUT_OFF_LIST = [5]

MAX_CAREER_LEN = 15
END_YEAR = 2018

In [6]:
credible_authors = pd.read_csv('derived-data/authors-scientific-extended.csv')

In [7]:
credible_authors.columns

Index(['author', 'start_year', 'end_year', 'total_num_pub', 'career_length',
       'max_absence-0-15', 'avg_absence-0-15', 'dropped_after_10', 'gender',
       'early_career_degree_3', 'early_career_qual_3', 'succ_after_15y',
       'early_career_prod_3', 'early_career_coauthor_max_cit_3',
       'early_career_recognition_EC3_RC3', 'early_career_recognition_EC3_RC5',
       'h-index_15', 'h-index_3'],
      dtype='object')

In [8]:
# Keep only authors whose career spans at least CAREER_LENGTH years.
# .copy() detaches the result from the unfiltered frame so that the column
# assignments in the following cells write to a real frame, not a slice view.
credible_authors = credible_authors[credible_authors.career_length >= CAREER_LENGTH].copy()

In [9]:
credible_authors['citation_increase_15_3'] = credible_authors['succ_after_15y'] - credible_authors[
    'early_career_recognition_EC3_RC3']

In [10]:
credible_authors['h_index_increase_15_3'] = credible_authors['h-index_15'] - credible_authors['h-index_3']

## Data first look

In [None]:
credible_authors.columns

In [None]:
credible_authors.head()

## Correlations

In [None]:
# Correlate numeric/boolean columns only: the frame also holds string columns
# (e.g. 'author', 'gender'), which recent pandas versions refuse to correlate.
cor_qual = credible_authors.select_dtypes(include=['number', 'bool']).corr()

In [None]:
cor_qual
#cor_qual['succ_after_15y'].sort_values()

In [None]:
#cor_rec = credible_authors[['early_career_recognition_EC1_RC3', 'early_career_recognition_EC1_RC5',
#       'early_career_recognition_EC1_RC7', 'early_career_recognition_EC1_RC9',
#       'early_career_recognition_EC2_RC3', 'early_career_recognition_EC2_RC5',
#       'early_career_recognition_EC2_RC7', 'early_career_recognition_EC2_RC9',
#       'early_career_recognition_EC3_RC3', 'early_career_recognition_EC3_RC5',
#       'early_career_recognition_EC3_RC7', 'early_career_recognition_EC3_RC9',
#       'early_career_recognition_EC4_RC5', 'early_career_recognition_EC4_RC7',
#       'early_career_recognition_EC4_RC9', 'early_career_recognition_EC5_RC5',
#       'early_career_recognition_EC5_RC7', 'early_career_recognition_EC5_RC9',
#       'succ_after_15y']].corr()

In [None]:
#cor_rec['succ_after_15y'].sort_values()

In [None]:
# Correlate numeric/boolean columns only: the frame also holds string columns
# (e.g. 'author', 'gender'), which recent pandas versions refuse to correlate.
cor = credible_authors.select_dtypes(include=['number', 'bool']).corr()

In [None]:
cor['citation_increase_15_3'].sort_values()

In [None]:
sns.heatmap(cor, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

## Linear reg

In [11]:
from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import r2_score

from sklearn.model_selection import cross_val_score

from sklearn.utils import shuffle

### Test different predictors

In [12]:
# Test different early-career lengths

In [13]:
year = 1995

credible_authors_1991 = credible_authors[credible_authors.start_year == year]

In [14]:
X = credible_authors_1991.copy()


In [15]:
categorical_cols = ['gender']

# One-hot encode all categorical columns in a single call and drop the originals.
# Built from `credible_authors_1991` rather than from X itself, so the cell can
# be re-run without failing on an already-removed 'gender' column.
X = (
    credible_authors_1991
    .join(pd.get_dummies(credible_authors_1991[categorical_cols]))
    .drop(categorical_cols, axis=1)
)

In [None]:
def run_linear(func, name, y_col='succ_after_15y'):
    """
    Fit `func` on the full cohort and report its in-sample score.

    Iterates over every (EARLY_CAREER, RECOGNITION_CUT) pair taken from the
    notebook-level lists EARLY_CAREER_LEN_LIST and RECOGNITION_CUT_OFF_LIST,
    skipping pairs where the recognition cut-off is shorter than the early
    career window. Features and target are read from the notebook-level frame X.

    :param func: estimator exposing fit(H, y) (returning the fitted estimator)
                 and score(H, y)
    :param name: suffix used for the score column, 'r_squared_<name>'
    :param y_col: name of the target column in X
    :returns: DataFrame with columns 'params' and 'r_squared_<name>',
              one row per evaluated parameter pair
    """
    score_col = f'r_squared_{name}'
    records = []
    for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
        for RECOGNITION_CUT in RECOGNITION_CUT_OFF_LIST:
            if RECOGNITION_CUT < EARLY_CAREER:
                continue
            H = X[[
                #'max_absence-0-3', 'avg_absence-0-3',
                   'gender_f', 'gender_m', 'gender_none',
                   f'early_career_degree_{EARLY_CAREER}',
                   f'early_career_prod_{EARLY_CAREER}',
                   f'early_career_qual_{EARLY_CAREER}', f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']]
            # The target is taken from X explicitly instead of relying on a
            # global `y` left behind by some other cell.
            y = X[y_col]
            reg = func.fit(H, y)
            records.append({'params': f'EC:{EARLY_CAREER},REC:{RECOGNITION_CUT}',
                            score_col: reg.score(H, y)})
    # Build the frame once; row-wise DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['params', score_col])

In [None]:
def run_cv(func, name, cv, y_col='succ_after_15y'):
    """
    Cross-validate `func` and report the mean r2 over the folds.

    Iterates over every (EARLY_CAREER, RECOGNITION_CUT) pair taken from the
    notebook-level lists EARLY_CAREER_LEN_LIST and RECOGNITION_CUT_OFF_LIST,
    skipping pairs where the recognition cut-off is shorter than the early
    career window. Features and target are read from the notebook-level frame X.

    :param func: estimator passed to cross_val_score
    :param name: suffix used for the score column, 'r_squared_<name>'
    :param cv: cross-validation setting forwarded to cross_val_score
    :param y_col: name of the target column in X
    :returns: DataFrame with columns 'params' and 'r_squared_<name>',
              one row per evaluated parameter pair
    """
    score_col = f'r_squared_{name}'
    records = []
    for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
        for RECOGNITION_CUT in RECOGNITION_CUT_OFF_LIST:
            if RECOGNITION_CUT < EARLY_CAREER:
                continue
            H = X[[
                #'max_absence-0-3', 'avg_absence-0-3',
                   'gender_f', 'gender_m', 'gender_none',
                   f'early_career_degree_{EARLY_CAREER}',
                   f'early_career_prod_{EARLY_CAREER}',
                   f'early_career_qual_{EARLY_CAREER}', f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']]
            y = X[y_col]
            score = np.mean(cross_val_score(func, H, y, cv=cv, scoring='r2'))
            records.append({'params': f'EC:{EARLY_CAREER},REC:{RECOGNITION_CUT}',
                            score_col: score})
    # Build the frame once; row-wise DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['params', score_col])

In [None]:
df1 = run_cv(LinearRegression(), 'linear', cv=3)

In [None]:
# df1_null = run_cv(LinearRegression(), 'linear_null', cv=3, y_col='succ_shuffled')

In [None]:
df2 = run_cv(ElasticNet(), 'elastic', cv=3)

In [None]:
df3 = run_cv(ElasticNetCV(cv=3), 'elastic_CV', cv=3)

In [None]:
df4 = run_cv(Lasso(alpha=0.1), 'lasso', cv=3)

In [None]:
# The decision tree overfits badly. Maybe try a grid search over its hyper-parameters?
df5 = run_cv(DecisionTreeRegressor(), 'tree', cv=3)

In [None]:
df6 = run_cv(RandomForestRegressor(), 'forest', cv=3)

In [None]:
# df6_null = run_cv(RandomForestRegressor(), 'forest_null', cv=3, y_col='succ_shuffled')

In [None]:
# Index every score table by its parameter string so they can be joined side by side.
# set_index returns new frames here (df1..df6 stay untouched), so this cell can be
# re-run without failing on an already-consumed 'params' column.
dfs = [scores.set_index('params') for scores in (df1, df2, df3, df4, df5, df6)]  # df1_null, df6_null

In [None]:
dfs[0].join(dfs[1:])

### Elastic net

In [16]:
years = sorted(credible_authors.start_year.unique())
cohort_start_years = [y for y in years if y < (END_YEAR - MAX_CAREER_LEN)]
EARLY_CAREER = EARLY_CAREER_LEN_LIST[0]
RECOGNITION_CUT = RECOGNITION_CUT_OFF_LIST[0]


In [17]:
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [90]:
cols_std = [f'early_career_degree_{EARLY_CAREER}',
       f'early_career_prod_{EARLY_CAREER}', f'early_career_coauthor_max_cit_{EARLY_CAREER}',
       f'early_career_qual_{EARLY_CAREER}', 
#             f'early_career_recognition_EC3_RC3',
#             f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}'
           ] 
cols_all = [f'early_career_degree_{EARLY_CAREER}',
       f'early_career_prod_{EARLY_CAREER}', f'early_career_coauthor_max_cit_{EARLY_CAREER}',
       f'early_career_qual_{EARLY_CAREER}', 
#             f'early_career_recognition_EC3_RC3',
#         f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}',
       'gender_m', 'gender_f',  'intercept', 'r2', 'cohort_size'] #'gender_none', 

categorical_cols = ['gender']
num_splits = 5

In [166]:
def _summarise_cv(cv_result, feature_names, year, cohort_size, coef_of=lambda est: est.coef_):
    """
    Collapse one cross_validate() result into a single-column table for `year`.

    :param cv_result: dict returned by cross_validate(..., return_estimator=True)
    :param feature_names: column labels of the design matrix the estimators were fit on
    :param year: cohort start year, used as the name of the value column
    :param cohort_size: number of authors in the cohort
    :param coef_of: extracts the 1-D coefficient vector from a fitted estimator
    :returns: DataFrame indexed by feature name (index named 'year', as the join
              below expects) holding a (mean, std) tuple per coefficient,
              rounded to 4 decimals and taken over the folds, followed by the
              scalar rows 'intercept', 'r2' (mean test score) and 'cohort_size'
    """
    fitted = cv_result['estimator']
    fold_coefs = pd.DataFrame([coef_of(est) for est in fitted], columns=feature_names)
    values = list(zip(np.round(fold_coefs.mean().values, 4),
                      np.round(fold_coefs.std().values, 4)))
    values.extend([np.mean([est.intercept_ for est in fitted]),
                   np.mean(cv_result['test_score']),
                   cohort_size])
    labels = np.append(feature_names, ['intercept', 'r2', 'cohort_size'])
    return pd.DataFrame(list(zip(labels, values)), columns=['year', year]).set_index('year')


# One table per target; rows are features, one column is appended per cohort year.
feature_table = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')   # h-index increase
feature_table2 = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')  # citation increase
feature_table3 = pd.DataFrame(cols_all, columns=['feature']).set_index('feature')  # dropped after 10 years

for year in cohort_start_years:
    credible_authors_year = credible_authors[credible_authors.start_year == year]

    X = credible_authors_year.copy()

    # Make dummy values for categorical columns; 'gender_none' is left out of the model.
    # errors='ignore' keeps the loop running for a cohort that has no such category.
    gender_cols = pd.get_dummies(X[categorical_cols]).drop('gender_none', axis=1, errors='ignore')

    # Scale the numeric predictors (cols_std) per cohort
    standardized_cols = RobustScaler().fit_transform(X[cols_std])

    # Combine scaled predictors and gender dummies into the design matrix
    H = pd.DataFrame(standardized_cols, index=X.index, columns=cols_std)
    H = H.join(gender_cols)

    y = X['h_index_increase_15_3']
    y2 = X['citation_increase_15_3']
    y3 = X['dropped_after_10'].astype(int)

    # Outer folds come from num_splits (set in the configuration cell above);
    # the *CV estimators tune their regularisation with their own inner 3-fold CV.
    cv_dict = cross_validate(ElasticNetCV(cv=3), H, y, scoring='r2', cv=num_splits,
                             return_estimator=True, return_train_score=False)
    cv2_dict = cross_validate(ElasticNetCV(cv=3), H, y2, scoring='r2', cv=num_splits,
                              return_estimator=True, return_train_score=False)
    cv3_dict = cross_validate(LogisticRegressionCV(cv=3), H, y3, scoring="roc_auc", cv=num_splits,
                              return_estimator=True, return_train_score=False)

    cohort_size = len(y2)

    year_data = _summarise_cv(cv_dict, H.columns, year, cohort_size)
    year_data2 = _summarise_cv(cv2_dict, H.columns, year, cohort_size)
    # The logistic model stores its coefficients as a 2-D array; take the first row.
    # Its 'r2' row actually holds the mean ROC AUC (see the scoring argument above).
    year_data3 = _summarise_cv(cv3_dict, H.columns, year, cohort_size,
                               coef_of=lambda est: est.coef_[0])

    feature_table = feature_table.join(year_data)
    feature_table2 = feature_table2.join(year_data2)
    feature_table3 = feature_table3.join(year_data3)

#### Feature tables

In [167]:
# Years as rows, features as columns.
results = feature_table.transpose()

# Shorten column names: 'early_career' -> 'ec', then two manual abbreviations.
new_cols = {col: col.replace('early_career', 'ec') for col in results.columns}
manual_abbreviations = {'ec_coauthor_max_cit_3': 'ec_coauth_max_cit_3',
                        'ec_recognition_EC3_RC5': 'ec_recog_EC3_RC5'}

results = results.rename(columns=new_cols).rename(columns=manual_abbreviations)

In [168]:
results2 = feature_table2.transpose()
results2.rename(new_cols, axis='columns', inplace=True)
results2.rename({'ec_coauthor_max_cit_3': 'ec_coauth_max_cit_3', 'ec_recognition_EC3_RC5':'ec_recog_EC3_RC5'}, axis='columns', inplace=True)

In [169]:
results3 = feature_table3.transpose()
results3.rename(new_cols, axis='columns', inplace=True)
results3.rename({'ec_coauthor_max_cit_3': 'ec_coauth_max_cit_3', 'ec_recognition_EC3_RC5':'ec_recog_EC3_RC5', 'r2':'aoc_roc'}, axis='columns', inplace=True)

In [170]:
results
# h index increase

feature,ec_degree_3,ec_prod_3,ec_coauth_max_cit_3,ec_qual_3,gender_m,gender_f,intercept,r2,cohort_size
1970,"(-0.004, 0.0089)","(0.3024, 0.0242)","(0.0043, 0.002)","(0.1322, 0.0118)","(0.0, 0.0)","(0.0, 0.0)",0.472821,0.252617,763
1971,"(0.0016, 0.0036)","(0.3842, 0.0405)","(0.0077, 0.0026)","(0.1014, 0.013)","(-0.0159, 0.0356)","(0.0, 0.0)",0.43754,0.234932,981
1972,"(-0.0529, 0.0083)","(0.3521, 0.0386)","(0.0209, 0.0035)","(0.1068, 0.018)","(0.0067, 0.0102)","(0.0, 0.0)",0.486,0.315058,1207
1973,"(0.0, 0.0)","(0.4173, 0.0426)","(0.0097, 0.002)","(0.1353, 0.0168)","(0.016, 0.0359)","(0.0, 0.0)",0.472783,0.301524,1400
1974,"(0.0004, 0.0011)","(0.4674, 0.0393)","(0.0078, 0.0012)","(0.0278, 0.0063)","(0.0403, 0.0437)","(0.0, 0.0)",0.382927,0.314848,1830
1975,"(0.0084, 0.0077)","(0.5271, 0.0182)","(0.0022, 0.0005)","(0.0372, 0.0079)","(0.0643, 0.0402)","(-0.0147, 0.0221)",0.394871,0.294794,1599
1976,"(0.0132, 0.0128)","(0.6149, 0.0447)","(0.0077, 0.002)","(0.0248, 0.0084)","(0.0233, 0.0201)","(-0.0078, 0.0174)",0.423446,0.251366,1996
1977,"(0.0021, 0.0079)","(0.4631, 0.0967)","(0.0122, 0.0042)","(0.0769, 0.0085)","(0.0537, 0.1201)","(-0.0151, 0.0338)",0.638291,0.182584,1929
1978,"(-0.067, 0.004)","(0.5237, 0.0362)","(0.0176, 0.0015)","(0.0883, 0.013)","(0.0264, 0.02)","(-0.2075, 0.0216)",0.546942,0.289341,1907
1979,"(-0.0204, 0.0209)","(0.6293, 0.0554)","(0.0207, 0.0029)","(0.0692, 0.0113)","(0.0129, 0.0287)","(0.0, 0.0)",0.623231,0.236591,2352


In [171]:
results2
# citation increase

feature,ec_degree_3,ec_prod_3,ec_coauth_max_cit_3,ec_qual_3,gender_m,gender_f,intercept,r2,cohort_size
1970,"(-0.5116, 0.344)","(4.0202, 0.8659)","(0.0482, 0.0558)","(7.4517, 0.5224)","(-0.0246, 0.0549)","(0.1002, 0.2141)",4.31868,0.3442,763
1971,"(-1.9881, 0.5748)","(7.1597, 1.2665)","(0.6844, 0.1121)","(7.5129, 0.5958)","(-1.2083, 1.1362)","(0.3777, 0.35)",1.6099,0.326153,981
1972,"(-2.8597, 0.2392)","(4.1514, 0.5967)","(1.4308, 0.2656)","(5.8608, 0.2309)","(0.0726, 0.1623)","(0.0013, 0.0028)",3.29327,0.517418,1207
1973,"(-0.7535, 0.3486)","(6.6863, 0.8308)","(0.3214, 0.0893)","(8.8849, 0.5057)","(0.0082, 0.4009)","(0.0, 0.0)",2.00213,0.444992,1400
1974,"(-0.514, 0.2149)","(4.5008, 0.6308)","(0.1828, 0.0464)","(4.2446, 0.3923)","(0.0813, 0.0784)","(0.0, 0.0)",1.94828,0.492613,1830
1975,"(-0.131, 0.141)","(7.308, 0.6714)","(0.038, 0.0362)","(5.967, 0.4829)","(0.4757, 0.3965)","(0.0, 0.0)",2.09648,0.599018,1599
1976,"(-1.3163, 0.6752)","(7.6514, 1.449)","(0.3894, 0.1574)","(4.5299, 0.275)","(0.0, 0.0)","(0.0, 0.0)",3.81921,0.573584,1996
1977,"(-0.7737, 0.7843)","(8.6916, 1.6313)","(0.6635, 0.3618)","(8.1059, 0.2889)","(0.2466, 1.0254)","(0.1069, 0.2199)",6.12134,0.355141,1929
1978,"(-2.7437, 0.1911)","(8.2188, 0.7062)","(0.7365, 0.0994)","(7.779, 0.4104)","(0.0542, 0.1212)","(-0.0416, 0.0922)",3.45566,0.454034,1907
1979,"(-1.9368, 0.4899)","(12.47, 1.537)","(0.6307, 0.1528)","(7.5026, 0.3839)","(0.6993, 0.5864)","(-0.1971, 0.319)",6.84644,0.364597,2352


In [172]:
results3
# coefficients are not exponentiated
# a positive coefficient means a bigger chance to drop out

feature,ec_degree_3,ec_prod_3,ec_coauth_max_cit_3,ec_qual_3,gender_m,gender_f,intercept,aoc_roc,cohort_size
1970,"(0.0608, 0.0449)","(-0.6923, 0.1367)","(-0.0001, 0.0032)","(-0.1433, 0.0226)","(-0.2291, 0.1632)","(-0.1853, 0.2779)",1.66297,0.730602,763
1971,"(0.1298, 0.0292)","(-1.1117, 0.1344)","(0.0031, 0.0073)","(-0.0535, 0.0181)","(-0.1881, 0.1159)","(-0.0364, 0.1229)",1.52003,0.741292,981
1972,"(0.3172, 0.0326)","(-0.7064, 0.0758)","(-0.023, 0.0089)","(-0.1066, 0.0194)","(-0.2717, 0.0913)","(-0.0493, 0.1621)",1.46119,0.728252,1207
1973,"(0.14, 0.0453)","(-0.7742, 0.0628)","(-0.0089, 0.0014)","(-0.1058, 0.0526)","(-0.1278, 0.1041)","(0.0753, 0.1249)",1.33548,0.713857,1400
1974,"(-0.0076, 0.0282)","(-0.8815, 0.0602)","(-0.0039, 0.0007)","(-0.0525, 0.0095)","(-0.1, 0.0831)","(0.0682, 0.106)",1.70144,0.730153,1830
1975,"(0.0586, 0.0182)","(-1.013, 0.0603)","(0.0028, 0.002)","(-0.0172, 0.0153)","(-0.2047, 0.0483)","(-0.0108, 0.0934)",1.50469,0.761717,1599
1976,"(-0.0207, 0.0181)","(-0.8526, 0.0517)","(-0.0049, 0.0022)","(-0.013, 0.0054)","(-0.2954, 0.0798)","(0.0622, 0.1808)",1.60503,0.71438,1996
1977,"(0.0146, 0.0281)","(-1.0203, 0.0267)","(-0.0019, 0.0038)","(-0.0426, 0.0341)","(-0.3936, 0.0935)","(0.0285, 0.1263)",1.46868,0.735793,1929
1978,"(0.0764, 0.0229)","(-0.8347, 0.0566)","(-0.0142, 0.0031)","(0.008, 0.015)","(-0.3082, 0.0385)","(0.3048, 0.0461)",1.36581,0.709366,1907
1979,"(0.0985, 0.0207)","(-0.9966, 0.0666)","(-0.0191, 0.0045)","(-0.0567, 0.015)","(-0.2551, 0.11)","(-0.1565, 0.1245)",1.41956,0.738867,2352


#### predictor diffs

In [None]:
h_index = feature_table.transpose().copy()
citations = feature_table2.transpose().copy()

In [None]:
# Compare the cross-validated r squared of the two regression targets per cohort.
r2_gap = citations['r2'] - h_index['r2']

plt.plot(h_index['r2'], label='Increase H index')
plt.plot(citations['r2'], label='Increase Citations')
print("Average difference in r squared", sum(r2_gap) / len(h_index['r2']))
# quality was used as a feature!
plt.legend()
plt.show()

#### gender diffs

In [None]:
# gender coefs
plt.plot(results3['gender_m'], label="Male")
plt.plot(results3['gender_f'], label="Female")
plt.legend()
plt.show()

In [None]:
# Coefficient cells are (mean, std) tuples, which cannot be subtracted directly;
# compare the fold means instead.
gender_m_mean = results3['gender_m'].str[0].astype(float)
gender_f_mean = results3['gender_f'].str[0].astype(float)

plt.plot(gender_m_mean - gender_f_mean, label="Male-Female diff")
# zero reference line
plt.plot(results3.index, np.zeros(len(results3)))
plt.legend()
plt.show()

#### cohort size diffs

In [None]:
# r squared (left axis) against cohort size (right axis) per cohort start year.
# The tables only carry the plain 'r2' row: the adjusted-r2 computation in the
# modelling loop is commented out, so no 'adj_r2' column exists to plot.
fig, ax1 = plt.subplots()
ax1.plot(results['r2'], label='r2', color='C2')
ax1.set_ylim([0,1])
ax1.set_xlabel('Years')
ax1.set_ylabel('R squared', color='C2')
ax1.legend(loc=2)

ax2 = ax1.twinx()
ax2.plot(results['cohort_size'], label='Cohort size', color='C3')
ax2.set_ylabel('Cohort size', color='C3')
ax2.legend(loc=4)
plt.show()

#### cheating diffs

In [None]:
no_cheating = feature_table2.transpose().copy()

In [None]:
cheat_RC5 = feature_table2.transpose().copy()

In [None]:
cheat_quality = feature_table2.transpose().copy()

In [None]:
# How does removing the quality affect the r squared?
plt.plot(with_quality['adj_r2'], label='With quality')
plt.plot(cheat_RC5['adj_r2'], label='With recognition year 5')
plt.plot(no_cheating['adj_r2'], label='No cheating')
print("Average difference in r squared", sum(with_quality['adj_r2']-no_cheating['adj_r2'])/len(cheat_quality))
print(np.mean)
plt.legend()
plt.show()

avg = sum(cheat_quality['adj_r2']-no_cheating['adj_r2'])/len(no_cheating)
plt.plot(cheat_quality['adj_r2']-no_cheating['adj_r2'], label='Difference')
plt.plot(no_cheating.index, [avg]*len(no_cheating), label='Average diff')
plt.title("Difference between quality(15y) and recognition(3y)")
plt.legend()
plt.show()

#### scaler diffs

In [None]:
std_scaler = feature_table2.transpose().copy()

In [None]:
rob_scaler = feature_table2.transpose().copy()

In [None]:
# How does changing the scaler affect the r squared?
plt.plot(std_scaler['adj_r2'], label='Std')
plt.plot(rob_scaler['adj_r2'], label='Rob')
print("Average difference in r squared", sum(std_scaler['adj_r2']-rob_scaler['adj_r2'])/len(rob_scaler))
plt.legend()
plt.show()
# almost no difference

avg = sum(std_scaler['adj_r2']-rob_scaler['adj_r2'])/len(std_scaler)
plt.plot(std_scaler['adj_r2']-rob_scaler['adj_r2'], label='Difference')
plt.plot(std_scaler.index, [avg]*len(std_scaler), label='Average diff')
plt.legend()
plt.show()

In [None]:
# feature_table3.transpose()

### Best feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE, RFECV
from collections import defaultdict

In [None]:
def show_kbest(data_frame, target, linear_rel=True, k=4):
    """
    Score every feature with SelectKBest and flag the k best ones.
    :param data_frame: A pandas DataFrame holding the feature columns
    :param target: target values aligned with the rows of data_frame
    :param linear_rel: if true, score with f_regression (column "F Score"),
    otherwise with mutual_info_regression (column "Mutual Information")
    :param k: desired number of features from the data
    :returns feature_scores: pandas DataFrame with one row per feature and the
    columns <score>, "P Value", "Support" (selected or not) and "Attribute"
    (feature name)
    """
    if linear_rel:
        score_func, col_name = f_regression, "F Score"
    else:
        score_func, col_name = mutual_info_regression, "Mutual Information"

    feat_selector = SelectKBest(score_func, k=k).fit(data_frame, target)

    feat_scores = pd.DataFrame()
    feat_scores[col_name] = feat_selector.scores_
    # Copied straight from the fitted selector, whatever the score function provides.
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.columns

    return feat_scores

def get_features_rfe(data_frame, target, model,k=5):
    """
    Run recursive feature elimination (RFE) and report which features survive.
    :param data_frame: A pandas DataFrame holding the feature columns
    :param target: target values aligned with the rows of data_frame
    :param model: estimator handed to RFE
    :param k: number of features to keep
    :returns df: pandas DataFrame with one row per feature and the columns
    "feature" (name) and "support" (True if the feature was kept)
    """
    X = data_frame
    y = target
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # accept only the estimator positionally.
    selector = RFE(model, n_features_to_select=k, step=1)
    selector = selector.fit(X, y)
    df = pd.DataFrame({
        "feature": X.columns,
        "support": selector.support_
    })
    return df

def get_features_rfecv(data_frame, target, model, cv=3):
    """
    Run cross-validated recursive feature elimination (RFECV) and report
    which features are kept.
    :param data_frame: A pandas DataFrame holding the feature columns
    :param target: target values aligned with the rows of data_frame
    :param model: estimator handed to RFECV
    :param cv: cross-validation setting forwarded to RFECV
    :returns: pandas DataFrame with one row per feature and the columns
    "feature" (name) and "support" (True if the feature was kept)
    """
    fitted_selector = RFECV(model, step=1, cv=cv).fit(data_frame, target)
    return pd.DataFrame({
        "feature": data_frame.columns,
        "support": fitted_selector.support_
    })

In [None]:
years = credible_authors.start_year.unique()
years = sorted(years)

In [None]:
df = credible_authors.copy()

In [None]:
categorical_cols = ['gender']

# One-hot encode all categorical columns in a single call and drop the originals.
# Built from `credible_authors` rather than from df itself, so the cell can be
# re-run without failing on an already-removed 'gender' column.
df = (
    credible_authors
    .join(pd.get_dummies(credible_authors[categorical_cols]))
    .drop(categorical_cols, axis=1)
)

#### Show k best - F regression or mutual information

In [None]:
linear = True
# true - fregression
# false - mutual info

In [None]:
# Per-cohort SelectKBest scores; one result table per start year.
EARLY_CAREER = 3
RECOGNITION_CUT = 5

params = []
for year in years:
    df_year = df[df.start_year == year]
    df_year = df_year.drop('start_year', axis=1)
    # 'max_absence-0-3' / 'avg_absence-0-3' are not selected: the loaded table
    # only carries the '-0-15' variants, so asking for them raises a KeyError.
    X_year = df_year[[
           'gender_f', 'gender_m', 'gender_none',
           f'early_career_degree_{EARLY_CAREER}',
           f'early_career_prod_{EARLY_CAREER}',
           f'early_career_qual_{EARLY_CAREER}', f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']]
    y_year = df_year['succ_after_15y']
    params.append(show_kbest(X_year, y_year, linear, 5))

In [None]:
params[0]

In [None]:
# Count, over all cohorts, how often each attribute was among the selected ones.
# 'total' records the number of cohorts evaluated.
selected_f = defaultdict(int)
for kbest_table in params:
    selected_f['total'] += 1
    chosen = kbest_table.loc[kbest_table.Support == True, 'Attribute']
    for attribute in chosen.values:
        selected_f[attribute] += 1

In [None]:
selected_f

#### RFE

In [None]:
# Per-cohort recursive feature elimination with a plain linear model,
# keeping 5 features; one support table per start year.
params_rfe = []
for year in years:
    df_year = df[df.start_year == year].drop('start_year', axis=1)
    EARLY_CAREER = 3
    RECOGNITION_CUT = 5
    feature_names = [
        #'max_absence-0-3', 'avg_absence-0-3',
        'gender_f', 'gender_m', 'gender_none',
        f'early_career_degree_{EARLY_CAREER}',
        f'early_career_prod_{EARLY_CAREER}',
        f'early_career_qual_{EARLY_CAREER}',
        f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}',
    ]
    X_year = df_year[feature_names]
    y_year = df_year['succ_after_15y']
    support_table = get_features_rfe(X_year, y_year, LinearRegression(), k=5)
    params_rfe.append(support_table)

In [None]:
# Count, over all cohorts, how often RFE kept each feature.
# 'total' records the number of cohorts evaluated.
selected_f = defaultdict(int)
for support_table in params_rfe:
    selected_f['total'] += 1
    kept = support_table.loc[support_table.support == True, 'feature']
    for feature_name in kept.values:
        selected_f[feature_name] += 1

In [None]:
selected_f

#### RFE CV

In [None]:
# Per-cohort cross-validated recursive feature elimination with a plain linear
# model; one support table per start year.
EARLY_CAREER = 3
RECOGNITION_CUT = 5

params_rfecv = []
for year in years:
    df_year = df[df.start_year == year]
    df_year = df_year.drop('start_year', axis=1)
    # 'max_absence-0-3' / 'avg_absence-0-3' are not selected: the loaded table
    # only carries the '-0-15' variants, so asking for them raises a KeyError.
    X_year = df_year[[
           'gender_f', 'gender_m', 'gender_none',
           f'early_career_degree_{EARLY_CAREER}',
           f'early_career_prod_{EARLY_CAREER}',
           f'early_career_qual_{EARLY_CAREER}', f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}']]
    y_year = df_year['succ_after_15y']
    params_rfecv.append(get_features_rfecv(X_year, y_year, LinearRegression(),cv=3))

In [None]:
# Count, over all cohorts, how often RFECV kept each feature.
# 'total' records the number of cohorts evaluated.
selected_f = defaultdict(int)
for support_table in params_rfecv:
    selected_f['total'] += 1
    kept = support_table.loc[support_table.support == True, 'feature']
    for feature_name in kept.values:
        selected_f[feature_name] += 1

In [None]:
selected_f

### Null experiment

In [None]:
citations_per_year = pd.read_csv('derived-data/paper-citation-count.csv', header=None, names=['pub_id', 'cit_count'])

In [None]:
publications = pd.read_csv('derived-data/author-publications.csv')

In [None]:
# publications.sort_values(by='author').head()

In [None]:
# remove authors by career_len, and add start year
publications = publications.merge(credible_authors[['author', 'start_year']], on='author')

In [None]:
# Keep only papers published within MAX_CAREER_LEN years of the author's
# start_year (merged in above). The window must be measured from start_year:
# comparing `year` with `year + MAX_CAREER_LEN` is always true and filters nothing.
publications = publications[publications.year <= publications.start_year + MAX_CAREER_LEN]

In [None]:
# citations_per_year.head()

In [None]:
# Null model: randomly reassign publication ids across the rows.
# - random_state makes the permutation reproducible across runs.
# - .values assigns by position; assigning a re-indexed Series would align on
#   index labels and misplace ids whenever rows have been filtered out before.
publications['pub_id'] = shuffle(publications['pub_id'], random_state=0).values

In [None]:
# publications.sort_values(by='author').head()

In [None]:
publications = publications.merge(citations_per_year, on='pub_id', how='left')
publications = publications.fillna(0)

In [None]:
publications.sort_values(by='author').head(20)

In [None]:
credible_authors[credible_authors.author == "a min tjoa"]['succ_after_15y']

In [None]:
# Index by author so that the per-author sums below align on author name.
# Guarded so that re-running the cell does not fail once 'author' is the index.
if 'author' in credible_authors.columns:
    credible_authors.set_index('author', inplace=True)

In [None]:
credible_authors['succ_shuffled'] = publications.groupby('author')['cit_count'].sum()

In [None]:
credible_authors[['succ_shuffled', 'succ_after_15y']].head()

In [None]:
credible_authors.columns