In [132]:
from __future__ import division

from faculty_hiring.misc.util import *
from faculty_hiring.misc.plotting import *  # Definitions for LABEL_SIZE and such
from faculty_hiring.misc.gaussian_kde import gaussian_kde
from faculty_hiring.parse import faculty_parser, institution_parser
from faculty_hiring.misc.productivity import faculty_at_institution
from faculty_hiring.parse import load
from faculty_hiring.parse.nces import parse_phds_awarded
from faculty_hiring.misc.subfield import topic_descriptions, longer_topic_descriptions, num_topics 

from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.stats.stats import pearsonr, mannwhitneyu, ttest_ind
from scipy.stats import ttest_ind, ks_2samp, chi2_contingency, mannwhitneyu
from scipy.special import gamma as gammaf
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.cross_validation import StratifiedKFold, LeaveOneOut
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
from collections import Counter
from scipy import interp
import statsmodels.api as sm
import numpy as np
import math
import scipy as sp
import pandas as pd
import os
import mpld3
import palettable

sig_level = 0.05

# File locations
faculty_file = '/Users/allisonmorgan/Documents/faculty_hiring/publication_data/current_data/faculty_cs_CURRENT.txt'
inst_file = '/Users/allisonmorgan/Documents/faculty_hiring/publication_data/current_data/inst_cs_CURRENT.txt'
figures_dir = '/Users/allisonmorgan/Code/src/github.com/allisonmorgan/faculty_hiring/notebooks-productivity/figures/'
#nces_table = '/Users/samway/Documents/Work/ClausetLab/projects/faculty_hiring/data/nces_table.txt'

# (Optional, these are for loading publication profiles)
dblp_dir = '/Users/allisonmorgan/Documents/faculty_hiring/publication_data/profiles_DBLP_Nov16/'

In [48]:
inst = institution_parser.parse_institution_records(open(inst_file))
asst_faculty = load.load_assistant_profs(open(faculty_file), inst)
ignore = ['All others', 'UNKNOWN', 'CUNY Graduate Center']

load.load_all_publications(asst_faculty, dblp_dir)

In [49]:
valid_titles = ['Assistant Professor', 'Associate Professor', 'Full Professor']

In [50]:
followup_data = '/Users/allisonmorgan/Code/src/github.com/allisonmorgan/faculty_hiring/notebooks-productivity/Finalized_Spreadsheet_Nov21.tsv'

def load_followup_data(faculty, followup_file):
    """ Loads the follow-up spreadsheet data collected by BuffTurkers in Nov2016 """
    df = pd.read_csv(followup_file, sep='\t')
    profiles_updated = 0
    
    for person in faculty:
        # When collection assignements are created, these fields are written out:
        # person.facultyName, person.place, person.current
        num_records = len(df[df['Faculty name'] == person.facultyName])
        if num_records == 0: # No matches, this is fine
            continue  
        elif num_records == 1: # One match, this is great
            row = df[df['Faculty name'] == person.facultyName]
            if row['Institution (2011)'].iloc[0] != person.place:
                continue
            person['followup_title'] = row['Job Title (2016)'].iloc[0]
            person['followup_location'] = row['Institution (2016)'].iloc[0]
            person['followup_department'] = row['Department (2016)'].iloc[0]
            profiles_updated += 1
        else: # Multiple matches, this... is bad. Blow up.
            raise ValueError('Multiple records detected!')
            
    assert (profiles_updated <= len(df))
    print 'Updated %d profiles!' % (profiles_updated)

load_followup_data(asst_faculty, followup_data)

Updated 555 profiles!


## Compare with Publications

In [51]:
trajectories = Counter()
for person in asst_faculty:
    if 'followup_title' in person and 'current' in person:
        key = "%s -> %s" % (person['current'], person['followup_title'])
        trajectories[key] += 1
trajectories

Counter({'Assistant Professor -> Assistant Professor': 104,
         'Assistant Professor -> Associate Professor': 366,
         'Assistant Professor -> Full Professor': 17,
         'Assistant Professor -> NOT TT': 68})

In [52]:
# Number of publications for all current assistant faculty of a university. Adjusted for inflation
# Set limit_by_years_post_hire flag to True to consider only the contributions within the first 5 
# years of a researcher's career.
limit_by_years_post_hire = True
ignore = ['All others', 'UNKNOWN', 'CUNY Graduate Center']
pubs_by_inst = {}
pubs_by_year_by_inst = {}
for (name, inst_data) in inst.items():
    if name in ignore:
        continue
    current_faculty = faculty_at_institution(name, asst_faculty)
    n_pubs = []; n_pubs_by_year = [];
    for person in current_faculty:
        start = person.first_asst_job_year
        if limit_by_years_post_hire:
            end = np.min([2012, person.first_asst_job_year + 5])
        else:
            end = 2012
        career_length = end - start
        
        if person.__contains__('dblp_pubs'):
            contribution = 0
            for pub in person.dblp_pubs:
                if 'year' in pub \
                      and pub['year'] >= start \
                      and pub['year'] < end \
                      and pub['pub_type'] in ['inproceedings', 'article']:
                    contribution += inflation_adjust(pub['year'])
            
            n_pubs.append((person.facultyName, contribution))
            n_pubs_by_year.append((person.facultyName, contribution/float(career_length)))
    if len(n_pubs) > 0:
        pubs_by_inst[name] = n_pubs
        pubs_by_year_by_inst[name] = n_pubs_by_year

In [113]:
# Compare the average publication rate of a person in their first five years, 
# to the average publication rate of faculty within their first five years at
# both the institution they came from and went to
publication_trajectories = []
for person in asst_faculty:
    if 'followup_title' in person and person.__contains__('dblp_pubs'):
        initial_institution = person.current_job()[0]
        final_institution = person['followup_location']
        
        # Get average publication rate of the young faculty at either institution
        # This will skip anyone who left academia
        if pubs_by_year_by_inst.has_key(initial_institution) and pubs_by_year_by_inst.has_key(final_institution):
            initial_counts = [count for (_, count) in pubs_by_year_by_inst[initial_institution]]
            initial_scatter, initial_pub_rate, initial_pub_std = sp.stats.lognorm.fit(initial_counts)
            
            final_counts = [count for (_, count) in pubs_by_year_by_inst[final_institution]]
            final_scatter, final_pub_rate, final_pub_std = sp.stats.lognorm.fit(final_counts)
            
            # Get this faculty member's publication rate
            start = person.current_job()[1]
            end = np.min([2012, start + 5])
            career_length = end - start
        
            contribution = 0
            for pub in person.dblp_pubs:
                if 'year' in pub \
                      and pub['year'] >= start \
                      and pub['year'] < end \
                      and pub['pub_type'] in ['inproceedings', 'article']:
                    contribution += inflation_adjust(pub['year'])
            personal_pub_rate = (contribution/float(career_length))
            
            publication_trajectories.append((personal_pub_rate, (initial_scatter, initial_pub_rate, initial_pub_std), (final_scatter, final_pub_rate, final_pub_std)))

In [139]:
# How many times was the final institution a better fit?
final = 0; num_people = 0; 
for (personal, (initial_scatter, initial_mu, initial_sigma), (final_scatter, final_mu, final_sigma)) in publication_trajectories:
    likelihood_of_final = sp.stats.lognorm.pdf(personal, initial_scatter, loc=initial_mu, scale=final_sigma)
    likelihood_of_initial = sp.stats.lognorm.pdf(personal, final_scatter, loc=final_mu, scale=initial_sigma)
    
    #print(likelihood_of_final, likelihood_of_initial)
    if likelihood_of_final >= likelihood_of_initial:
        final += 1
    num_people += 1

In [140]:
final/num_people

0.9208333333333333