In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# ---------------------------------------------------------------------------
# Analysis configuration. Each name is assigned exactly once; values used in
# earlier runs are kept as comments so the effective setting is unambiguous.
# ---------------------------------------------------------------------------

# Specify career len to export file for
# CAREER_LENGTH = 15

# Early-career lengths (in years). Impacts which papers we take into account
# for early productivity and quality; the feature loops below produce one set
# of columns per value in this list.
# Earlier runs: EARLY_CAREER_LEN = 3, EARLY_CAREER_LEN_LIST = [1, 2, 3, 4, 5]
EARLY_CAREER_LEN_LIST = [3, 5, 7, 9, 11, 12]

# For early career work, when do we stop counting citations. Impacts recognition.
# Earlier runs: RECOGNITION_CUT_OFF = 5, [3, 4, 5, 6, 7, 8, 9], [3, 5]
RECOGNITION_CUT_OFF_LIST = [3, 5, 7, 9, 11, 12]

# Success after 15 years. Impacts when we stop counting citations
SUCCESS_CUTOFF = 15

# Observation windows for dropouts, as (first, last) career-year offsets.
# Candidate windows: whole career (0, 15), early career (0, 3),
# middle career (4, 9), late career (10, 15).
# TODO: for multiple dropout intervals code does not work!!!
CAREER_LENGTH_DROPOUTS_LIST = [(0, 15)]
# CAREER_LENGTH_DROPOUTS = 15

# Number of years of inactivity inside the window that marks an author as a dropout
INACTIVE_TIME_DROPOUTS = 10

# Specify the first and last year we consider in our analysis
# TODO: Should we specify last start year?
START_YEAR = 1970
# LAST_START_YEAR = 2018 - max(CAREER_LENGTH, CAREER_LENGTH_DROPOUTS)

In [3]:
# assert(INACTIVE_TIME_DROPOUTS < CAREER_LENGTH_DROPOUTS), "Time observed for dropouts has to be smaller than the whole window!"

In [4]:
# assert(CAREER_LENGTH >= EARLY_CAREER_LEN), "Early career len too long"

## 1. Load data

In [5]:
authorPublicationData = pd.read_csv('./data/author_publications_2017_asiansAsNone.txt')

authorPublicationData.head()

Unnamed: 0,author,year,pub_id
0,graeme earl,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821
1,gareth beale,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821
2,m. nasser,2009,001c8744-73c4-4b04-9364-22d31a10dbf1
3,faisal zaman,2009,001c8744-73c4-4b04-9364-22d31a10dbf1
4,altaf hossain,2009,001c8744-73c4-4b04-9364-22d31a10dbf1


In [6]:
# Row/column count before de-duplication
print(authorPublicationData.shape)
# Keep a single row per (author, pub_id) pair; the frame is modified in place.
# Original note: same as de-duplicating on author, pub_id and year
# (presumably because a pub_id carries a single year - TODO confirm).
authorPublicationData.drop_duplicates(subset=['author','pub_id'], inplace=True)
# Row/column count after de-duplication
print(authorPublicationData.shape)

(9471668, 3)
(9467149, 3)


In [7]:
authorPublicationData['pub_id'].nunique()

3078230

In [8]:
authorCitationsData = pd.read_csv('./data/citations_2017_asiansAsNone.txt')
authorCitationsData.head()

Unnamed: 0,id1,id2,year
0,1fcd3d7f-1bb7-4347-914e-e23adb1e1a59,5065f1e8-0e2e-4e20-9866-2b5067d321dc,1990
1,1fcd3d7f-1bb7-4347-914e-e23adb1e1a59,cf862fd8-9204-4e44-9639-2d067c247539,1990
2,1fcd3d7f-1bb7-4347-914e-e23adb1e1a59,f19127f8-3d3a-4a9e-9580-c1d3e1e05141,1990
3,03b1d66a-b1fc-4d99-9904-9405695cc2ae,09991de0-c00f-49cf-a88a-6515943b0843,1994
4,03b1d66a-b1fc-4d99-9904-9405695cc2ae,156173f6-35eb-44f6-bc75-7c205153b6f0,1994


In [9]:
print(authorCitationsData.shape)
authorCitationsData.drop_duplicates(inplace=True)
print(authorCitationsData.shape)

(25161834, 3)
(25161834, 3)


In [10]:
print('Authors#      - ',authorPublicationData['author'].nunique())
print('Years#        - ',authorPublicationData['year'].nunique())
print('Publications# - ',authorPublicationData['pub_id'].nunique())

Authors#      -  1704919
Years#        -  83
Publications# -  3078230


In [97]:
# venue data
publication_venues_rank = pd.read_csv('derived-data/publication-venues-rank.csv')

## 2. Career length and dropouts

In [11]:
groupByAuthor = authorPublicationData.groupby(['author'])

groupByAuthorMinYearData = groupByAuthor['year'].min()
groupByAuthorMaxYearData = groupByAuthor['year'].max()
groupByAuthorCountPublicationsData = groupByAuthor['pub_id'].count()

In [12]:
# Assemble one row per author from the three per-author aggregates
# (first publication year, last publication year, number of publications).
authorGroupedData = pd.DataFrame({
    'start_year': groupByAuthorMinYearData,
    'end_year': groupByAuthorMaxYearData,
    'total_num_pub': groupByAuthorCountPublicationsData,
}).reset_index()
print('Total rows -                ', authorGroupedData.shape)

# Filtering on START_YEAR is currently disabled at this stage:
# authorGroupedData = authorGroupedData[authorGroupedData["start_year"] >= START_YEAR]
# print('After removing all < 1970 - ', authorGroupedData.shape)

# Sanity clean-up: fully duplicated rows first, then rows with any missing value.
authorGroupedData = authorGroupedData.drop_duplicates()
print('After removing duplicates - ', authorGroupedData.shape)

authorGroupedData = authorGroupedData.dropna(how='any')
print("After droping na -          ", authorGroupedData.shape)

authorGroupedData.head()

Total rows -                 (1704919, 4)
After removing duplicates -  (1704919, 4)
After droping na -           (1704919, 4)


Unnamed: 0,author,start_year,end_year,total_num_pub
0,'maseka lesaoana,2001,2015,2
1,(max) zong-ming cheng,2009,2011,2
2,(zhou) bryan bai,2011,2012,2
3,a aart blokhuis,1992,2005,2
4,a ahrabian,2017,2017,1


In [13]:
# Adding 1 here to have career length be at least one. So 3 years career means year1, year2, year3.
authorGroupedData["career_length"] = authorGroupedData['end_year'] - authorGroupedData['start_year'] + 1

In [14]:
credible_authors = authorGroupedData

### Label authors that drop out

In [15]:
print(f"Label authors with {INACTIVE_TIME_DROPOUTS} years inacitivity in a {CAREER_LENGTH_DROPOUTS_LIST} years window as dropouts")

Label authors with 10 years inacitivity in a [(0, 15)] years window as dropouts


In [16]:
combined_pubs = authorPublicationData.merge(credible_authors[['author', 'start_year']], on='author', how='inner')

print(combined_pubs.head())
print(combined_pubs.shape)

        author  year                                pub_id  start_year
0  graeme earl  2011  001c58d3-26ad-46b3-ab3a-c1e557d16821        2009
1  graeme earl  2011  6af505cf-727f-47f9-afea-772af2928086        2009
2  graeme earl  2009  3ac1f00f-6830-42e9-b6f9-944f00a71b3c        2009
3  graeme earl  2009  3133714c-f979-4d84-9224-97361cf053ab        2009
4  graeme earl  2014  71d3749b-3e35-461b-86c8-920c42d5ebe8        2009
(9467149, 4)


In [17]:
def list_append(lst, item):
    """Add ``item`` to the end of ``lst`` in place and return that same list.

    Unlike ``list.append`` (which returns ``None``) this can be used inside an
    expression, e.g. within a lambda.
    """
    lst.extend([item])
    return lst

In [None]:
%%time
for start, end in CAREER_LENGTH_DROPOUTS_LIST:
    # Column names produced for this observation window
    absence_col = f'absence_list-{start}-{end}'
    max_col = f'max_absence-{start}-{end}'
    avg_col = f'avg_absence-{start}-{end}'

    # Keep only publications inside the window [start_year + start, start_year + end]
    combined_pubs_grouped = combined_pubs[(combined_pubs.year >= combined_pubs.start_year + start) &
                                          (combined_pubs.year <= combined_pubs.start_year + end)]
    # for every 2 consecutive years the author has published (nxt and prev) find a difference (absence time)
    # we artificially add two values: window start and window end, as limiters of our observation window
    # (for a window starting at offset 0 this adds a 0 gap in the beginning for the first year)
    # Columns are selected with a list ([[...]]): the tuple form ['year', 'start_year']
    # is deprecated and no longer accepted by pandas >= 2.0.
    combined_pubs_grouped = combined_pubs_grouped.groupby('author')[['year', 'start_year']].apply(
        lambda x: [nxt - prev for prev, nxt in zip(sorted(list_append(list(x['year']), x['start_year'].iloc[0] + start)),
                                                   sorted(list_append(list(x['year']), x['start_year'].iloc[0] + end)))])
    combined_pubs_grouped = combined_pubs_grouped.reset_index()
    # the applied result is an unnamed Series, so reset_index() labels it 0
    combined_pubs_grouped.rename({0: absence_col}, inplace=True, axis='columns')
    combined_pubs_grouped[max_col] = combined_pubs_grouped[absence_col].apply(max)
    combined_pubs_grouped[avg_col] = combined_pubs_grouped[absence_col].apply(np.mean)

    credible_authors = credible_authors.merge(combined_pubs_grouped[['author', max_col, avg_col]],
                                              on='author', how='left')
    # Authors with no publication inside the window were absent for the whole window
    credible_authors[max_col] = credible_authors[max_col].fillna(end - start + 1)
    credible_authors[avg_col] = credible_authors[avg_col].fillna(end - start + 1)

    # TODO: Should i also add the start year into the calculation? Now i only have end year included

In [None]:
# display(combined_pubs_grouped.head())
# display(combined_pubs_grouped.shape)

In [None]:
# An author is labelled a dropout when the longest publication gap in the (0, 15)
# window is not below INACTIVE_TIME_DROPOUTS years. `~(x < t)` is the vectorised
# equivalent of the previous per-row `False if x < 10 else True`.
# NOTE: the column names still hard-code the 10-year threshold and the (0, 15) window.
credible_authors['dropped_after_10'] = ~(credible_authors['max_absence-0-15'] < INACTIVE_TIME_DROPOUTS)

In [None]:
credible_authors['max_absence-0-15'].value_counts(dropna=False)

In [None]:
credible_authors.shape

In [None]:
credible_authors['dropped_after_10'].value_counts()

In [None]:
credible_authors.columns

### Gender

In [18]:
gender = pd.read_csv('./data/name_gender_2017_asiansAsNone_nodup.txt')
credible_authors = credible_authors.merge(gender, left_on='author', right_on='name', how='left')
credible_authors.drop('name', axis=1, inplace=True)

credible_authors.gender.value_counts()

m       796975
none    652151
f       255793
Name: gender, dtype: int64

### Save filtered data about authors, and cleaned publications

In [19]:
credible_authors[credible_authors.start_year >= START_YEAR].to_csv('derived-data/authors-scientific.csv', index=False, encoding='utf-8')
credible_authors.head()

Unnamed: 0,author,start_year,end_year,total_num_pub,career_length,gender
0,'maseka lesaoana,2001,2015,2,15,none
1,(max) zong-ming cheng,2009,2011,2,3,none
2,(zhou) bryan bai,2011,2012,2,2,none
3,a aart blokhuis,1992,2005,2,14,none
4,a ahrabian,2017,2017,1,1,none


In [20]:
authorPublicationData.to_csv('derived-data/author-publications.csv', index=False)

In [96]:
authorPublicationData.shape

(9467149, 3)

## 3. Generate a new citation network

### Generate Author->Paper network

In [21]:
# We need data about how many times an author has been cited
# For every authors publication, i merge all citations
# Doesnt contain uncited papers
final_citation_count_from_ids = authorPublicationData.merge(authorCitationsData, left_on='pub_id', 
                                                            right_on='id2', how='inner', suffixes=('_pub', '_cit'))

In [22]:
print(final_citation_count_from_ids.shape)

(75912535, 6)


In [23]:
final_citation_count_from_ids.drop_duplicates(inplace=True)

In [24]:
print(final_citation_count_from_ids.shape)

(75912535, 6)


#### Remove errors in citation data (years published vs years cited)

In [25]:
# Published before cited - NORMAL
print(final_citation_count_from_ids.shape)
num_normal = final_citation_count_from_ids[final_citation_count_from_ids.year_pub <= final_citation_count_from_ids.year_cit].shape
print(num_normal)

(75912535, 6)
(75392625, 6)


In [26]:
# Published after cited - WRONG
num_wrong = final_citation_count_from_ids[final_citation_count_from_ids.year_pub > final_citation_count_from_ids.year_cit].shape
print(num_wrong)

(519910, 6)


In [27]:
print("Percentage of citations to be removed: ", num_wrong[0]*100/(num_normal[0]+num_wrong[0]))
print("Less than one percent")

Percentage of citations to be removed:  0.6848803033649186
Less than one percent


In [28]:
cit_wrong_df = final_citation_count_from_ids[final_citation_count_from_ids.year_pub > final_citation_count_from_ids.year_cit]

In [29]:
cit_wrong = final_citation_count_from_ids[final_citation_count_from_ids.year_pub > final_citation_count_from_ids.year_cit].index

In [30]:
final_citation_count_from_ids.drop(cit_wrong, inplace=True)

assert num_normal[0] == final_citation_count_from_ids.shape[0], "The number of citations doesnt match"

In [31]:
final_citation_count_from_ids.columns

Index(['author', 'year_pub', 'pub_id', 'id1', 'id2', 'year_cit'], dtype='object')

#### Save

In [32]:
# final_citation_count_from_ids.to_csv('./data/authors_cited_by_papers_2017_asiansAsNone_by_daniel.txt',
#                                      columns=['author', 'year_pub', 'pub_id', 'id1', 'year_cit'], index=False)

final_citation_count_from_ids[['author', 'id1', 'id2', 'year_cit']].drop_duplicates().to_csv('derived-data/author-paper-citations-cleaned.csv', 
                                                                                   index=False)

# final_citation_count_from_ids.drop_duplicates(subset=['author_cited', 'pub_id_cited', 'pub_id_citing', 'author_citing'],
#                                               inplace=True)

In [33]:
# drop duplicates on id1,id2 because we only care about paper->paper citations
paper_citation_count = final_citation_count_from_ids.drop_duplicates(subset=['id1', 'id2']).groupby('id2')['id1'].count()
paper_citation_count.to_csv('derived-data/paper-citation-count.csv')

It's important to keep using this file for citations, as it has the bad entries removed.

### Group citations over authors and years

In [None]:
citations_year_auth = final_citation_count_from_ids.groupby(['author', 'year_cit'])['id1'].count()


In [None]:
citations_year_auth.head()

In [None]:
citations_year_auth = citations_year_auth.reset_index()
citations_year_auth = citations_year_auth.rename(columns={'id1':'cit_count'})

# citations_year_auth[['author', 'year_cit', 'cit_count']].to_csv('derived-data/authors-perYear-citations-atleast-'+str(CAREER_LENGTH)+'-year.csv', index=False)
citations_year_auth[['author', 'year_cit', 'cit_count']].to_csv('derived-data/authors-perYear-citations.csv', index=False)

In [None]:
citations_year_auth = citations_year_auth.groupby(['author', 'year_cit'])['cit_count'].sum()

## Early career analysis

In [36]:
combined = final_citation_count_from_ids.merge(credible_authors[['author', 'start_year']], on='author', how='inner')
# TODO Is this 'inner' here good?

In [85]:
early_career_publications = authorPublicationData.merge(credible_authors[['author', 'start_year']], on='author', how='left')
# TODO Is this 'left' here good?

In [101]:
print(early_career_publications.author.nunique())
print(early_career_publications.pub_id.nunique())
early_career_publications.shape

1704919
3078230


(9467149, 4)

In [42]:
print(combined.author.nunique())
print(combined.pub_id.nunique())

1151974
1977741


In [39]:
combined.shape

(75392625, 7)

In [None]:
def get_start_year(author):
    """Return the earliest publication year of ``author``.

    The lookup is done against the notebook-level ``authorPublicationData``
    frame by exact match on its ``author`` column.
    """
    is_author = authorPublicationData.author == author
    return authorPublicationData.loc[is_author, 'year'].min()

In [None]:
get_start_year("karen sparck jones")

In [37]:
early_career_publications[early_career_publications.start_year.isna()]

Unnamed: 0,author,year,pub_id,start_year


In [None]:
authorPublicationData.head()

In [None]:
combined.shape

In [None]:
final_citation_count_from_ids.shape
# 75392625

In [None]:
combined.head()

In [None]:
combined.drop_duplicates(subset=['author', 'id1', 'id2'], inplace=True)

### Venues

In [104]:
early_career_venues = early_career_publications.merge(publication_venues_rank[
    ['pub_id', 'h5_index', 'ranking', 'deciles', 'quantiles']], on='pub_id', how='inner')

In [None]:
early_career_venues.columns

In [106]:
# Venue features are currently computed for a single early-career length only;
# the loop over EARLY_CAREER_LEN_LIST is disabled.
# for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
EARLY_CAREER = 3
# Publications from the first EARLY_CAREER career years (start year included)
early_career_venues_ec = early_career_venues[early_career_venues.year < early_career_venues.start_year + EARLY_CAREER]
# Per author: highest h5_index and lowest ranking / decile / quantile among those venues
early_career_venues_gr = early_career_venues_ec.groupby('author').agg({
    'h5_index': 'max',
    'ranking': 'min',
    'deciles': 'min',
    'quantiles': 'min'}).rename(columns={
    'h5_index': f'h5_index_{EARLY_CAREER}', 
    'ranking': f'ranking_{EARLY_CAREER}',
    'deciles': f'deciles_{EARLY_CAREER}',
    'quantiles': f'quantiles_{EARLY_CAREER}'})
early_career_venues_gr = early_career_venues_gr.reset_index()
# Left merge keeps authors without any matched venue; their venue columns are NaN here
credible_authors = credible_authors.merge(early_career_venues_gr, on='author', how='left')
# NOTE(review): missing values are filled with 0 for all four columns. For ranking /
# deciles / quantiles the aggregation takes the minimum, so presumably lower is better
# and 0 would rank authors without venue data ahead of everyone else - confirm intent.
for col in [f'h5_index_{EARLY_CAREER}', f'ranking_{EARLY_CAREER}', f'deciles_{EARLY_CAREER}', f'quantiles_{EARLY_CAREER}']:
    credible_authors[col] = credible_authors[col].fillna(0)

Unnamed: 0_level_0,h5_index,ranking,deciles,quantiles
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'maseka lesaoana,78.0,77.0,1.0,1.0
(max) zong-ming cheng,165.0,22.0,1.0,1.0
(zhou) bryan bai,28.0,965.0,5.0,2.0
a aart blokhuis,48.0,62.5,1.0,1.0
a el sharkawi,1.0,1541.0,7.0,3.0
a mete,124.0,54.5,1.0,1.0
a raghavendra,66.0,223.5,2.0,1.0
a stewart,30.0,211.0,2.0,1.0
a swiercz,30.0,891.5,5.0,2.0
a'ang subiyakto,18.0,1018.5,5.0,2.0


### Early degree

In [None]:
# Unique (author, paper) authorship pairs. `combined` holds one row per citation,
# so joining against it directly repeats every co-authorship once per citation of
# the paper; only the distinct pairs matter for the degree. Loop-invariant, so
# computed once up front.
# NOTE(review): `combined` is built from the citation table, so co-authors on
# papers that were never cited are not counted here - confirm this is intended.
authorship_pairs = combined[['author', 'pub_id']].drop_duplicates()

for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
    degree_col = f"early_career_degree_{EARLY_CAREER}"

    # (author, paper) pairs published in the first EARLY_CAREER career years
    combined_early_degree = combined[(combined.year_pub < combined.start_year + EARLY_CAREER)]
    combined_early_degree = combined_early_degree[['author', 'pub_id']].drop_duplicates()

    # Pair every early-career author (author_x) with all authors of the same paper (author_y)
    combined_early_degree = combined_early_degree.merge(authorship_pairs, on='pub_id')

    # Drop self-pairs and count each distinct co-author once
    combined_early_degree = combined_early_degree[combined_early_degree.author_x != combined_early_degree.author_y]
    combined_early_degree = combined_early_degree.drop_duplicates(subset=['author_x', 'author_y'])

    combined_early_degree = combined_early_degree.groupby('author_x')['author_y'].count().reset_index()

    combined_early_degree.rename({"author_x": "author", "author_y": degree_col},
                                 axis='columns', inplace=True)

    # Authors without any early co-author get a degree of 0
    credible_authors = credible_authors.merge(combined_early_degree, on='author', how='left')
    credible_authors[degree_col] = credible_authors[degree_col].fillna(0)

In [None]:
combined_early_degree.sample(10)

### Early quality

In [None]:
for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
    quality_col = f'early_career_qual_{EARLY_CAREER}'

    # Citations to papers published in the first EARLY_CAREER years,
    # received before the SUCCESS_CUTOFF career year.
    published_early = combined.year_pub < combined.start_year + EARLY_CAREER
    cited_before_cutoff = combined.year_cit < combined.start_year + SUCCESS_CUTOFF

    early_career_quality = (
        combined[published_early & cited_before_cutoff]
        .groupby('author')['id1']
        .count()
        .rename(quality_col)
        .reset_index()
    )

    # Authors without any matching citation get 0
    credible_authors = credible_authors.merge(early_career_quality, on='author', how='left')
    credible_authors[quality_col] = credible_authors[quality_col].fillna(0)

### Early recognition

In [None]:
for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
    for RECOGNITION_CUT in RECOGNITION_CUT_OFF_LIST:
        # Only combinations where the recognition cut-off equals the early-career length are computed
        if RECOGNITION_CUT == EARLY_CAREER:
            col_name = f'early_career_recognition_EC{EARLY_CAREER}_RC{RECOGNITION_CUT}'

            # Citations received before the cut-off by papers published in early career
            published_early = combined.year_pub < combined.start_year + EARLY_CAREER
            cited_early = combined.year_cit < combined.start_year + RECOGNITION_CUT

            early_career_recognition = (
                combined[published_early & cited_early]
                .groupby('author')['id1']
                .count()
                .rename(col_name)
                .reset_index()
            )

            # Authors without any such citation get 0
            credible_authors = credible_authors.merge(early_career_recognition, on='author', how='left')
            credible_authors[col_name] = credible_authors[col_name].fillna(0)

In [None]:
credible_authors.columns

### Final success

In [None]:
combined_succ_after_15y = combined[combined.year_cit < combined.start_year + SUCCESS_CUTOFF]

In [None]:
# Number of citation rows per author within the success window
succ_after_15y = (
    combined_succ_after_15y
    .groupby('author')['id1']
    .count()
    .rename('succ_after_15y')
    .reset_index()
)

# Attach to the author table; authors without citations in the window get 0
credible_authors = credible_authors.merge(succ_after_15y, on='author', how='left')
credible_authors['succ_after_15y'] = credible_authors['succ_after_15y'].fillna(0)

### H index

In [None]:
def h_index(citations):
    """Compute the h-index of a collection of per-paper citation counts.

    The h-index is the largest ``h`` such that at least ``h`` papers have
    ``h`` or more citations each.

    Parameters
    ----------
    citations : iterable of numbers
        Citation count of every paper (list, numpy array, pandas Series, ...).

    Returns
    -------
    int
        The h-index; 0 for empty input or when no paper has a citation.
    """
    ranked = sorted(citations, reverse=True)
    h_ind = 0
    # Walk the papers from most to least cited; the h-index is the last rank
    # whose paper still has at least `rank` citations. No special case for a
    # single paper: one paper with 0 citations has h-index 0, not 1.
    for rank, cites in enumerate(ranked, start=1):
        if cites < rank:
            break
        h_ind = rank
    return h_ind

In [None]:
for param in [*EARLY_CAREER_LEN_LIST, SUCCESS_CUTOFF]:
    h_col = f'h-index_{param}'

    # Citations received strictly before career year `param`
    cited_in_window = combined[combined.year_cit < combined.start_year + param]

    # Citation count per (author, paper) ...
    per_paper_counts = cited_in_window.groupby(['author', 'pub_id'])['id1'].count().reset_index()

    # ... reduced to one h-index per author
    combined_h_index = (
        per_paper_counts
        .groupby('author')['id1']
        .apply(lambda counts: h_index(counts.values))
        .rename(h_col)
    )

    # Authors without citations in the window get an h-index of 0
    credible_authors = credible_authors.merge(combined_h_index.reset_index(), on='author', how='left')
    credible_authors[h_col] = credible_authors[h_col].fillna(0)

In [None]:
# TODO: test h-index

In [92]:
%%time
for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
    prod_col = f'early_career_prod_{EARLY_CAREER}'

    # Early career covers career years start_year .. start_year + EARLY_CAREER - 1
    # (a 3-year career means year1, year2, year3), hence the strict `<`.
    # The previous `<=` counted one extra year and disagreed with the venue,
    # degree, quality and recognition filters, which all use `<`.
    in_early_career = early_career_publications.year < early_career_publications.start_year + EARLY_CAREER
    early_career_publications_reduced = early_career_publications[in_early_career]

    # Number of distinct papers per author in that period
    early_career_publications_ = (
        early_career_publications_reduced
        .groupby('author')
        .agg({'pub_id': 'nunique'})
        .reset_index()
        .rename({'pub_id': prod_col}, axis='columns')
    )
    credible_authors = credible_authors.merge(early_career_publications_, on='author', how='left')

3
5
7
9
11
12
CPU times: user 4min 37s, sys: 20.4 s, total: 4min 57s
Wall time: 2min 8s


### Early Coauthor max h-index

In [None]:
# For each early-career (EC) paper, calculate the h-index of all its authors.
# This requires extra work:
# we want the h-index of each coauthor at the time the paper was published,
# so we need an extra lookup table in which we store
# all papers - authors - h-index at the time.
#

# final_citation_count_from_ids - we merge pub data with cit data, but "inner"
# this means we will not find papers with 0 citations in this df
# these papers dont impact the h-index, so this is okay

In [None]:
def author_h_index_in_year_X(authors, year_x):
    """Build a table with the h-index each of ``authors`` had before ``year_x``.

    Only citations from the notebook-level ``combined`` frame with
    ``year_cit < year_x`` are taken into account.

    Returns a DataFrame with the columns ``author``, ``h-index`` and
    ``year_pub`` (the latter set to ``year_x`` on every row). Authors without
    any citation before ``year_x`` do not appear in the result.
    """
    cited_before = combined.year_cit < year_x
    is_requested = combined.author.isin(authors)

    # citation count per (author, paper)
    per_paper = (
        combined[cited_before & is_requested]
        .groupby(['author', 'pub_id'])
        .agg({'id1': 'count'})
        .reset_index()
    )

    # one h-index per author, computed from that author's per-paper counts
    per_author = per_paper.groupby('author').agg({'id1': h_index}).reset_index()
    per_author['year_pub'] = year_x
    return per_author.rename({'id1': 'h-index'}, axis='columns')

In [None]:
def author_h_index(author, year_x):
    """Return the h-index of a single ``author`` from citations made before ``year_x``.

    Reads the notebook-level ``combined`` frame and counts, per paper of the
    author, the citation rows with ``year_cit < year_x``.
    """
    selected = (combined.year_cit < year_x) & (combined.author == author)
    citations_count_list = combined[selected].groupby(['pub_id'])['id1'].count().values
    return h_index(citations_count_list)

In [None]:
%%time
papers_authors = combined[['author', 'year_pub']].drop_duplicates(subset=['author', 'year_pub'])

In [None]:
%%time
# Lookup table: h-index of every author in every year in which they published.
# The per-year frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every iteration.
yearly_hind_frames = []
for year_x in papers_authors.year_pub.unique():
    authors = papers_authors[papers_authors.year_pub == year_x].author.values
    author_hind_at_year = author_h_index_in_year_X(authors, year_x)
    # skip years without any prior citation so empty frames do not affect the dtypes
    if not author_hind_at_year.empty:
        yearly_hind_frames.append(author_hind_at_year)

if yearly_hind_frames:
    all_authors_hind = pd.concat(yearly_hind_frames, ignore_index=True)
else:
    # nothing to concatenate: keep an empty, correctly shaped table
    all_authors_hind = pd.DataFrame(columns=['author', 'h-index', 'year_pub'])
    all_authors_hind['year_pub'] = all_authors_hind['year_pub'].astype('int64')

In [None]:
papers_authors = papers_authors.merge(all_authors_hind, how='left')

In [None]:
papers_authors['h-index'] = papers_authors['h-index'].fillna(0)

In [None]:
# For every early-career length: highest h-index among an author's early co-authors,
# stored as early_career_coauthor_max_hindex_<EARLY_CAREER> (0 when there is none).
for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
    # citation rows of papers published in the first EARLY_CAREER career years
    combined_early_coauthor = combined[(combined.year_pub < combined.start_year + EARLY_CAREER)]

    # one row per (author, paper)
    combined_early_coauthor = combined_early_coauthor.drop_duplicates(subset=['author', 'pub_id'])

    combined_early_coauthor = combined_early_coauthor[['author', 'pub_id']]

    # Pair each early-career author (author_x) with every author row of the same paper (author_y).
    # NOTE(review): an earlier comment described this as a self-merge that removes co-authors
    # outside their early career, but the join is against the full `combined` frame, so
    # co-authors are kept regardless of their career stage - confirm which is intended.
    # NOTE(review): `combined` appears to hold one row per citation, so this join repeats
    # each co-authorship once per citation before the de-duplication below.
    combined_early_coauthor = combined_early_coauthor.merge(combined, on='pub_id')

    # drop self-pairs, then keep the first row of every (author, co-author) pair
    combined_early_coauthor = combined_early_coauthor[combined_early_coauthor.author_x != combined_early_coauthor.author_y]
    combined_early_coauthor = combined_early_coauthor.drop_duplicates(subset=['author_x', 'author_y'])

    # Previous approach merged the co-author's final h-index_15 instead:
    # combined_early_coauthor = combined_early_coauthor.merge(credible_authors[['author', 'h-index_15']], left_on='author_y', right_on='author')

    # Look up the co-author's h-index for the publication year of the retained row.
    # NOTE(review): because pairs were de-duplicated above, only one shared paper (and hence
    # one year) per co-author pair is looked up - verify this is the intended "max".
    combined_early_coauthor = combined_early_coauthor.merge(papers_authors, left_on=['author_y', 'year_pub'], 
                                                            right_on=['author', 'year_pub'])
    #####

    combined_early_coauthor = combined_early_coauthor.groupby('author_x')['h-index'].max().reset_index()

    combined_early_coauthor.rename({"author_x":"author", "h-index": f"early_career_coauthor_max_hindex_{EARLY_CAREER}"}, 
                                 axis='columns', inplace=True)

    combined_early_coauthor = combined_early_coauthor[['author', f"early_career_coauthor_max_hindex_{EARLY_CAREER}"]]

    # authors without a matched early co-author get 0
    credible_authors = credible_authors.merge(combined_early_coauthor, on='author', how='left')
    credible_authors[f"early_career_coauthor_max_hindex_{EARLY_CAREER}"] = credible_authors[f"early_career_coauthor_max_hindex_{EARLY_CAREER}"].fillna(0)

In [None]:
credible_authors.columns

### Early Coauthor max citations

In [None]:
# for EARLY_CAREER in EARLY_CAREER_LEN_LIST:
#     combined_early_coauthor = combined[(combined.year_pub < combined.start_year + EARLY_CAREER)]

#     combined_early_coauthor = combined_early_coauthor.drop_duplicates(subset=['author', 'pub_id'])

#     combined_early_coauthor = combined_early_coauthor[['author', 'pub_id']]

#     combined_early_coauthor = combined_early_coauthor.merge(combined, on='pub_id')

#     combined_early_coauthor = combined_early_coauthor[combined_early_coauthor.author_x != combined_early_coauthor.author_y]
#     combined_early_coauthor = combined_early_coauthor.drop_duplicates(subset=['author_x', 'author_y'])

#     combined_early_coauthor = combined_early_coauthor.merge(credible_authors[['author', 'succ_after_15y']], left_on='author_y', right_on='author')
#     combined_early_coauthor = combined_early_coauthor.groupby('author_x')['succ_after_15y'].max().reset_index()

#     combined_early_coauthor.rename({"author_x":"author", "succ_after_15y": f"early_career_coauthor_max_cit_{EARLY_CAREER}"}, 
#                                  axis='columns', inplace=True)

#     combined_early_coauthor = combined_early_coauthor[['author', f"early_career_coauthor_max_cit_{EARLY_CAREER}"]]

#     credible_authors = credible_authors.merge(combined_early_coauthor, on='author', how='left')
#     credible_authors[f"early_career_coauthor_max_cit_{EARLY_CAREER}"] = credible_authors[f"early_career_coauthor_max_cit_{EARLY_CAREER}"].fillna(0)

In [None]:
# test - sample 50 authors and calculate data by hand, compare to sampled values to see if the calculation is good
# TODO This test is outdated! Make a new test to reflect new way of calculation

In [None]:
# combined_test = combined[(combined.year_pub < combined.start_year + 3)]

# test_authors = credible_authors.sample(50)['author'].values

# i = 1

# for test_author in test_authors:

#     test_author_data = credible_authors[credible_authors.author == test_author][
#         ['author','early_career_degree_3', 'h-index_15', 'early_career_coauthor_max_hindex_3', 'early_career_coauthor_max_cit_3']]

#     papers = combined_test[combined_test.author == test_author]['pub_id'].unique()

#     authors = authorPublicationData[authorPublicationData.pub_id.isin(papers)].author
    
#     coauth_data = credible_authors[credible_authors.author.isin(authors)][['author', 'h-index_15', 'succ_after_15y']]
    
#     coauth_data = coauth_data[~coauth_data.author.isin(test_author_data['author'])]
    
#     assert test_author_data['early_career_degree_3'].item() == coauth_data.shape[0], \
#     f'Degree problem, expected {coauth_data.shape[0]}, got {test_author_data["early_career_degree_3"].item()}'
    
#     assert test_author_data['early_career_coauthor_max_hindex_3'].item() == max(coauth_data["h-index_15"], default=0), \
#     f'H index problem, expected {max(coauth_data["h-index_15"], default=0)}, got {test_author_data["early_career_coauthor_max_hindex_3"].item()}'
    
#     assert test_author_data['early_career_coauthor_max_cit_3'].item() == max(coauth_data["succ_after_15y"], default=0), \
#     f'15y success problem, expected {max(coauth_data["succ_after_15y"], default=0)}, got {test_author_data["early_career_coauthor_max_cit_3"].item()}'
    
#     if i%10 == 0: 
#         print(f"Passed test {i}")
#     i += 1

In [None]:
# drop
def drop_list_cols(drop_list):
    """Remove the columns named in ``drop_list`` from the notebook-level
    ``credible_authors`` frame. The frame is modified in place; nothing is returned.
    """
    credible_authors.drop(columns=drop_list, inplace=True)

## Save

In [None]:
# credible_authors.to_csv('derived-data/authors-scientific-atleast-'+str(CAREER_LENGTH)+'-year-extended.csv',
#                     index=False, encoding='utf-8')
credible_authors.to_csv('derived-data/authors-scientific-extended.csv',
                    index=False, encoding='utf-8')

In [None]:
credible_authors.columns