In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import timeit
import time
import seaborn as sns

#these are user defined modules

## Consider all authors
#### Filtering can be done on the final data for credible authors

In [2]:
# Authors with Career length 
CAREER_LENGTH_LIST = [0,5,10,15,20,25]
CAREER_LENGTH = 0

authorStartEndCareerData = pd.read_csv('./data/authors-scientific-atleast-'+str(CAREER_LENGTH)+'-year.csv')
print(authorStartEndCareerData.head())
print(authorStartEndCareerData.shape)

# print('Filter to reduce data for estimate time')
# authorStartEndCareerData = authorStartEndCareerData[authorStartEndCareerData['career_length'] >= 45]
# print(authorStartEndCareerData.head())
# print(authorStartEndCareerData.shape)

                  author  start_year  end_year  total_num_pub  career_length
0       'maseka lesaoana        2001      2015              2             14
1  (max) zong-ming cheng        2009      2011              2              2
2       (zhou) bryan bai        2011      2012              2              1
3                  a lun        2010      2010              1              0
4             a min tjoa        1979      2015            193             36
(1708185, 5)


In [3]:
finalData = pd.DataFrame(columns=['author','start_year'], data=authorStartEndCareerData[['author','start_year']])
# career length is important because it will help us to filter credible authors
# Career length is incremented by 1 because the previous data was calculated by just subtracting start_year from end_year.
# so, if author finishes his career on the same year he/she started then it will be 0 (instead value 1 is quite appropriate)
finalData['career_length']= authorStartEndCareerData['career_length'] + 1

print(finalData.head())
print(finalData[finalData['author'] == "'maseka lesaoana"])
print(finalData.shape)

                  author  start_year  career_length
0       'maseka lesaoana        2001             15
1  (max) zong-ming cheng        2009              3
2       (zhou) bryan bai        2011              2
3                  a lun        2010              1
4             a min tjoa        1979             37
             author  start_year  career_length
0  'maseka lesaoana        2001             15
(1708185, 3)


In [4]:
genderData = pd.read_csv('./data/name_gender_1970_2016_noAsians.csv')
genderData.rename(columns={'name':'author'}, inplace=True)
print(genderData.head())
print(genderData.shape)

             author gender
0    nima mesgarani      m
1    james j. abbas      m
2  eberhart zrenner      m
3    emery n. brown      m
4        ilya rybak      m
(5793845, 2)


In [5]:
finalData = pd.merge(finalData, genderData, how="left", on=["author"])
# for authors whose gender not found, replace it with none
finalData['gender'] = finalData['gender'].fillna('none')
print(finalData.head())

print("finalData with duplicates: %s", finalData.shape[0])
finalData = finalData.drop_duplicates()
print("finalData no duplicates: %s", finalData.shape[0])
print('Gender  - ',finalData['gender'].unique())

finalData = finalData.set_index(['author'])
print(finalData.head())
print(finalData.shape)

                  author  start_year  career_length gender
0       'maseka lesaoana        2001             15   none
1       'maseka lesaoana        2001             15   none
2  (max) zong-ming cheng        2009              3   none
3       (zhou) bryan bai        2011              2   none
4                  a lun        2010              1   none
('finalData with duplicates: %s', 6372557)
('finalData no duplicates: %s', 1708185)
('Gender  - ', array(['none', 'f', 'm'], dtype=object))
                       start_year  career_length gender
author                                                 
'maseka lesaoana             2001             15   none
(max) zong-ming cheng        2009              3   none
(zhou) bryan bai             2011              2   none
a lun                        2010              1   none
a min tjoa                   1979             37   none
(1708185, 3)


In [6]:
publicationData = pd.read_csv('./data/author_publications_1970_2016_asiansAsNone.txt')
print(publicationData.head())
print(publicationData.shape)

print('Filter to reduce data for estimate time')
publicationData = publicationData[publicationData['author'].isin(finalData.index.values)]
print(publicationData.head())
print(publicationData.shape)

             author  year                    pub_id
0  kim l. blackwell  2014  55503da645ce0a409eb273e8
1    nima mesgarani  2014  55503da645ce0a409eb273e9
2    james j. abbas  2014  55503da645ce0a409eb273ea
3  eberhart zrenner  2014  55503da645ce0a409eb273ec
4    jonathan rubin  2014  55503da645ce0a409eb273e0
(9078240, 3)
Filter to reduce data for estimate time
             author  year                    pub_id
0  kim l. blackwell  2014  55503da645ce0a409eb273e8
1    nima mesgarani  2014  55503da645ce0a409eb273e9
2    james j. abbas  2014  55503da645ce0a409eb273ea
3  eberhart zrenner  2014  55503da645ce0a409eb273ec
4    jonathan rubin  2014  55503da645ce0a409eb273e0
(9078240, 3)


In [7]:
citationData = pd.read_csv('./data/citations_1970_2016_asiansAsNone.txt')
print(citationData.head())
# publication with id1 cited the publication with id2 in the year 'year'

# We cannot use the author citation data which is in 'authors_cite_2016_asiansAsNone.txt' file because we will not know for 
# what paper the author has cited other author in a certain year
print(citationData.shape)


                        id1                       id2  year
0  53e9aa9cb7602d97034143d8  53e9aebdb7602d97038e692d  1994
1  53e99ca1b7602d9702555df6  53e9be3db7602d9704afc6b4  2006
2  53e99ca1b7602d9702555df6  53e9b9a0b7602d970458eee4  2006
3  53e99ca1b7602d9702555df6  53e9bc05b7602d9704866048  2006
4  53e9b166b7602d9703bebb75  53e9bba1b7602d97047ebb5d  2006
(7849398, 3)


# Merge Publication and Citation data

In [8]:
publicationWithCitationData = pd.merge(left=publicationData, right=citationData[['id2', 'year']], how='left', \
                                       left_on='pub_id', right_on='id2')
print(publicationWithCitationData.head())
publicationWithCitationData.rename(columns={'year_x':'pub_year','year_y':'cited_year'}, inplace=True)
#cmpData = publicationWithCitationData['pub_id'] == publicationWithCitationData['id2']
#print(cmpData.unique()) 
## both the columns 'pub_id' and 'id2' represent the same and so can be removed
publicationWithCitationData = publicationWithCitationData.drop(['id2'],axis=1)
print(publicationWithCitationData.head())

print(publicationWithCitationData.shape)

             author  year_x                    pub_id  id2  year_y
0  kim l. blackwell    2014  55503da645ce0a409eb273e8  NaN     NaN
1    nima mesgarani    2014  55503da645ce0a409eb273e9  NaN     NaN
2    james j. abbas    2014  55503da645ce0a409eb273ea  NaN     NaN
3  eberhart zrenner    2014  55503da645ce0a409eb273ec  NaN     NaN
4    jonathan rubin    2014  55503da645ce0a409eb273e0  NaN     NaN
             author  pub_year                    pub_id  cited_year
0  kim l. blackwell      2014  55503da645ce0a409eb273e8         NaN
1    nima mesgarani      2014  55503da645ce0a409eb273e9         NaN
2    james j. abbas      2014  55503da645ce0a409eb273ea         NaN
3  eberhart zrenner      2014  55503da645ce0a409eb273ec         NaN
4    jonathan rubin      2014  55503da645ce0a409eb273e0         NaN
(27465991, 4)


In [9]:
publicationWithCitationData[publicationWithCitationData['author'] == 'abraham berman']

Unnamed: 0,author,pub_year,pub_id,cited_year
5971237,abraham berman,2003,53e9aa66b7602d97033da2f6,
11790216,abraham berman,1988,53e9ac12b7602d97035d67d0,2010.0
11790217,abraham berman,1988,53e9ac12b7602d97035d67d0,2002.0
15631866,abraham berman,1993,53e9ac54b7602d9703625cc2,2012.0
18181342,abraham berman,1971,573695d36e3b12023e4eb4d2,
18804844,abraham berman,2009,53e9ae3cb7602d9703851a3d,
19568527,abraham berman,2001,53e99bdcb7602d970248659e,2008.0
19568528,abraham berman,2001,53e99bdcb7602d970248659e,2005.0
19568529,abraham berman,2001,53e99bdcb7602d970248659e,2014.0
19568530,abraham berman,2001,53e99bdcb7602d970248659e,2014.0


####  Data Extraction process

In [10]:
#print('This should get printed first. What is wrong? Why is not printing????????')

i = 0

CAREER_LENGTH = 15
# 3 consequtive years. Same year when the career starts (it is representd by career age 1) and next two years
#Arranged in desending so that same data can be filtered in the sucessive iteration - which could be a potential time saver
#PUBLICATION_RECORDING_YEAR_LIST = [3, 2, 1]
#CITATION_RECORDING_YEAR_LIST = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] 

PUBLICATION_RECORDING_YEAR_LIST = [3]
CITATION_RECORDING_YEAR_LIST = [14, 1] 

# the total citations that has been obtained for top 3 papers is derived
NO_OF_TOP_CITATIONS = 3

'''
column_names = ['author', 
                'npub_yr1','ncit_yr1_pub_yr1','ncit_yr2_pub_yr1', 'ncit_yr3_pub_yr1', 'ncit_yr15_pub_yr1',
                'npub_yr2','ncit_yr1_pub_yr2','ncit_yr2_pub_yr2', 'ncit_yr3_pub_yr2', 'ncit_yr15_pub_yr2',
                'npub_yr3','ncit_yr1_pub_yr3','ncit_yr2_pub_yr3', 'ncit_yr3_pub_yr3', 'ncit_yr15_pub_yr3',
                'npub_yr15','ncit_yr15_pub_yr15', 'ncit_yr15_top_3_pub'
               ]

column_names = ['author']
for pub_yr in PUBLICATION_RECORDING_YEARS:
    temp = 'npub_yr'
    column_names.append(temp+str(pub_yr))
    for cit_yr in CITATION_RECORDING_YEAR_LIST:
        temp1 = 'ncit_yr'+str(pub_yr+cit_yr-1)+'_pub_yr'+str(pub_yr)
        column_names.append(temp1)
    column_names.append('ncit_yr'+str(CAREER_LENGTH)+'_pub_yr'+str(pub_yr))

column_names.append('npub_yr'+str(CAREER_LENGTH))
column_names.append('ncit_yr'+str(CAREER_LENGTH)+'_pub_yr'+str(CAREER_LENGTH))
column_names.append('ncit_yr'+str(CAREER_LENGTH)+'_top_'+str(NO_OF_TOP_CITATIONS)+'_pub')

print(column_names)
'''
print(finalData.shape)
start_time = time.time()

##filter the data with publications only limiting upto career age of size CAREER_LENGTH
dataGroupByAuthor = publicationWithCitationData.groupby(['author'], sort=False)
pubDataMaxCareerAge = dataGroupByAuthor.apply(lambda group: group[group['pub_year'] <= (group['pub_year'].min() + CAREER_LENGTH - 1)])
tmpName = 'npub_yr'+str(CAREER_LENGTH)
#compute the no. of publications at the end of CAREER_LENGTH
#This contains the citation information as well and so unique publication ids are taken
finalData[tmpName] = pubDataMaxCareerAge[['author','pub_id']].groupby(['author'], sort=False)['pub_id'].nunique()

# print('At the beginning')
# print(pubDataMaxCareerAge[pubDataMaxCareerAge['author'] == 'abraham berman'])

##filter the data with citations only limiting up to career age of size CAREER_LENGTH for publications only within CAREER_LENGTH
dataGroupByAuthor= pubDataMaxCareerAge.groupby(['author'], sort=False)
citDataMaxCareerAge = dataGroupByAuthor.apply(lambda group: group[group['cited_year'] <= (group['pub_year'].min() + CAREER_LENGTH - 1)])
tmpName = 'ncit_yr'+str(CAREER_LENGTH)+'_pub_yr'+str(CAREER_LENGTH)
#compute the no. of citations at the end of CAREER_LENGTH
#function '.count()' is used since certain publications will not have citations at all and they are ignored
# To account for that '.size()' should be used
finalData[tmpName] = citDataMaxCareerAge[['author','cited_year']].groupby(['author'], sort=False)['cited_year'].count()

##compute the total no. of citations obtained for top (3)NO_OF_TOP_CITATIONS papers
#Get the count of citations for each paper and for each author and sorted in ascending order
#Consider only the first (3) citations and compute the sum for each author
temp = citDataMaxCareerAge[['author','pub_id','cited_year']].groupby(['author','pub_id']).count()
temp = temp.sort_values(by=['cited_year'], ascending=False).reset_index()
temp = temp.groupby(['author']).nth(range(NO_OF_TOP_CITATIONS)).reset_index()
tmpName = 'ncit_yr'+str(CAREER_LENGTH)+'_top_'+str(NO_OF_TOP_CITATIONS)+'_pub'
finalData[tmpName] = temp.groupby(['author']).sum()

# publication data is chosen instead of citation data (must have been a good choice)
# But while filtering citation data - we miss out publications which is not cited at all
tmpPubData = pubDataMaxCareerAge

# print('At the beginning - after 15yr citations')
# print(tmpPubData[tmpPubData['author'] == 'abraham berman'])

#the idea is to use the same data which is initialy filtered for (15) years and 
# then for (3), (2) and (1) year respectively as specified in PUBLICATION_RECORDING_YEAR_LIST
for pub_yr in PUBLICATION_RECORDING_YEAR_LIST:
    #filter the data for the respective career age from when their career started and compute no. of publication so far
    
    print('For publication year - ',pub_yr)
    tmpGroup = tmpPubData.groupby(['author'], sort=False)
    tmpPubData = tmpGroup.apply(lambda group: group[group['pub_year'] <= (group['pub_year'].min() + pub_yr - 1)])
    tmpName = 'npub_yr'+str(pub_yr)
    finalData[tmpName] = tmpPubData[['author','pub_id']].groupby(['author'], sort=False)['pub_id'].nunique()
    
#     print('After 1yr publication 15yr citations')
#     print(tmpPubData[tmpPubData['author'] == 'abraham berman'])
    
    #take the filtered data based on their publication and compute citation obtained up untill 
    #for (15),(3),(2) and (1) respectively as specified in CITATION_RECORDING_YEAR_LIST
    tmpGroup= tmpPubData.groupby(['author'], sort=False)
    tmpCitData = tmpGroup.apply(lambda group: group[group['cited_year'] <= (group['pub_year'].min() + CAREER_LENGTH - 1)])
    tmpName = 'ncit_yr'+str(CAREER_LENGTH)+'_pub_yr'+str(pub_yr)
    finalData[tmpName] = tmpCitData[['author','cited_year']].groupby(['author'], sort=False)['cited_year'].count() 
    
    for cit_yr in CITATION_RECORDING_YEAR_LIST:
        #Initially the filtered citation was iteratively used But this results in ERROR
        # because filtering using citation details removes publications which does not have citation at all
        #In that case, if the publications in author's first year does not get citations then 'group['pub_year'].min()' will not
        # give the start year
        
        # if the combination of publication and citation year goes beyond the expected CAREER_AGE then it can be ignored
        if (cit_yr - 1 + pub_yr) < CAREER_LENGTH:
        
            print('For citation year - ',cit_yr)
            tmpGroup= tmpPubData.groupby(['author'], sort=False)
            tmpCitData = tmpGroup.apply(lambda group: group[group['cited_year'] <= (group['pub_year'].min() + pub_yr - 1 + cit_yr - 1)])
            tmpName = 'ncit_yr'+str(cit_yr - 1 + pub_yr)+'_pub_yr'+str(pub_yr)
            finalData[tmpName] = tmpCitData[['author','cited_year']].groupby(['author'], sort=False)['cited_year'].count()


#print(finalData.head())
end_time = time.time()
print((end_time-start_time), 'seconds')

(1708185, 3)
('For publication year - ', 3)
('For citation year - ', 1)
(12353.616490125656, 'seconds')


In [11]:
#If no data found for any of the constraints specified then NaN is copied. So it is important to substitute with 0
finalData.fillna(0, inplace=True)
finalData.to_csv('./data/author-career-short-snapshot.csv')

In [12]:
print((end_time-start_time), 'seconds')

(128255.56474614143, 'seconds')


#### Have a look at 'cumadv-vs-sacspark-data-test' notebook to get an understanding of how it is tested

In [28]:
finalData.head(10)

Unnamed: 0_level_0,start_year,career_length,gender,npub_yr15,ncit_yr15_pub_yr15,ncit_yr15_top_3_pub,npub_yr3,ncit_yr15_pub_yr3,ncit_yr5_pub_yr3,ncit_yr4_pub_yr3,...,npub_yr2,ncit_yr15_pub_yr2,ncit_yr4_pub_yr2,ncit_yr3_pub_yr2,ncit_yr2_pub_yr2,npub_yr1,ncit_yr15_pub_yr1,ncit_yr3_pub_yr1,ncit_yr2_pub_yr1,ncit_yr1_pub_yr1
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'maseka lesaoana,2001,15,none,2,9,9,1,9,1,1,...,1,9,1,1,1,1,9,1,1,0
(max) zong-ming cheng,2009,3,none,2,0,0,2,0,0,0,...,1,0,0,0,0,1,0,0,0,0
(zhou) bryan bai,2011,2,none,2,0,0,2,0,0,0,...,2,0,0,0,0,1,0,0,0,0
a lun,2010,1,none,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
a min tjoa,1979,37,none,16,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
a'fza shafie,2015,1,none,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
a'zraa afhzan ab rahim,2011,1,none,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
a-chuan hsueh,1986,3,f,2,1,1,2,1,0,0,...,1,0,0,0,0,1,0,0,0,0
a-imam al-sammak,1992,17,m,5,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
a-nasser ansari,2003,7,m,10,30,24,4,28,11,7,...,2,11,7,4,0,2,11,4,0,0
