In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import timeit
import time
import seaborn as sns
import random

#these are user defined modules

In [70]:
# Authors filtered by minimum career length; CAREER_LENGTH picks which input file is read.
CAREER_LENGTH_LIST = [0,5,10,15,20,25]
CAREER_LENGTH = 0

authorStartEndCareerData = pd.read_csv('./data/authors-scientific-atleast-'+str(CAREER_LENGTH)+'-year.csv')
print(authorStartEndCareerData.head())
print(authorStartEndCareerData.shape)

# Keep only author and start_year, and attach an inclusive career length.
# The stored career_length is end_year - start_year, so an author who starts and
# finishes in the same year has 0 there; adding 1 makes that a one-year career.
# Career length is used later to filter credible authors.
finalData = authorStartEndCareerData[['author', 'start_year']].assign(
    career_length=authorStartEndCareerData['career_length'] + 1
)
print(finalData.head())
print(finalData.loc[finalData['author'] == "'maseka lesaoana"])
print(finalData.shape)

# Gender lookup; its 'name' column is renamed so it can be merged on 'author'.
genderData = pd.read_csv('./data/name_gender_1970_2016_noAsians.csv')
genderData = genderData.rename(columns={'name': 'author'})
print(genderData.head())
print(genderData.shape)

# Left merge keeps every author; authors without a gender entry get 'none'.
finalData = pd.merge(finalData, genderData, how="left", on=["author"])
finalData['gender'] = finalData['gender'].fillna('none')
print(finalData.head())

# Interpolate the row counts with %: passing the count as a second print()
# argument left the literal "%s" in the output.
print("finalData with duplicates: %s" % finalData.shape[0])
finalData = finalData.drop_duplicates()
print("finalData no duplicates: %s" % finalData.shape[0])
print('Gender  - ',finalData['gender'].unique())

# From here on finalData is looked up by author name.
finalData = finalData.set_index(['author'])
print(finalData.head())
print(finalData.shape)

publicationData = pd.read_csv('./data/author_publications_1970_2016_asiansAsNone.txt')
print(publicationData.head())
print(publicationData.shape)

# Keep only publications whose author is present in finalData's index.
print('Filter to reduce data for estimate time')
isKnownAuthor = publicationData['author'].isin(finalData.index.values)
publicationData = publicationData.loc[isKnownAuthor]
print(publicationData.head())
print(publicationData.shape)

# Each citation row says: publication id1 cited publication id2 in the year 'year'.
# The per-author citation file ('authors_cite_2016_asiansAsNone.txt') cannot be used
# instead, because it does not tell for which paper an author was cited in a given year.
citationData = pd.read_csv('./data/citations_1970_2016_asiansAsNone.txt')
print(citationData.head())
print(citationData.shape)

# One row per (publication, citation received); publications without citations
# survive the left merge with missing citation columns.
publicationWithCitationData = publicationData.merge(
    citationData[['id2', 'year']],
    how='left',
    left_on='pub_id',
    right_on='id2',
)
print(publicationWithCitationData.head())

# 'id2' duplicates 'pub_id' after the merge, so it is dropped; the two 'year'
# columns are renamed after what they describe.
publicationWithCitationData = (
    publicationWithCitationData
    .drop(['id2'], axis=1)
    .rename(columns={'year_x': 'pub_year', 'year_y': 'cited_year'})
)
print(publicationWithCitationData.head())

print(publicationWithCitationData.shape)

                  author  start_year  end_year  total_num_pub  career_length
0       'maseka lesaoana        2001      2015              2             14
1  (max) zong-ming cheng        2009      2011              2              2
2       (zhou) bryan bai        2011      2012              2              1
3                  a lun        2010      2010              1              0
4             a min tjoa        1979      2015            193             36
(1708185, 5)
                  author  start_year  career_length
0       'maseka lesaoana        2001             15
1  (max) zong-ming cheng        2009              3
2       (zhou) bryan bai        2011              2
3                  a lun        2010              1
4             a min tjoa        1979             37
             author  start_year  career_length
0  'maseka lesaoana        2001             15
(1708185, 3)
             author gender
0    nima mesgarani      m
1    james j. abbas      m
2  eberhart zrenner      

In [71]:
# Other snapshots that can be loaded in place of the one below:
#savedData = pd.read_csv('./data/author-career-short-snapshot-Copy1.csv')
#savedData = pd.read_csv('./data/author-career-short-snapshot.csv')

# Previously saved per-author metrics; preview the first rows and the column names.
savedData = pd.read_csv('./data/failed-author-career-short-snapshot.csv')
for preview in (savedData.head(10), savedData.columns):
    print(preview)

                 author  start_year  career_length gender  npub_yr15  \
0        anwar al hamra        2003              7      m          8   
1          chia-ying li        2005             12   none         10   
2        gyoo-yong chae        2003              3   none          5   
3        ruben gonzalez        1998             18   none         25   
4  sachin s. sapatnekar        1991             26      m        123   
5        sean j. taylor        2011              5      m          3   
6         somnath mitra        2009              4      m          4   
7          tobias graml        2007              5      m          4   

   ncit_yr15_pub_yr15  ncit_yr15_top_3_pub  npub_yr3  ncit_yr15_pub_yr3  \
0                  17                   16         4                  5   
1                  16                   15         3                  3   
2                   1                    1         5                  1   
3                  15                   12        1

In [18]:
# List every column of the saved snapshot (the head() preview truncates wide frames).
print(savedData.columns)

Index([u'author', u'start_year', u'career_length', u'gender', u'npub_yr15',
       u'ncit_yr15_pub_yr15', u'ncit_yr15_top_3_pub', u'npub_yr3',
       u'ncit_yr15_pub_yr3', u'ncit_yr5_pub_yr3', u'ncit_yr4_pub_yr3',
       u'ncit_yr3_pub_yr3', u'npub_yr2', u'ncit_yr15_pub_yr2',
       u'ncit_yr4_pub_yr2', u'ncit_yr3_pub_yr2', u'ncit_yr2_pub_yr2',
       u'npub_yr1', u'ncit_yr15_pub_yr1', u'ncit_yr3_pub_yr1',
       u'ncit_yr2_pub_yr1', u'ncit_yr1_pub_yr1'],
      dtype='object')


### Iterative approach to calculate the parameters
The estimated time to calculate is 20 days.

In [13]:
# Spot-check: print the finalData row of 10 randomly chosen authors.
# list(...) is required because random.sample only accepts a sequence on
# recent Python versions, and a pandas Index does not qualify.
randomSample = random.sample(list(finalData.index), 10)
for specificAuthorIndex in randomSample:
    specificAuthor = finalData.loc[specificAuthorIndex]
    # finalData is indexed by author, so the row holds only start_year,
    # career_length and gender. The name is the index label, and the fields
    # must be read by label (position 0 is start_year, not the author).
    authorName = specificAuthorIndex
    startYear = specificAuthor['start_year']

    print(specificAuthor)

print(randomSample)

start_year       2010
career_length       3
gender              m
Name: mohammad abd-alrahman mahmoud abushariah, dtype: object
start_year       2011
career_length       1
gender              f
Name: sonia marcela heemstra de groot, dtype: object
start_year       2001
career_length       1
gender           none
Name: eun hee choi, dtype: object
start_year       1993
career_length       3
gender           none
Name: c. k. luk, dtype: object
start_year       2014
career_length       1
gender              m
Name: filippo belardelli, dtype: object
start_year       2008
career_length       1
gender              m
Name: patrick friese, dtype: object
start_year       2009
career_length       1
gender              m
Name: joel aronoff, dtype: object
start_year       2004
career_length      12
gender              m
Name: bahram jalali, dtype: object
start_year       2009
career_length       7
gender           none
Name: ziming zeng, dtype: object
start_year       2014
career_length       2
gend

In [86]:
i = 0

# Offsets from an author's start_year at which publications are counted:
# the start year itself and the next two years. Offset k is reported as
# career age k + 1 in the column names.
PUBLICATION_RECORDING_YEARS = [0, 1, 2]
CAREER_LENGTH = 15
# Citation windows, as offsets added on top of the publication offset.
# Citations up to career year CAREER_LENGTH are always recorded as well.
CITATION_RECORDING_YEAR_LIST = [0, 1, 2]

NO_OF_TOP_CITATIONS = 3


def _matches_saved_value(computedValue, savedRows, columnName, mismatchMessage):
    """Compare a recomputed metric with the value stored in the saved snapshot.

    computedValue   -- the value recomputed in this cell
    savedRows       -- snapshot rows of the current author; only the first row is used
    columnName      -- snapshot column holding the expected value
    mismatchMessage -- printed when the two values differ

    Returns True when the values agree, False (after printing) otherwise.
    """
    if computedValue != savedRows[columnName].values[0]:
        print(mismatchMessage)
        return False
    return True


# Column order mirrors the order in which values are appended per author below.
column_names = ['author']
for pub_yr in PUBLICATION_RECORDING_YEARS:
    pubAge = str(pub_yr + 1)
    column_names.append('npub_yr' + pubAge)
    for cit_yr in CITATION_RECORDING_YEAR_LIST:
        column_names.append('ncit_yr' + str(pub_yr + 1 + cit_yr) + '_pub_yr' + pubAge)
    column_names.append('ncit_yr' + str(CAREER_LENGTH) + '_pub_yr' + pubAge)

column_names.append('npub_yr' + str(CAREER_LENGTH))
column_names.append('ncit_yr' + str(CAREER_LENGTH) + '_pub_yr' + str(CAREER_LENGTH))
column_names.append('ncit_yr' + str(CAREER_LENGTH) + '_top_' + str(NO_OF_TOP_CITATIONS) + '_pub')

print(column_names)
specificAuthorOutputValuesList = []

# 100 random authors, available as an alternative set to verify.
# The loop below does not use it; it checks the authors of the saved snapshot.
# list(...) because random.sample requires a sequence on recent Python versions.
randomSample = random.sample(list(finalData.index), 100)

# Scan the big publication/citation frame once instead of once per author:
# keep only rows of the authors being verified and remember each author's row positions.
authorsToVerify = savedData['author'].values
candidateRows = publicationWithCitationData[publicationWithCitationData['author'].isin(authorsToVerify)]
rowPositionsByAuthor = candidateRows.groupby('author').indices

for specificAuthorIndex in authorsToVerify:
    isAllGood = True
    specificAuthor = finalData.loc[specificAuthorIndex]
    authorName = specificAuthorIndex
    startYear = specificAuthor['start_year']
    # Last calendar year that still belongs to the first CAREER_LENGTH career years.
    lastCareerYear = startYear + CAREER_LENGTH - 1

    print(authorName)
    tempSavedData = savedData[savedData['author'] == authorName]

    # The author name is the first output value.
    specificAuthorOutputValues = [authorName]

    # Rows of this author (none if the author has no publication rows),
    # restricted to papers published within the first CAREER_LENGTH years.
    authorSpecificData = candidateRows.iloc[rowPositionsByAuthor.get(authorName, [])]
    authorSpecificData = authorSpecificData[authorSpecificData['pub_year'] <= lastCareerYear]

    # For each recorded career age, take the papers written up to that age and
    # count the citations they received within each citation window.
    for prod_year in PUBLICATION_RECORDING_YEARS:
        pubAge = str(prod_year + 1)
        tempData = authorSpecificData[authorSpecificData['pub_year'] <= (startYear + prod_year)]

        # No. of distinct publications until this career age.
        numPublications = len(tempData['pub_id'].unique())
        specificAuthorOutputValues.append(numPublications)
        isAllGood = _matches_saved_value(
            numPublications, tempSavedData, 'npub_yr' + pubAge,
            'No. of publications at career age ' + pubAge + ' is not matching') and isAllGood

        # Citations of those papers received until the end of the recorded career.
        tempCitData = tempData[tempData['cited_year'] <= lastCareerYear]

        for citation_year in CITATION_RECORDING_YEAR_LIST:
            citAge = str(prod_year + 1 + citation_year)
            temp1Data = tempCitData[tempCitData['cited_year'] <= (startYear + prod_year + citation_year)]

            # count() skips NaN, which matters because uncited publications
            # carry a missing cited_year.
            numCitations = int(temp1Data['cited_year'].count())
            specificAuthorOutputValues.append(numCitations)
            isAllGood = _matches_saved_value(
                numCitations, tempSavedData, 'ncit_yr' + citAge + '_pub_yr' + pubAge,
                'No. of citations at career age ' + citAge + ' for publications at career age ' + pubAge + ' is not matching') and isAllGood

        numCareerCitations = int(tempCitData['cited_year'].count())
        specificAuthorOutputValues.append(numCareerCitations)
        isAllGood = _matches_saved_value(
            numCareerCitations, tempSavedData, 'ncit_yr' + str(CAREER_LENGTH) + '_pub_yr' + pubAge,
            'No. of citations at career age ' + str(CAREER_LENGTH) + ' for publications at career age ' + pubAge + ' is not matching') and isAllGood

    # No. of papers published until CAREER_LENGTH, and the citations they received by then.
    numCareerPublications = authorSpecificData['pub_id'].nunique()
    specificAuthorOutputValues.append(numCareerPublications)
    isAllGood = _matches_saved_value(
        numCareerPublications, tempSavedData, 'npub_yr' + str(CAREER_LENGTH),
        'No. of publications at career age ' + str(CAREER_LENGTH) + ' is not matching') and isAllGood

    tempCitData = authorSpecificData[authorSpecificData['cited_year'] <= lastCareerYear]

    numAllCitations = int(tempCitData['cited_year'].count())
    specificAuthorOutputValues.append(numAllCitations)
    isAllGood = _matches_saved_value(
        numAllCitations, tempSavedData, 'ncit_yr' + str(CAREER_LENGTH) + '_pub_yr' + str(CAREER_LENGTH),
        'No. of citations at career age ' + str(CAREER_LENGTH) + ' for publications at career age ' + str(CAREER_LENGTH) + ' is not matching') and isAllGood

    # Sum of the citation counts of the NO_OF_TOP_CITATIONS most cited papers.
    # All rows belong to one author, so grouping by pub_id alone is sufficient.
    citationsPerPublication = tempCitData.groupby('pub_id')['cited_year'].count()
    temp = int(citationsPerPublication.nlargest(NO_OF_TOP_CITATIONS).sum())
    specificAuthorOutputValues.append(temp)
    isAllGood = _matches_saved_value(
        temp, tempSavedData, 'ncit_yr' + str(CAREER_LENGTH) + '_top_' + str(NO_OF_TOP_CITATIONS) + '_pub',
        'Sum of citations obtained for top ' + str(NO_OF_TOP_CITATIONS) + ' publications is not matching') and isAllGood

    # On any mismatch show the recomputed values next to the saved ones.
    if not isAllGood:
        print(specificAuthorOutputValues)
        tempList = tempSavedData[column_names].values[0].tolist()
        # Saved counts are floats; show them as ints, but leave missing values
        # untouched because int(NaN) raises.
        print(tempList[:1] + [x if pd.isnull(x) else int(x) for x in tempList[1:]])

    specificAuthorOutputValuesList.append(specificAuthorOutputValues)
    #finalData.loc[authorName] = specificAuthorOutputValues
    # Progress indicator.
    i = i + 1
    if i%100 == 0:
        print(i)
#     if i >= 10:
#         break

# dFrame = pd.DataFrame(columns=column_names, data=specificAuthorOutputValuesList)
# #print(dFrame.head())
# finalData = pd.merge( left=finalData, right=dFrame, how='inner', on=['author'])
print(finalData.head())

#finalData.to_csv('./data/author-career-short-snapshot.csv', index=False)

['author', 'npub_yr1', 'ncit_yr1_pub_yr1', 'ncit_yr2_pub_yr1', 'ncit_yr3_pub_yr1', 'ncit_yr15_pub_yr1', 'npub_yr2', 'ncit_yr2_pub_yr2', 'ncit_yr3_pub_yr2', 'ncit_yr4_pub_yr2', 'ncit_yr15_pub_yr2', 'npub_yr3', 'ncit_yr3_pub_yr3', 'ncit_yr4_pub_yr3', 'ncit_yr5_pub_yr3', 'ncit_yr15_pub_yr3', 'npub_yr15', 'ncit_yr15_pub_yr15', 'ncit_yr15_top_3_pub']
anwar al hamra
chia-ying li
gyoo-yong chae
ruben gonzalez
sachin s. sapatnekar
sean j. taylor
somnath mitra
tobias graml
                       start_year  career_length gender
author                                                 
'maseka lesaoana             2001             15   none
(max) zong-ming cheng        2009              3   none
(zhou) bryan bai             2011              2   none
a lun                        2010              1   none
a min tjoa                   1979             37   none


In [77]:
# Debug check: show the Python type of the saved row (restricted to column_names)
# after converting it with .tolist(). Relies on tempSavedData and column_names
# left behind by the verification cell.
type(tempSavedData[column_names].values[0].tolist())

list

In [74]:

# Authors previously listed here for inspection:
#   'anwar al hamra', 'sachin s. sapatnekar', 'sean j. taylor', 'ruben gonzalez',
#   'gyoo-yong chae', 'chia-ying li', 'tobias graml', 'somnath mitra'
#   (also considered: 'rong-yau huang', 'deborah r. mcdonald')
# Only one author is inspected at a time.
failed_data = ['ruben gonzalez']
for author in failed_data:
    # Raw publication/citation rows of the author ...
    isAuthorPublication = publicationWithCitationData['author'] == author
    print(publicationWithCitationData[isAuthorPublication])
    # ... followed by the saved snapshot's columns and the author's saved values.
    print(savedData.columns)
    isAuthorSnapshot = savedData['author'] == author
    print(savedData[isAuthorSnapshot].values)

                  author  pub_year                    pub_id  cited_year
6272456   ruben gonzalez      2001  53e99aa6b7602d970231ec9d        2009
8011975   ruben gonzalez      2011  53e9a848b7602d970318ef5d         NaN
9811180   ruben gonzalez      2012  53e9ade1b7602d97037e7eb9         NaN
9947403   ruben gonzalez      1998  53e9bad7b7602d9704704216        2001
9947404   ruben gonzalez      1998  53e9bad7b7602d9704704216        2001
9947405   ruben gonzalez      1998  53e9bad7b7602d9704704216        1999
9947406   ruben gonzalez      1998  53e9bad7b7602d9704704216        2002
11354912  ruben gonzalez      2000  53e9ab00b7602d9703483709        2008
11354913  ruben gonzalez      2000  53e9ab00b7602d9703483709        2011
11589923  ruben gonzalez      2005  53e9bcc5b7602d9704948c5a         NaN
12049259  ruben gonzalez      1999  53e9bb7fb7602d97047c2f13         NaN
12659818  ruben gonzalez      2013  53e9b648b7602d97041a72b9         NaN
13451692  ruben gonzalez      2015  55323e1545cec66

In [68]:
# Saved citation counts for papers of career age 3, at citation ages 3, 4 and 5,
# for one of the authors under inspection.
anwarData = savedData.loc[savedData['author'] == 'anwar al hamra']
for citationColumn in ('ncit_yr3_pub_yr3', 'ncit_yr4_pub_yr3', 'ncit_yr5_pub_yr3'):
    print(anwarData[citationColumn])

122767    3
Name: ncit_yr3_pub_yr3, dtype: float64
122767    4
Name: ncit_yr4_pub_yr3, dtype: float64
122767    4
Name: ncit_yr5_pub_yr3, dtype: float64


In [62]:
# Show one author's raw publication/citation rows next to her saved snapshot values.
isAuthorPublication = publicationWithCitationData['author'] == 'deborah r. mcdonald'
print(publicationWithCitationData.loc[isAuthorPublication])
isAuthorSnapshot = savedData['author'] == 'deborah r. mcdonald'
print(savedData.loc[isAuthorSnapshot].values)

                       author  pub_year                    pub_id  cited_year
18595012  deborah r. mcdonald      1988  53e9ab3db7602d97034cb49d        2010
18595013  deborah r. mcdonald      1988  53e9ab3db7602d97034cb49d        2013
18595014  deborah r. mcdonald      1988  53e9ab3db7602d97034cb49d        1996
Index([u'author', u'start_year', u'career_length', u'gender', u'npub_yr15',
       u'ncit_yr15_pub_yr15', u'ncit_yr15_top_3_pub', u'npub_yr3',
       u'ncit_yr15_pub_yr3', u'ncit_yr5_pub_yr3', u'ncit_yr4_pub_yr3',
       u'ncit_yr3_pub_yr3', u'npub_yr2', u'ncit_yr15_pub_yr2',
       u'ncit_yr4_pub_yr2', u'ncit_yr3_pub_yr2', u'ncit_yr2_pub_yr2',
       u'npub_yr1', u'ncit_yr15_pub_yr1', u'ncit_yr3_pub_yr1',
       u'ncit_yr2_pub_yr1', u'ncit_yr1_pub_yr1'],
      dtype='object')
[['deborah r. mcdonald' 1988 1 'f' 1 1.0 1.0 1 1.0 0.0 0.0 0.0 1 1.0 0.0
  0.0 0.0 1 1.0 0.0 0.0 0.0]]
