In [1]:
from core.search.query_paper_mag import author_paper_mag_multiquery
from get_links                   import *

import pandas as pd
import numpy as np

In [2]:
def link_pandas(links):
    ''' Turns the dictionary into pandas df.

    Args:
        links: mapping of paper id -> dict of per-paper fields.

    Returns:
        A DataFrame with one row per paper, indexed by 'PaperId', whose
        columns are the keys of the inner dicts. An empty mapping gives an
        empty DataFrame that still carries the 'PaperId' index name.

    The input mapping and its inner dicts are left unmodified.
    '''
    # Build a fresh record per paper instead of writing 'PaperId' into the
    # caller's dicts, so the input can be reused safely afterwards.
    records = [
        {**link_dict, 'PaperId': paper_id}
        for paper_id, link_dict in links.items()
    ]

    # With no records there is no 'PaperId' column for set_index to find,
    # so return the empty, correctly-indexed frame directly.
    if not records:
        return pd.DataFrame(index=pd.Index([], name='PaperId'))

    return pd.DataFrame(records).set_index('PaperId')

In [3]:
# Authors whose papers are analysed in this notebook
author_ids = [
    2100918400,  # Lexing Xie, 137 Papers
]

# Look up the paper ids for those authors
paper_ids = author_paper_mag_multiquery(author_ids)

In [4]:
# Report how many paper ids the author query returned
paper_count = len(paper_ids)
print('Number of papers:', paper_count)

Number of papers: 137


In [8]:
# Query the API for the per-author link values (presumably the expected
# citation totals, going by the variable name -- TODO confirm).
citation_expect = author_link_vals(author_ids)

# Alternative rendering, currently disabled:
#DictTable(citation_expect)

# Bare last expression so the notebook displays the result
citation_expect

{2100918400: 4852}

In [6]:
# Get links from graph
graph_links = link_pandas(link_from_graph(paper_ids))

# Per-paper link counts. Mapping len over the single column gives the same
# numbers as a row-wise apply(axis=1) without building a Series per row.
graph_links['CitationCount']  = graph_links['Citations'].map(len)
graph_links['ReferenceCount'] = graph_links['References'].map(len)

# Summary of graph links
print(graph_links.describe())
print()
print('Totals')
# The 'sum' alias gives the same result as np.sum here; passing the numpy
# callable to agg emits a FutureWarning on pandas 2.x.
print(graph_links.agg({'CitationCount': 'sum', 'ReferenceCount': 'sum'}))

       CitationCount  ReferenceCount
count     137.000000      137.000000
mean       20.832117       19.204380
std        46.947964       26.841347
min         0.000000        0.000000
25%         0.000000        1.000000
50%         3.000000       10.000000
75%        16.000000       25.000000
max       318.000000      153.000000

Totals
CitationCount     2854
ReferenceCount    2631
dtype: int64


In [30]:
# Get links via link_from_evaluate (counterpart of the graph links cell)
eval_links = link_pandas(link_from_evaluate(paper_ids))

# Per-paper link counts. Mapping len over the single column gives the same
# numbers as a row-wise apply(axis=1) without building a Series per row.
eval_links['CitationCount']  = eval_links['Citations'].map(len)
eval_links['ReferenceCount'] = eval_links['References'].map(len)

# Summary of evaluate links
print(eval_links.describe())
print()
print('Totals')
# The 'sum' alias gives the same result as np.sum here; passing the numpy
# callable to agg emits a FutureWarning on pandas 2.x.
print(eval_links.agg({'CitationCount': 'sum', 'ReferenceCount': 'sum'}))

       CitationCount  ReferenceCount
count     137.000000      137.000000
mean       34.627737       22.678832
std        74.321441       27.285510
min         0.000000        0.000000
25%         1.000000        7.000000
50%         9.000000       15.000000
75%        25.000000       28.000000
max       527.000000      156.000000

Totals
CitationCount     4744
ReferenceCount    3107
dtype: int64


In [32]:
# Find difference between data
def set_diff(x, y):
    '''Return the elements of x that are not in y, as a list (order not preserved).'''
    return list(set(x) - set(y))

def get_total(x, y):
    '''Return x + y as a list; only referenced by the disabled Total* columns below.'''
    return list((x + y))

# Pair each paper's graph links with its evaluate links. Column names present
# in both frames receive the 'Graph' / 'Eval' suffix.
link_diff = pd.merge(graph_links, eval_links, on='PaperId', suffixes=('Graph', 'Eval'))
link_diff['UniqueCitationsGraph'] = link_diff.apply(lambda x: set_diff(x['CitationsGraph'], x['CitationsEval']), axis=1)
link_diff['UniqueCitationsEval']  = link_diff.apply(lambda x: set_diff(x['CitationsEval'], x['CitationsGraph']), axis=1)
#link_diff['TotalCitations']       = link_diff.apply(lambda x: get_total(x['CitationsEval'], x['CitationsGraph']), axis=1)

link_diff['UniqueReferencesGraph'] = link_diff.apply(lambda x: set_diff(x['ReferencesGraph'], x['ReferencesEval']), axis=1)
link_diff['UniqueReferencesEval']  = link_diff.apply(lambda x: set_diff(x['ReferencesEval'], x['ReferencesGraph']), axis=1)
#link_diff['TotalReferences']       = link_diff.apply(lambda x: get_total(x['ReferencesEval'], x['ReferencesGraph']), axis=1)

cols = ['UniqueCitationsEval', 'UniqueCitationsGraph',# 'TotalCitations',
        'UniqueReferencesEval', 'UniqueReferencesGraph',]# 'TotalReferences']
# Take an explicit copy: the Count columns added below would otherwise be
# assigned onto a column selection of the merged frame, which triggers
# pandas' SettingWithCopyWarning.
link_diff = link_diff[cols].copy()

# Aggregate prep
agg_list = dict()
for col in cols:
    # Length of each per-paper list; same values as a row-wise apply.
    link_diff[col+'Count']  = link_diff[col].map(len)
    # 'sum' alias instead of np.sum, which emits a FutureWarning in agg on pandas 2.x.
    agg_list[col+'Count'] = 'sum'

# Summary of link difference
print(link_diff.describe())
print()
print('Totals')
print(link_diff.agg(agg_list))

       UniqueCitationsEvalCount  UniqueCitationsGraphCount  \
count                137.000000                 137.000000   
mean                   4.226277                   0.211679   
std                    9.795702                   0.919004   
min                    0.000000                   0.000000   
25%                    0.000000                   0.000000   
50%                    1.000000                   0.000000   
75%                    4.000000                   0.000000   
max                   70.000000                   8.000000   

       UniqueReferencesEvalCount  UniqueReferencesGraphCount  
count                 137.000000                  137.000000  
mean                    3.832117                    0.357664  
std                     9.470985                    2.024618  
min                     0.000000                    0.000000  
25%                     0.000000                    0.000000  
50%                     0.000000                    0.000000  
