# CZ4071 Data Collection and Processing
##### Author: Phoe Chuan Bin

### 0. Import dependencies and set filepaths

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [7]:
# Spreadsheet of researchers; its 'dblp' column holds each person's dblp link
input_filepath = "./Input/DataScientists.xls"
# Destination for the edge list as extracted, before processing
raw_output_filepath = "./Input/RawNetworkDataFrame.csv"
# Destination for the edge list after processing
processed_output_filepath = "./Input/ProcessedNetworkDataFrame.csv"
# Prefix of dblp person records; '<pid>.xml' is appended to it to fetch a record
host_address = "https://dblp.org/pid/"

### 1. Read input file

In [3]:
# Load the researcher spreadsheet and pull out the column of dblp links
input_file = pd.read_excel(input_filepath)
link_addresses = input_file.loc[:, 'dblp']

### 2. Extract Data
For each HTTP address in the dblp column:
1. Get the PID from the redirected address
2. Skip if this PID has been seen before (prevents duplicates); otherwise add it to the seen list (pid_list)
3. Fetch the person's XML record and parse it with bs4
4. Skip if no person record exists for this PID (prevents errors on missing records); otherwise store the name in the name mapping (pid_name_mapping)
5. Get all articles and extract ['author_pid','coauthor_pid','year','title']
6. Add to dataframe network_df

In [4]:
# PIDs already visited, in first-seen order; used to skip repeated links
pid_list = list()
# PID -> person name, filled in as person records are parsed
pid_name_mapping = dict()
# One row per (author, coauthor, publication) triple
network_df = pd.DataFrame(
    columns=[
        'author_pid',
        'coauthor_pid',
        'year',
        'title',
    ],
)

In [5]:
# Seconds to wait for a single HTTP response before giving up on that request
request_timeout = 30

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: growing a DataFrame one row at a time copies the whole frame on every
# row, and DataFrame.append no longer exists in pandas 2.x.
new_rows = []

i = 0
print("Total link addresses:", len(link_addresses))
try:
    for i, link_address in enumerate(link_addresses, start=1):
        # Follow the redirect; the PID is the last two path segments of the
        # final URL with the '.html' suffix removed.
        try:
            r = requests.get(link_address, timeout=request_timeout)
        except requests.RequestException as err:
            print("Skipping link address", i, "- request failed:", err)
            continue
        pid = '/'.join(r.url.split('/')[-2:]).replace('.html','')

        # Do not process a repeated PID (guards against duplicate records in the input)
        if pid in pid_list:
            print("Skipping repeated PID", pid)
            continue

        # Fetch the person's XML record. A PID whose download failed is not
        # marked as seen, so a later duplicate link can still retry it.
        url = host_address + pid + '.xml'
        try:
            xml_data = requests.get(url, timeout=request_timeout).content
        except requests.RequestException as err:
            print("Skipping PID", pid, "- request failed:", err)
            continue
        pid_list.append(pid)

        # Parse with Beautiful Soup and update pid_name_mapping
        soup = BeautifulSoup(xml_data, "xml")
        try:
            # find() returns None when there is no <dblpperson> element
            # (TypeError on subscript); a missing 'name' attribute raises KeyError.
            name = soup.find('dblpperson')['name']
        except (TypeError, KeyError):
            print("Skipping invalid PID", pid, ". No such person record.")
            continue
        pid_name_mapping[pid] = name

        # Each <r> element wraps one publication record as its first child
        for article in soup.find_all('r'):
            article = article.contents[0]
            year_tag = article.find('year')
            title_tag = article.find('title')
            # get_text() rather than .string: .string is None whenever the
            # element has more than one child (e.g. a title with inline
            # markup), which would silently discard the title.
            article_year = year_tag.get_text() if year_tag is not None else None
            article_title = title_tag.get_text() if title_tag is not None else None
            for article_author in article.find_all('author'):
                coauthor_pid = article_author['pid']
                # The person themself is not their own coauthor
                if coauthor_pid == pid:
                    continue
                new_rows.append({'author_pid': pid, 'coauthor_pid': coauthor_pid, 'year': article_year, 'title': article_title})
        print("Processed link address", i)
finally:
    # Runs even if the loop is interrupted, so rows gathered so far are not
    # lost while their PIDs are already recorded in pid_list.
    if new_rows:
        new_rows_df = pd.DataFrame(new_rows, columns=network_df.columns)
        if network_df.empty:
            network_df = new_rows_df
        else:
            network_df = pd.concat([network_df, new_rows_df], ignore_index=True)

Total link addresses: 1220
Processed link address 1
Processed link address 2
Processed link address 3
Processed link address 4
Processed link address 5
Processed link address 6
Processed link address 7
Processed link address 8
Skipping repeated PID 04/7892
Processed link address 10
Processed link address 11
Processed link address 12
Processed link address 13
Processed link address 14
Processed link address 15
Processed link address 16
Skipping repeated PID k/AlfonsKemper
Processed link address 18
Processed link address 19
Processed link address 20
Processed link address 21
Processed link address 22
Processed link address 23
Processed link address 24
Processed link address 25
Skipping repeated PID h/AlonYHalevy
Processed link address 27
Processed link address 28
Processed link address 29
Processed link address 30
Processed link address 31
Skipping repeated PID 136/7882
Processed link address 33
Processed link address 34
Processed link address 35
Processed link address 36
Processed link 

Processed link address 299
Processed link address 300
Processed link address 301
Processed link address 302
Processed link address 303
Processed link address 304
Processed link address 305
Processed link address 306
Processed link address 307
Processed link address 308
Processed link address 309
Processed link address 310
Processed link address 311
Processed link address 312
Processed link address 313
Skipping repeated PID f/GeorgeHLFletcher
Processed link address 315
Processed link address 316
Skipping repeated PID 32/1624
Processed link address 318
Processed link address 319
Skipping repeated PID k/GeorgiaKoutrika
Skipping repeated PID k/GeorgiaKoutrika
Processed link address 322
Processed link address 323
Processed link address 324
Processed link address 325
Processed link address 326
Processed link address 327
Processed link address 328
Skipping repeated PID 151/8293
Processed link address 330
Processed link address 331
Processed link address 332
Processed link address 333
Processe

Processed link address 595
Processed link address 596
Processed link address 597
Processed link address 598
Processed link address 599
Processed link address 600
Processed link address 601
Processed link address 602
Processed link address 603
Processed link address 604
Processed link address 605
Processed link address 606
Processed link address 607
Processed link address 608
Processed link address 609
Processed link address 610
Processed link address 611
Processed link address 612
Processed link address 613
Skipping repeated PID t/MartinTheobald
Processed link address 615
Processed link address 616
Processed link address 617
Processed link address 618
Processed link address 619
Processed link address 620
Skipping repeated PID 49/10034
Processed link address 622
Processed link address 623
Processed link address 624
Processed link address 625
Skipping repeated PID 75/887
Skipping repeated PID 75/887
Processed link address 628
Processed link address 629
Skipping repeated PID r/MatthiasRen

Processed link address 891
Processed link address 892
Processed link address 893
Processed link address 894
Processed link address 895
Processed link address 896
Processed link address 897
Processed link address 898
Processed link address 899
Skipping repeated PID 11/7555
Processed link address 901
Processed link address 902
Processed link address 903
Processed link address 904
Processed link address 905
Processed link address 906
Processed link address 907
Processed link address 908
Processed link address 909
Skipping repeated PID 57/10514
Processed link address 911
Processed link address 912
Processed link address 913
Processed link address 914
Processed link address 915
Processed link address 916
Processed link address 917
Processed link address 918
Processed link address 919
Processed link address 920
Processed link address 921
Processed link address 922
Skipping repeated PID 83/2389-1
Processed link address 924
Processed link address 925
Processed link address 926
Processed link a

Processed link address 1181
Processed link address 1182
Processed link address 1183
Processed link address 1184
Skipping repeated PID 160/1053
Skipping repeated PID 160/1053
Processed link address 1187
Processed link address 1188
Skipping repeated PID t/YufeiTao
Processed link address 1190
Processed link address 1191
Skipping repeated PID 172/5536
Processed link address 1193
Processed link address 1194
Processed link address 1195
Processed link address 1196
Processed link address 1197
Processed link address 1198
Processed link address 1199
Processed link address 1200
Skipping repeated PID 117/3757
Processed link address 1202
Processed link address 1203
Processed link address 1204
Processed link address 1205
Processed link address 1206
Processed link address 1207
Processed link address 1208
Processed link address 1209
Processed link address 1210
Skipping repeated PID 169/3419
Processed link address 1212
Processed link address 1213
Processed link address 1214
Processed link address 1215


### 3. Save extracted raw data

In [9]:
# Display the extracted author/coauthor rows held in network_df
network_df

Unnamed: 0,author_pid,coauthor_pid,year,title
0,75/9436,147/1343,2021,Decomposed Bounded Floats for Fast Compression...
1,75/9436,38/6049,2021,Decomposed Bounded Floats for Fast Compression...
2,75/9436,163/0545,2021,Decomposed Bounded Floats for Fast Compression...
3,75/9436,05/7063,2021,A Demonstration of Relic: A System for REtrosp...
4,75/9436,131/4166,2021,A Demonstration of Relic: A System for REtrosp...
...,...,...,...,...
362947,146/6971,188/6434,2016,RadioHound: A Pervasive Sensing Network for Su...
362948,146/6971,48/450-7,2013,Transmission capacity of device-to-device comm...
362949,146/6971,85/5485,2013,Transmission capacity of device-to-device comm...
362950,146/6971,89/6609-1,2013,Transmission capacity of device-to-device comm...


In [8]:
# Write the extracted rows to the raw output CSV, without the index column
network_df.to_csv(path_or_buf=raw_output_filepath, index=False)

### 4. Process raw data
- Remove individuals not in the network using seen list (pid_list)
- Add author_name and coauthor_name columns using name mapping (pid_name_mapping)
- Remove duplicates: edges are bidirectional, so a row is dropped if it matches another row once author and coauthor are swapped

In [10]:
# Inspect the PIDs recorded during extraction (a PID is recorded before its
# person record is validated, so skipped invalid PIDs are included)
pid_list

['75/9436',
 '162/9092',
 '127/6195',
 '121/4198',
 '116/1678',
 '39/1380',
 '139/0855',
 '04/7892',
 'j/AdamJatowt',
 '62/1373',
 '21/8510',
 '04/1835',
 'l/ALabrinidis',
 '92/2820',
 'k/AlfonsKemper',
 '159/6520',
 'd/AlinDeutsch',
 '134/9031',
 '164/1401',
 'p/NeoklisPolyzotis',
 's/ASimitsis',
 '163/3911',
 'h/AlonYHalevy',
 's/ASdaSilva',
 '16/4915',
 's/AmbujKSingh',
 '131/6619',
 '136/7882',
 '67/2583',
 '77/5034',
 '75/5002',
 'd/AmolDeshpande',
 'a/AmrElAbbadi',
 '05/8073-1',
 '160/4306',
 '127/1469',
 '37/10133-3',
 '26/2579',
 'a/AnastassiaAilamaki',
 'c/AndreaCali',
 'h/AndreasHotho',
 '167/6469',
 '38/2946',
 '18/2478',
 '222/3221',
 '147/4971',
 '58/4127',
 '218/6513',
 'b/ABonifati',
 'd/AnHaiDoan',
 'm/AnimeshMukherjee',
 '73/2286',
 '54/385-1',
 '81/10578',
 '29/11192',
 's/AnnaCinziaSquicciarini',
 '63/5512',
 '63/9828',
 '147/1241',
 't/AnthonyKHTung',
 '96/11411',
 'd/ADeligiannakis',
 '163/0449',
 'z/AoyingZhou',
 '183/5034',
 '20/3559',
 'p/ANPapadopoulos',
 't/Ar

In [11]:
# Inspect the PID -> name lookup built from each person's dblp record
pid_name_mapping

{'75/9436': 'Aaron J. Elmore',
 '162/9092': 'Abdalghani Abujabal',
 '127/6195': 'Abdul Quamar',
 '121/4198': 'Abdulhakim Ali Qahtan',
 '116/1678': 'Abhijnan Chakraborty',
 '39/1380': 'Abhishek Ghose',
 '139/0855': 'Abhishek Kumar Singh',
 '04/7892': 'Abolfazl Asudeh',
 'j/AdamJatowt': 'Adam Jatowt',
 '62/1373': 'Adam J. Lee',
 '21/8510': 'Alexander Thomson',
 '04/1835': 'Alexandra Meliou',
 'l/ALabrinidis': 'Alexandros Labrinidis',
 '92/2820': 'Alexandros Ntoulas',
 'k/AlfonsKemper': 'Alfons Kemper',
 '159/6520': 'Ali Ahmadvand',
 'd/AlinDeutsch': 'Alin Deutsch',
 '134/9031': 'Aline Bessa',
 '164/1401': 'Alireza Heidari',
 'p/NeoklisPolyzotis': 'Neoklis Polyzotis',
 's/ASimitsis': 'Alkis Simitsis',
 '163/3911': 'Allison Holloway',
 'h/AlonYHalevy': 'Alon Y. Halevy',
 's/ASdaSilva': 'Altigran Soares da Silva',
 '16/4915': 'Alvin Cheung',
 's/AmbujKSingh': 'Ambuj K. Singh',
 '131/6619': 'Amelie Chi Zhou',
 '136/7882': 'Amit Awekar',
 '67/2583': 'Amit Gupta',
 '77/5034': 'Amitabha Bagchi'

In [12]:
# Remove individuals not in the network
network_df = network_df[network_df['author_pid'].isin(pid_list)]
network_df = network_df[network_df['coauthor_pid'].isin(pid_list)]
network_df

Unnamed: 0,author_pid,coauthor_pid,year,title
2,75/9436,163/0545,2021,Decomposed Bounded Floats for Fast Compression...
5,75/9436,163/0545,2021,VergeDB: A Database for IoT Analytics on Edge ...
10,75/9436,f/MJFranklin,2021,VergeDB: A Database for IoT Analytics on Edge ...
11,75/9436,147/1189,2021,VergeDB: A Database for IoT Analytics on Edge ...
13,75/9436,117/3757,2021,Version Reconciliation for Collaborative Datab...
...,...,...,...,...
362916,146/6971,w/YangWang15,2019,sharedCharging: Data-Driven Shared Charging fo...
362920,146/6971,86/1747,2019,Rhetorically Controlled Encoder-Decoder for Mo...
362926,146/6971,86/1747,2019,Reinforcement Knowledge Graph Reasoning for Ex...
362935,146/6971,86/1747,2019,OOGAN: Disentangling GAN with One-Hot Sampling...


In [22]:
# Attach a name to both endpoints of every row using pid_name_mapping.
# Series.map would quietly yield NaN for a PID without an entry, so check
# first and fail loudly, as a plain dictionary lookup would.
unknown_pids = (
    set(network_df['author_pid']) | set(network_df['coauthor_pid'])
) - set(pid_name_mapping)
if unknown_pids:
    raise KeyError("No name recorded for PID(s): " + ", ".join(sorted(map(str, unknown_pids))))

# One vectorized lookup per column instead of a Python lambda call per row;
# assign() returns a new frame rather than writing into a filtered slice.
network_df = network_df.assign(
    author_name=network_df['author_pid'].map(pid_name_mapping),
    coauthor_name=network_df['coauthor_pid'].map(pid_name_mapping),
)

In [34]:
# Build an order-independent key per row: the stringified author PID,
# coauthor PID, title and year are sorted before being joined, so the rows
# (A, B, title, year) and (B, A, title, year) produce the same key.
key_columns = ['author_pid', 'coauthor_pid', 'title', 'year']
# Iterating plain tuples avoids constructing a Series for every row, and a
# list of keys is also well-defined when the frame has no rows.
# assign() returns a new frame rather than writing into a filtered slice.
network_df = network_df.assign(
    check_duplicates=[
        ''.join(sorted(map(str, row_values)))
        for row_values in network_df[key_columns].itertuples(index=False, name=None)
    ]
)
network_df

Unnamed: 0,author_pid,coauthor_pid,year,title,author_name,coauthor_name,check_duplicates
2,75/9436,163/0545,2021,Decomposed Bounded Floats for Fast Compression...,Aaron J. Elmore,John Paparrizos,163/0545202175/9436Decomposed Bounded Floats f...
5,75/9436,163/0545,2021,VergeDB: A Database for IoT Analytics on Edge ...,Aaron J. Elmore,John Paparrizos,163/0545202175/9436VergeDB: A Database for IoT...
10,75/9436,f/MJFranklin,2021,VergeDB: A Database for IoT Analytics on Edge ...,Aaron J. Elmore,Michael J. Franklin,202175/9436VergeDB: A Database for IoT Analyti...
11,75/9436,147/1189,2021,VergeDB: A Database for IoT Analytics on Edge ...,Aaron J. Elmore,Sanjay Krishnan,147/1189202175/9436VergeDB: A Database for IoT...
13,75/9436,117/3757,2021,Version Reconciliation for Collaborative Datab...,Aaron J. Elmore,Zechao Shang,117/3757202175/9436Version Reconciliation for ...
...,...,...,...,...,...,...,...
362916,146/6971,w/YangWang15,2019,sharedCharging: Data-Driven Shared Charging fo...,Zuohui Fu,Yang Wang 0015,146/69712019sharedCharging: Data-Driven Shared...
362920,146/6971,86/1747,2019,Rhetorically Controlled Encoder-Decoder for Mo...,Zuohui Fu,Gerard de Melo,146/6971201986/1747Rhetorically Controlled Enc...
362926,146/6971,86/1747,2019,Reinforcement Knowledge Graph Reasoning for Ex...,Zuohui Fu,Gerard de Melo,146/6971201986/1747Reinforcement Knowledge Gra...
362935,146/6971,86/1747,2019,OOGAN: Disentangling GAN with One-Hot Sampling...,Zuohui Fu,Gerard de Melo,146/6971201986/1747OOGAN: Disentangling GAN wi...


In [36]:
# Keep the first row for each duplicate-check key, then discard the helper
# column. `columns=` is used instead of the positional axis argument
# (`drop(label, 1)`), which pandas deprecated and removed in 2.0.
network_df = (
    network_df
    .drop_duplicates(subset='check_duplicates')
    .drop(columns='check_duplicates')
)
network_df

Unnamed: 0,author_pid,coauthor_pid,year,title,author_name,coauthor_name
2,75/9436,163/0545,2021,Decomposed Bounded Floats for Fast Compression...,Aaron J. Elmore,John Paparrizos
5,75/9436,163/0545,2021,VergeDB: A Database for IoT Analytics on Edge ...,Aaron J. Elmore,John Paparrizos
10,75/9436,f/MJFranklin,2021,VergeDB: A Database for IoT Analytics on Edge ...,Aaron J. Elmore,Michael J. Franklin
11,75/9436,147/1189,2021,VergeDB: A Database for IoT Analytics on Edge ...,Aaron J. Elmore,Sanjay Krishnan
13,75/9436,117/3757,2021,Version Reconciliation for Collaborative Datab...,Aaron J. Elmore,Zechao Shang
...,...,...,...,...,...,...
358563,189/2405,20/3716,2019,Balance-Aware Distributed String Similarity-Ba...,Zeyuan Shang,Zhifeng Bao
358588,189/2405,20/3716,2018,DITA: Distributed In-Memory Trajectory Analytics.,Zeyuan Shang,Zhifeng Bao
358590,189/2405,20/3716,2018,DITA: A Distributed In-Memory Trajectory Analy...,Zeyuan Shang,Zhifeng Bao
358594,189/2405,20/3716,2017,Dima: A Distributed In-Memory Similarity-Based...,Zeyuan Shang,Zhifeng Bao


### 5. Save processed data

In [37]:
# Write the processed rows to the processed output CSV, without the index column
network_df.to_csv(path_or_buf=processed_output_filepath, index=False)