## Crawling all NTU Researchers
- Just in case information from other faculties is required in later stages, all 1441 researchers from ntu were crawled with scrapy spider
- Using a crawler is also much faster
- output saved to researchers.json


In [1]:
import scrapy
import re
class CrawlResearchersSpider(scrapy.Spider):
    name = 'crawl_researchers'
    allowed_domains = ['dr.ntu.edu.sg']
    
    # start_urls = ["https://dr.ntu.edu.sg/simple-search?query=&location=researcherprofiles&crisID=&relationName=&sort_by=score&order=desc&rpp=50&etal=0&start=0"]
    start_urls = ["https://dr.ntu.edu.sg/simple-search?location=researcherprofiles"]
    
    def parse(self, response):
    
        # parse the all names and emails, then go to each profile
        domain = "https://dr.ntu.edu.sg"
        researcher_rows = response.css("table.table.table-hover td")
        names = researcher_rows.css("td[headers=t1] a::text").getall()
        emails = researcher_rows.css("td[headers=t3]::text").getall()
        endpoints = researcher_rows.css("td a::attr(href)").getall()
        for i, endpoint in enumerate(endpoints):
            dr_ntu_profile_url = domain+endpoint
            meta = {
                "full_name":names[i],
                "email":emails[i],
                "dr_ntu_url": dr_ntu_profile_url
            }
            yield response.follow(dr_ntu_profile_url, callback = self.parse_profile, meta = meta)
            
        # check if there's next page then go to next page
        pagination = response.css("ul.pagination.pull-right")
        has_next = "next" in pagination.css("li a[href] ::text").getall()
        if has_next:
            next_endpoint = pagination.css("li a::attr(href)").getall()[-1]
            next_link = domain+next_endpoint
            yield response.follow(next_link, callback=self.parse)
        
    # parse the schools and personal sites, then save following data: full_name, email, dr_ntu_url, schools, personal_site
    def parse_profile(self, response):
        output = {
            "full_name": response.meta["full_name"],
            "email":response.meta["email"],
            "dr_ntu_url": response.meta["dr_ntu_url"]
        }
        info = response.css("div[id=researcherInfo]")
        schools = [self.clean_school(string) for string in info.css("div::text").getall() if re.search('[a-zA-Z]', string)]
        output["schools"] = schools
        output["personal_site"] = info.css("div[id=personalsiteDiv] a::attr(href)").get()
        yield output
        
        
    # remove extra spaces, \n and \t
    def clean_school(self, string):
        string = re.sub(r"[\n\t]*", "", string)
        string = re.sub(' +', ' ', string)
        return string.strip()

## Extracting SCSE Researchers
- Used fuzzy string matching to acquire 82 scse researchers from 1441 ntu researchers
- although a simple solution could already suffice here, this proposed solution will be much more robust when the data becomes very dirty

In [2]:
import json
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4125 - Developing Data Products\Assignments\Individual Assignment\Part 1\researchers\researchers.json"
with open(path) as f:
    data = json.load(f)
    data = pd.DataFrame(data)

In [5]:
# this scorer combines multiple scorers from fuzzwuzzy st "School of Computer Science and Engineering" can be matched with
# 1. "John, School  of Computer    Science   and  Engineering"
# 2. "John, School  of of Computer    Science   and  Engineering (SCSE)"
# 3. "John, School  of Computer    Engineering and Computer Science"
# Benefits: 
# minimal processing is required while still maintaining performance. 
# The problem with applying to much processing is that there are a lot of edge cases to account for, which is not feasible when the data size is very large
# this method is generalizable to other schools like "School of Physical & Mathematical Sciences" or even other strings
# will be advantageous when the data starts to become very dirty 
# Note: Actually a simple "Computer Science" in s2 will suffice in this problem because the data is still quite clean
def score(s1, s2):
    scores = [
        fuzz.token_sort_ratio(s1, s2), 
        fuzz.token_set_ratio(s1, s2), 
        fuzz.partial_ratio(s1, s2)
    ]
    return max(scores)

# return True if target_string is found in choices using the scorer
# Given choices ["Associate Chair (Faculty), School of Computer Science and Engineering (SCSE)", 'Associate Chair (Faculty), CCEB', 'Professor, School of Chemistry, Chemical Engineering and Biotechnology'
# "School of Computer Science and Engineering" will output True 
def fuzzy_in_list(target_string, choices, scorer = score, score_cutoff = 100):
    # this function by default, does the essential processing e.g leading, trailing spaces, non-letters, cases
    result = process.extractOne(target_string, choices, scorer = scorer, score_cutoff = score_cutoff)
    return result is not None

In [6]:
# indeed there are 82 researchers in SCSE (shown in website)
target_school = "School of Computer Science and Engineering"
in_scse = data.schools.apply(lambda choices: fuzzy_in_list(target_school, choices))
scse = data[in_scse]
scse

Unnamed: 0,full_name,email,dr_ntu_url,schools,personal_site
19,Lin Guosheng,gslin@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00483,"[Assistant Professor, School of Computer Scien...",https://guosheng.github.io/
22,Yu Han,han.yu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00334,"[Nanyang Assistant Professor, School of Comput...",http://hanyu.sg
26,Liu Weichen,liu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00280,"[Nanyang Assistant Professor, School of Comput...",https://personal.ntu.edu.sg/liu/
41,Lam Siew Kei,assklam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00901,"[Associate Professor, School of Computer Scien...",https://personal.ntu.edu.sg/assklam/
46,Loke Yuan Ren,yrloke@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00585,"[Lecturer, School of Computer Science and Engi...",https://personal.ntu.edu.sg/yrloke/
...,...,...,...,...,...
1346,Long Cheng,c.long@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00772,"[Assistant Professor, School of Computer Scien...",https://personal.ntu.edu.sg/c.long
1368,Luo Siqiang （骆思强）,siqiang.luo@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01311,"[Assistant Professor, School of Computer Scien...",https://siqiangluo.wixsite.com/homepage
1391,Tang Xueyan,asxytang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00075,"[Associate Professor, School of Computer Scien...",https://personal.ntu.edu.sg/asxytang/
1407,Erik Cambria,cambria@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00927,"[Associate Professor, School of Computer Scien...",https://sentic.net


In [7]:
# import pickle
# path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4125 - Developing Data Products\Assignments\Individual Assignment\Part 1\researchers_names.pkl"
# with open(path, "wb") as f:
#     pickle.dump(data[in_scse].full_name.tolist(), f)

In [8]:
scse.personal_site[scse.personal_site.apply(lambda site: site is None)] = pd.NA
names = scse.full_name.tolist()

## dblp links
- subsequent sections no longer used crawler because it seems like the sites restricted too much crawling, thus i had to settle with beautifulsoup which is much slower
- steps:
    - perform search with the name
        - if there are no results try another variation of the name
    - if there is one result only, i assume its correct and return
    - if there are multiple
        - return the first one that matches "Nanyang Technological University"
        - if no match return the first result

In [9]:
import requests
import bs4
from bs4 import BeautifulSoup
from tqdm import tqdm

In [10]:
def get_dblp_link(name):
    soup  = get_soup(name)
    authors = soup.find("div", {"id":"completesearch-authors"}).find_all("li", {"itemtype":"http://schema.org/Person"})
    if len(authors) == 0:
        # if no results clean the name, try again
        soup  = get_soup(ugly_clean(name))
        authors = soup.find("div", {"id":"completesearch-authors"}).find_all("li", {"itemtype":"http://schema.org/Person"})
    # save the first link as the most likely solution
    if len(authors) >= 1:
        first_link = authors[0].find("a")["href"]
    # if there's only one option, return link
    if len(authors) == 1:
        return first_link
    # if there are multiple, check if its ntu, then return link 
    for a in authors:
        description = a.find("small")
        if description is not None and "Nanyang Technological University" in description.text:
            link = a.find("a")["href"]
            return link
    # if nothing fits, return first link
    return first_link

def get_soup(name):
    url = get_dblp_query_url(name)
    r = requests.get(url)
    soup  = BeautifulSoup(r.text, "html.parser")
    return soup
def get_dblp_query_url(name):
    return "https://dblp.org/search?q="+"%20".join(name.strip().split())
def ugly_clean(name):
    if "," in name:
        return name.split(",")[0]
    words = name.split()
    words[1] = words[1][0]
    return " ".join(words)

In [11]:
dblp_links = []
for name in tqdm(names):
    print(name)
    link = get_dblp_link(name)
    dblp_links.append(link)

  0%|                                                                                           | 0/82 [00:00<?, ?it/s]

Lin Guosheng


  1%|█                                                                                  | 1/82 [00:00<01:08,  1.18it/s]

Yu Han


  2%|██                                                                                 | 2/82 [00:02<01:24,  1.06s/it]

Liu Weichen


  4%|███                                                                                | 3/82 [00:05<02:33,  1.94s/it]

Lam Siew Kei


  5%|████                                                                               | 4/82 [00:06<02:03,  1.58s/it]

Loke Yuan Ren


  6%|█████                                                                              | 5/82 [00:06<01:37,  1.26s/it]

Lam Kwok Yan


  7%|██████                                                                             | 6/82 [00:08<01:35,  1.26s/it]

Guan Cuntai


  9%|███████                                                                            | 7/82 [00:09<01:39,  1.33s/it]

Anwitaman Datta


 10%|████████                                                                           | 8/82 [00:10<01:28,  1.20s/it]

Lin Weisi


 11%|█████████                                                                          | 9/82 [00:11<01:22,  1.13s/it]

Liu Ziwei


 12%|██████████                                                                        | 10/82 [00:12<01:19,  1.11s/it]

Chee Wei Tan


 13%|███████████                                                                       | 11/82 [00:13<01:26,  1.22s/it]

Fan Xiuyi


 15%|████████████                                                                      | 12/82 [00:14<01:18,  1.12s/it]

Anupam Chattopadhyay


 16%|█████████████                                                                     | 13/82 [00:15<01:15,  1.09s/it]

Zhao Jun


 17%|██████████████                                                                    | 14/82 [00:16<01:11,  1.06s/it]

Quek Hiok Chai


 18%|███████████████                                                                   | 15/82 [00:18<01:14,  1.11s/it]

Sun Aixin


 20%|████████████████                                                                  | 16/82 [00:19<01:13,  1.11s/it]

A S Madhukumar


 21%|█████████████████                                                                 | 17/82 [00:20<01:25,  1.32s/it]

Deepu Rajan


 22%|██████████████████                                                                | 18/82 [00:21<01:17,  1.21s/it]

Zheng Jianmin


 23%|███████████████████                                                               | 19/82 [00:22<01:12,  1.14s/it]

Lin Shang-Wei


 24%|████████████████████                                                              | 20/82 [00:23<01:08,  1.10s/it]

Wentong Cai


 26%|█████████████████████                                                             | 21/82 [00:24<01:03,  1.04s/it]

Li Yi


 27%|██████████████████████                                                            | 22/82 [00:25<01:05,  1.08s/it]

Chen Change Loy


 28%|███████████████████████                                                           | 23/82 [00:26<01:02,  1.05s/it]

Cham Tat Jen


 29%|████████████████████████                                                          | 24/82 [00:27<00:59,  1.03s/it]

Kwoh Chee Keong


 30%|█████████████████████████                                                         | 25/82 [00:28<00:59,  1.04s/it]

Smitha K G


 32%|██████████████████████████                                                        | 26/82 [00:29<00:54,  1.03it/s]

Tan Rui


 33%|███████████████████████████                                                       | 27/82 [00:30<00:53,  1.03it/s]

Cong Gao


 34%|████████████████████████████                                                      | 28/82 [00:31<00:53,  1.00it/s]

Seah Hock Soon


 35%|█████████████████████████████                                                     | 29/82 [00:32<00:52,  1.02it/s]

Luo Jun


 37%|██████████████████████████████                                                    | 30/82 [00:33<00:50,  1.02it/s]

Shen Zhiqi


 38%|███████████████████████████████                                                   | 31/82 [00:34<00:49,  1.02it/s]

He Ying


 39%|████████████████████████████████                                                  | 32/82 [00:35<00:48,  1.03it/s]

Sourav Saha Bhowmick


 40%|█████████████████████████████████                                                 | 33/82 [00:37<00:56,  1.16s/it]

Alexei Sourin


 41%|██████████████████████████████████                                                | 34/82 [00:38<00:52,  1.09s/it]

Miao Chun Yan


 43%|███████████████████████████████████                                               | 35/82 [00:39<00:49,  1.05s/it]

Yeo Chai Kiat


 44%|████████████████████████████████████                                              | 36/82 [00:40<00:46,  1.02s/it]

Chng Eng Siong


 45%|█████████████████████████████████████                                             | 37/82 [00:41<00:44,  1.01it/s]

Douglas Leslie Maskell


 46%|██████████████████████████████████████                                            | 38/82 [00:42<00:44,  1.02s/it]

Lin Feng


 48%|███████████████████████████████████████                                           | 39/82 [00:43<00:45,  1.06s/it]

Huang Shell Ying


 49%|████████████████████████████████████████                                          | 40/82 [00:44<00:45,  1.09s/it]

Arvind Easwaran


 50%|█████████████████████████████████████████                                         | 41/82 [00:45<00:42,  1.03s/it]

Chan Syin


 51%|██████████████████████████████████████████                                        | 42/82 [00:46<00:39,  1.02it/s]

Zhang Hanwang


 52%|███████████████████████████████████████████                                       | 43/82 [00:47<00:39,  1.02s/it]

Ong Yew Soon


 54%|████████████████████████████████████████████                                      | 44/82 [00:48<00:37,  1.02it/s]

Li Fang


 55%|█████████████████████████████████████████████                                     | 45/82 [00:49<00:40,  1.08s/it]

Tay Kian Boon


 56%|██████████████████████████████████████████████                                    | 46/82 [00:51<00:44,  1.24s/it]

Vanessa Evers


 57%|██████████████████████████████████████████████▉                                   | 47/82 [00:52<00:40,  1.16s/it]

Zhang Tianwei


 59%|████████████████████████████████████████████████                                  | 48/82 [00:53<00:37,  1.10s/it]

Li Mo


 60%|█████████████████████████████████████████████████                                 | 49/82 [00:54<00:36,  1.11s/it]

Ke Yiping, Kelly


 61%|██████████████████████████████████████████████████                                | 50/82 [00:56<00:43,  1.35s/it]

Zhang Jie


 62%|███████████████████████████████████████████████████                               | 51/82 [00:57<00:38,  1.25s/it]

Wen Yonggang


 63%|████████████████████████████████████████████████████                              | 52/82 [00:58<00:39,  1.30s/it]

Chia Liang Tien


 65%|█████████████████████████████████████████████████████                             | 53/82 [00:59<00:34,  1.20s/it]

Dusit Niyato


 66%|██████████████████████████████████████████████████████                            | 54/82 [01:00<00:33,  1.20s/it]

Lau Chiew Tong


 67%|███████████████████████████████████████████████████████                           | 55/82 [01:02<00:33,  1.23s/it]

Goh Wooi Boon


 68%|████████████████████████████████████████████████████████                          | 56/82 [01:02<00:29,  1.13s/it]

Hui Siu Cheung


 70%|████████████████████████████████████████████████████████▉                         | 57/82 [01:03<00:27,  1.09s/it]

Thambipillai Srikanthan


 71%|██████████████████████████████████████████████████████████                        | 58/82 [01:04<00:25,  1.05s/it]

Wee Keong NG


 72%|███████████████████████████████████████████████████████████                       | 59/82 [01:05<00:23,  1.04s/it]

Jagath Chandana Rajapakse


 73%|████████████████████████████████████████████████████████████                      | 60/82 [01:07<00:26,  1.21s/it]

Bo An


 74%|█████████████████████████████████████████████████████████████                     | 61/82 [01:08<00:24,  1.15s/it]

Pan, Sinno Jialin


 76%|██████████████████████████████████████████████████████████████                    | 62/82 [01:09<00:23,  1.16s/it]

Mohamed M. Sabry


 77%|███████████████████████████████████████████████████████████████                   | 63/82 [01:11<00:26,  1.42s/it]

Joty Shafiq Rayhan


 78%|████████████████████████████████████████████████████████████████                  | 64/82 [01:12<00:22,  1.26s/it]

Lu Shijian


 79%|█████████████████████████████████████████████████████████████████                 | 65/82 [01:13<00:20,  1.18s/it]

Oh Hong Lye


 80%|██████████████████████████████████████████████████████████████████                | 66/82 [01:14<00:16,  1.01s/it]

Owen Noel Newton Fernando


 82%|███████████████████████████████████████████████████████████████████               | 67/82 [01:15<00:15,  1.06s/it]

Lana Obraztsova


 83%|████████████████████████████████████████████████████████████████████              | 68/82 [01:17<00:17,  1.24s/it]

Zinovi Rabinovich


 84%|█████████████████████████████████████████████████████████████████████             | 69/82 [01:18<00:16,  1.28s/it]

Luke Ong （翁之昊）


 85%|██████████████████████████████████████████████████████████████████████            | 70/82 [01:20<00:16,  1.37s/it]

Vidya Sudarshan


 87%|███████████████████████████████████████████████████████████████████████           | 71/82 [01:21<00:14,  1.29s/it]

Luu Anh Tuan


 88%|████████████████████████████████████████████████████████████████████████          | 72/82 [01:23<00:14,  1.48s/it]

Li Boyang


 89%|█████████████████████████████████████████████████████████████████████████         | 73/82 [01:26<00:18,  2.01s/it]

Qian Kemao


 90%|██████████████████████████████████████████████████████████████████████████        | 74/82 [01:27<00:14,  1.81s/it]

Vun Chan Hua, Nicholas


 91%|███████████████████████████████████████████████████████████████████████████       | 75/82 [01:28<00:10,  1.50s/it]

Lee Bu Sung


 93%|████████████████████████████████████████████████████████████████████████████      | 76/82 [01:29<00:08,  1.34s/it]

Wai Kin Adams Kong


 94%|█████████████████████████████████████████████████████████████████████████████     | 77/82 [01:30<00:06,  1.21s/it]

Long Cheng


 95%|██████████████████████████████████████████████████████████████████████████████    | 78/82 [01:31<00:04,  1.14s/it]

Luo Siqiang （骆思强）


 96%|███████████████████████████████████████████████████████████████████████████████   | 79/82 [01:32<00:03,  1.11s/it]

Tang Xueyan


 98%|████████████████████████████████████████████████████████████████████████████████  | 80/82 [01:33<00:02,  1.07s/it]

Erik Cambria


 99%|█████████████████████████████████████████████████████████████████████████████████ | 81/82 [01:34<00:01,  1.25s/it]

Liu Yang


100%|██████████████████████████████████████████████████████████████████████████████████| 82/82 [01:35<00:00,  1.17s/it]


In [12]:
scse["dblp_links"] = dblp_links

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scse["dblp_links"] = dblp_links


In [14]:
scse

Unnamed: 0,full_name,email,dr_ntu_url,schools,personal_site,dblp_links
19,Lin Guosheng,gslin@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00483,"[Assistant Professor, School of Computer Scien...",https://guosheng.github.io/,https://dblp.org/pid/126/4778
22,Yu Han,han.yu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00334,"[Nanyang Assistant Professor, School of Comput...",http://hanyu.sg,https://dblp.org/pid/35/1096-1
26,Liu Weichen,liu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00280,"[Nanyang Assistant Professor, School of Comput...",https://personal.ntu.edu.sg/liu/,https://dblp.org/pid/24/914
41,Lam Siew Kei,assklam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00901,"[Associate Professor, School of Computer Scien...",https://personal.ntu.edu.sg/assklam/,https://dblp.org/pid/74/1907
46,Loke Yuan Ren,yrloke@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00585,"[Lecturer, School of Computer Science and Engi...",https://personal.ntu.edu.sg/yrloke/,https://dblp.org/pid/11/9550
...,...,...,...,...,...,...
1346,Long Cheng,c.long@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00772,"[Assistant Professor, School of Computer Scien...",https://personal.ntu.edu.sg/c.long,https://dblp.org/pid/49/225
1368,Luo Siqiang （骆思强）,siqiang.luo@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01311,"[Assistant Professor, School of Computer Scien...",https://siqiangluo.wixsite.com/homepage,https://dblp.org/pid/117/5965
1391,Tang Xueyan,asxytang@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00075,"[Associate Professor, School of Computer Scien...",https://personal.ntu.edu.sg/asxytang/,https://dblp.org/pid/23/2460
1407,Erik Cambria,cambria@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00927,"[Associate Professor, School of Computer Scien...",https://sentic.net,https://dblp.org/pid/80/7421


## Extract number of citations by each scse researcher
- this section also doesn't use crawler because of crawling restriction
- Num citations are extracted from google scholar e.g https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=Erik+Cambria+ntu&btnG=
- steps are similar to the previous section

In [16]:
def get_citations(name):
    soup  = get_soup(name)
    authors = soup.find("div",{"id":"gsc_sa_ccl"}).find_all("div",{"class":"gsc_1usr"})
    if len(authors) == 0:
        # if no results clean the name, try again
        soup  = get_soup(ugly_clean(name))
        authors = soup.find("div",{"id":"gsc_sa_ccl"}).find_all("div",{"class":"gsc_1usr"})
        
    # save the first as the most likely solution
    first_cited_by = None
    if len(authors) >= 1:
        first_cited_by = authors[0].find("div",{"class":"gs_ai_cby"})
        first_cited_by = first_cited_by.text if first_cited_by is not None else ""
    # if there's only one option, return 
    if len(authors) == 1:
        return first_cited_by
    # if there are multiple, check if its ntu, then return  
    for a in authors:
        school = a.find("div",{"class":"gs_ai_aff"})
        school = school.text if school is not None else ""
        verified_at = a.find("div",{"class":"gs_ai_eml"})
        verified_at = verified_at.text if verified_at is not None else ""
        if "Nanyang Technological University" in school or "ntu.edu.sg" in verified_at:
            cited_by = a.find("div",{"class":"gs_ai_cby"})
            cited_by = cited_by.text if cited_by is not None else None
            return cited_by
    # if nothing fits, return first 
    return first_cited_by if first_cited_by is not None else None

def get_soup(name):
    url = get_google_scholar_url(name)
    r = requests.get(url)
    soup  = BeautifulSoup(r.text, "html.parser")
    return soup
def get_google_scholar_url(name, additional_keywords = ["ntu"]):
    keywords = name.strip().split()+additional_keywords
    return "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors="+"+".join(keywords)

In [17]:
import re
citations = []
for name in tqdm(names):
    name = re.sub(r'[^a-zA-Z ]', '', name)
    print(name)
    c = get_citations(name)
    citations.append(c)

  0%|                                                                                           | 0/82 [00:00<?, ?it/s]

Lin Guosheng


  1%|█                                                                                  | 1/82 [00:00<00:53,  1.52it/s]

Yu Han


  2%|██                                                                                 | 2/82 [00:03<02:29,  1.87s/it]

Liu Weichen


  4%|███                                                                                | 3/82 [00:06<03:21,  2.56s/it]

Lam Siew Kei


  5%|████                                                                               | 4/82 [00:08<02:51,  2.20s/it]

Loke Yuan Ren


  6%|█████                                                                              | 5/82 [00:11<03:15,  2.53s/it]

Lam Kwok Yan


  7%|██████                                                                             | 6/82 [00:13<02:49,  2.24s/it]

Guan Cuntai


  9%|███████                                                                            | 7/82 [00:14<02:33,  2.04s/it]

Anwitaman Datta


 10%|████████                                                                           | 8/82 [00:16<02:21,  1.92s/it]

Lin Weisi


 11%|█████████                                                                          | 9/82 [00:18<02:13,  1.84s/it]

Liu Ziwei


 12%|██████████                                                                        | 10/82 [00:19<02:07,  1.77s/it]

Chee Wei Tan


 13%|███████████                                                                       | 11/82 [00:20<01:38,  1.39s/it]

Fan Xiuyi


 15%|████████████                                                                      | 12/82 [00:21<01:43,  1.47s/it]

Anupam Chattopadhyay


 16%|█████████████                                                                     | 13/82 [00:23<01:45,  1.53s/it]

Zhao Jun


 17%|██████████████                                                                    | 14/82 [00:26<02:20,  2.06s/it]

Quek Hiok Chai


 18%|███████████████                                                                   | 15/82 [00:30<02:43,  2.43s/it]

Sun Aixin


 20%|████████████████                                                                  | 16/82 [00:31<02:25,  2.20s/it]

A S Madhukumar


 21%|█████████████████                                                                 | 17/82 [00:33<02:12,  2.03s/it]

Deepu Rajan


 22%|██████████████████                                                                | 18/82 [00:35<02:02,  1.92s/it]

Zheng Jianmin


 23%|███████████████████                                                               | 19/82 [00:37<02:06,  2.01s/it]

Lin ShangWei


 24%|████████████████████                                                              | 20/82 [00:37<01:35,  1.53s/it]

Wentong Cai


 26%|█████████████████████                                                             | 21/82 [00:40<01:46,  1.75s/it]

Li Yi


 27%|██████████████████████                                                            | 22/82 [00:42<01:54,  1.90s/it]

Chen Change Loy


 28%|███████████████████████                                                           | 23/82 [00:43<01:47,  1.83s/it]

Cham Tat Jen


 29%|████████████████████████                                                          | 24/82 [00:45<01:42,  1.77s/it]

Kwoh Chee Keong


 30%|█████████████████████████                                                         | 25/82 [00:47<01:38,  1.74s/it]

Smitha K G


 32%|██████████████████████████                                                        | 26/82 [00:50<01:59,  2.13s/it]

Tan Rui


 33%|███████████████████████████                                                       | 27/82 [00:51<01:49,  1.98s/it]

Cong Gao


 34%|████████████████████████████                                                      | 28/82 [00:55<02:16,  2.53s/it]

Seah Hock Soon


 35%|█████████████████████████████                                                     | 29/82 [00:57<02:08,  2.42s/it]

Luo Jun


 37%|██████████████████████████████                                                    | 30/82 [00:59<01:55,  2.22s/it]

Shen Zhiqi


 38%|███████████████████████████████                                                   | 31/82 [01:01<01:44,  2.05s/it]

He Ying


 39%|████████████████████████████████                                                  | 32/82 [01:03<01:42,  2.04s/it]

Sourav Saha Bhowmick


 40%|█████████████████████████████████                                                 | 33/82 [01:09<02:33,  3.14s/it]

Alexei Sourin


 41%|██████████████████████████████████                                                | 34/82 [01:10<02:09,  2.69s/it]

Miao Chun Yan


 43%|███████████████████████████████████                                               | 35/82 [01:12<01:51,  2.38s/it]

Yeo Chai Kiat


 44%|████████████████████████████████████                                              | 36/82 [01:13<01:39,  2.15s/it]

Chng Eng Siong


 45%|█████████████████████████████████████                                             | 37/82 [01:15<01:29,  2.00s/it]

Douglas Leslie Maskell


 46%|██████████████████████████████████████                                            | 38/82 [01:18<01:45,  2.40s/it]

Lin Feng


 48%|███████████████████████████████████████                                           | 39/82 [01:21<01:45,  2.46s/it]

Huang Shell Ying


 49%|████████████████████████████████████████                                          | 40/82 [01:23<01:33,  2.22s/it]

Arvind Easwaran


 50%|█████████████████████████████████████████                                         | 41/82 [01:24<01:23,  2.05s/it]

Chan Syin


 51%|██████████████████████████████████████████                                        | 42/82 [01:30<02:05,  3.13s/it]

Zhang Hanwang


 52%|███████████████████████████████████████████                                       | 43/82 [01:32<01:51,  2.86s/it]

Ong Yew Soon


 54%|████████████████████████████████████████████                                      | 44/82 [01:34<01:34,  2.50s/it]

Li Fang


 55%|█████████████████████████████████████████████                                     | 45/82 [01:35<01:14,  2.01s/it]

Tay Kian Boon


 56%|██████████████████████████████████████████████                                    | 46/82 [01:38<01:26,  2.39s/it]

Vanessa Evers


 57%|██████████████████████████████████████████████▉                                   | 47/82 [01:42<01:41,  2.89s/it]

Zhang Tianwei


 59%|████████████████████████████████████████████████                                  | 48/82 [01:44<01:25,  2.53s/it]

Li Mo


 60%|█████████████████████████████████████████████████                                 | 49/82 [01:46<01:16,  2.33s/it]

Ke Yiping Kelly


 61%|██████████████████████████████████████████████████                                | 50/82 [01:50<01:31,  2.85s/it]

Zhang Jie


 62%|███████████████████████████████████████████████████                               | 51/82 [01:52<01:22,  2.66s/it]

Wen Yonggang


 63%|████████████████████████████████████████████████████                              | 52/82 [01:54<01:16,  2.55s/it]

Chia Liang Tien


 65%|█████████████████████████████████████████████████████                             | 53/82 [01:56<01:06,  2.28s/it]

Dusit Niyato


 66%|██████████████████████████████████████████████████████                            | 54/82 [01:58<00:58,  2.09s/it]

Lau Chiew Tong


 67%|███████████████████████████████████████████████████████                           | 55/82 [02:01<01:06,  2.45s/it]

Goh Wooi Boon


 68%|████████████████████████████████████████████████████████                          | 56/82 [02:02<00:57,  2.21s/it]

Hui Siu Cheung


 70%|████████████████████████████████████████████████████████▉                         | 57/82 [02:04<00:51,  2.04s/it]

Thambipillai Srikanthan


 71%|██████████████████████████████████████████████████████████                        | 58/82 [02:07<00:58,  2.42s/it]

Wee Keong NG


 72%|███████████████████████████████████████████████████████████                       | 59/82 [02:12<01:10,  3.05s/it]

Jagath Chandana Rajapakse


 73%|████████████████████████████████████████████████████████████                      | 60/82 [02:15<01:08,  3.12s/it]

Bo An


 74%|█████████████████████████████████████████████████████████████                     | 61/82 [02:16<00:50,  2.40s/it]

Pan Sinno Jialin


 76%|██████████████████████████████████████████████████████████████                    | 62/82 [02:18<00:43,  2.18s/it]

Mohamed M Sabry


 77%|███████████████████████████████████████████████████████████████                   | 63/82 [02:21<00:46,  2.46s/it]

Joty Shafiq Rayhan


 78%|████████████████████████████████████████████████████████████████                  | 64/82 [02:24<00:50,  2.78s/it]

Lu Shijian


 79%|█████████████████████████████████████████████████████████████████                 | 65/82 [02:26<00:42,  2.49s/it]

Oh Hong Lye


 80%|██████████████████████████████████████████████████████████████████                | 66/82 [02:28<00:35,  2.24s/it]

Owen Noel Newton Fernando


 82%|███████████████████████████████████████████████████████████████████               | 67/82 [02:29<00:30,  2.06s/it]

Lana Obraztsova


 83%|████████████████████████████████████████████████████████████████████              | 68/82 [02:32<00:29,  2.09s/it]

Zinovi Rabinovich


 84%|█████████████████████████████████████████████████████████████████████             | 69/82 [02:33<00:25,  1.96s/it]

Luke Ong 


 85%|██████████████████████████████████████████████████████████████████████            | 70/82 [02:37<00:28,  2.40s/it]

Vidya Sudarshan


 87%|███████████████████████████████████████████████████████████████████████           | 71/82 [02:38<00:23,  2.18s/it]

Luu Anh Tuan


 88%|████████████████████████████████████████████████████████████████████████          | 72/82 [02:39<00:17,  1.72s/it]

Li Boyang


 89%|█████████████████████████████████████████████████████████████████████████         | 73/82 [02:40<00:12,  1.43s/it]

Qian Kemao


 90%|██████████████████████████████████████████████████████████████████████████        | 74/82 [02:41<00:11,  1.49s/it]

Vun Chan Hua Nicholas


 91%|███████████████████████████████████████████████████████████████████████████       | 75/82 [02:45<00:15,  2.18s/it]

Lee Bu Sung


 93%|████████████████████████████████████████████████████████████████████████████      | 76/82 [02:48<00:13,  2.33s/it]

Wai Kin Adams Kong


 94%|█████████████████████████████████████████████████████████████████████████████     | 77/82 [02:52<00:13,  2.77s/it]

Long Cheng


 95%|██████████████████████████████████████████████████████████████████████████████    | 78/82 [02:55<00:12,  3.10s/it]

Luo Siqiang 


 96%|███████████████████████████████████████████████████████████████████████████████   | 79/82 [02:57<00:07,  2.67s/it]

Tang Xueyan


 98%|████████████████████████████████████████████████████████████████████████████████  | 80/82 [03:03<00:07,  3.73s/it]

Erik Cambria


 99%|█████████████████████████████████████████████████████████████████████████████████ | 81/82 [03:05<00:03,  3.10s/it]

Liu Yang


100%|██████████████████████████████████████████████████████████████████████████████████| 82/82 [03:08<00:00,  2.30s/it]


In [43]:
def clean_citations(string):
    if string is None or string == "":
        return pd.NA
    return int(string.split()[-1])

In [44]:
scse['num_citations'] = list(map(clean_citations, citations))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scse['num_citations'] = list(map(clean_citations, citations))


## output
- sort by emails

In [51]:
cols = ["full_name","email","dr_ntu_url","personal_site", "dblp_links", "num_citations"]
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4125 - Developing Data Products\Assignments\Individual Assignment\solution.csv"
scse = scse.sort_values("email")
scse[cols]

Unnamed: 0,full_name,email,dr_ntu_url,personal_site,dblp_links,num_citations
1342,Wai Kin Adams Kong,adamskong@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00834,,https://dblp.org/pid/16/3792,
1240,Luu Anh Tuan,anhtuan.luu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01296,https://tuanluu.github.io/,https://dblp.org/pid/81/8329,2936
217,Anupam Chattopadhyay,anupam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01076,https://scholar.google.co.in/citations?user=TI...,https://dblp.org/pid/99/4535,4875
76,Anwitaman Datta,anwitaman@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00706,https://personal.ntu.edu.sg/anwitaman/,https://dblp.org/pid/d/AnwitamanDatta,7669
754,Arvind Easwaran,arvinde@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00687,https://cps-research-group.github.io/,https://dblp.org/pid/73/1708,2506
...,...,...,...,...,...,...
46,Loke Yuan Ren,yrloke@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00585,https://personal.ntu.edu.sg/yrloke/,https://dblp.org/pid/11/9550,13
857,Zhang Jie,zhangj@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00759,https://personal.ntu.edu.sg/zhangj/,https://dblp.org/pid/84/6889-2,10287
1225,Zinovi Rabinovich,zinovi@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00214,zinovi.net,https://dblp.org/pid/93/4009,941
119,Liu Ziwei,ziwei.liu@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00203,https://liuziwei7.github.io/,https://dblp.org/pid/05/6300-2,18681


In [53]:
scse[cols].to_csv(path, index = False)

## Other attempts:
- attempts at crawling dblp url and citations had very fast executions but debugging is slower and eventually there were site restrictions so BeautfulSoup was used instead

In [None]:
import scrapy
import pickle
import time

    

class CrawlDblpResearchersSpider(scrapy.Spider):
    name = 'crawl_dblp_researchers'
    allowed_domains = ['dblp.org']
    start_urls = ["https://scholar.google.com/schhp?hl=en"]
    
    # path to raw researcher names 
    path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4125 - Developing Data Products\Assignments\Individual Assignment\Part 1\researchers_names.pkl"
    
    
    def parse(self, response):
        # for each researcher name make a request
        names = self.get_researcher_names(CrawlDblpResearchersSpider.path)
        for name in names:
            link = self.get_dblp_link(name)
            meta = {
                "full_name":name,
                "link":link
            }
            print(meta["link"])
            yield response.follow(link, callback = self.parse_dblp, meta = meta)
            time.sleep(0.5)
            
    def parse_dblp(self, response):
        output = {
            "full_name":response.meta["full_name"],
            "link":response.meta["link"]
        }
        authors = response.css("div[id=completesearch-authors] ul.result-list")
        if len(authors)==1:
            dblp_link = authors.css("a::attr(href)").get()
            output["dblp_link"] = dblp_link
            output["multiple_results"] = False
            yield output
        else:
            for author in authors:
                description = " ".join(author.css("small::text").getall())
                if "Nanyang Technological University" in description:
                    dblp_link = author.css("a::attr(href)").get()
                    output["dblp_link"] = dblp_link
                    output["multiple_results"] = True
                    yield output
       
    def get_researcher_names(self, path):
        with open(path, "rb") as f:
            names = pickle.load(f)   
        return names
    def get_dblp_link(self, name):
        return "https://dblp.org/search?q="+"%20".join(name.strip().split())



In [None]:
import scrapy
import pickle


class CrawlResearcherCitationsSpider(scrapy.Spider):
    name = 'crawl_researcher_citations'
    allowed_domains = ['scholar.google.com']
    start_urls = ['http://scholar.google.com/']

    # path to raw researcher names 
    path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4125 - Developing Data Products\Assignments\Individual Assignment\Part 1\researchers_names.pkl"
    
    def parse(self, response):
        # for each researcher name make a request
        names = self.get_researcher_names(CrawlResearcherCitationsSpider.path)
        for name in names:
            link = self.get_google_scholar_link(name, additional_keywords = ["ntu"])
            meta = {
                "full_name":name,
                "link":link
            }
            
            yield response.follow(link, callback = self.parse_google_scholar, meta = meta)
    
    # 
    def parse_google_scholar(self, response):
        output = {
            "full_name":response.meta["full_name"],
            "link":response.meta["link"]
        }
        users = response.css("div[class=gsc_1usr]")
       
        for user in users:
            verified_at = user.css("div.gs_ai_eml span::text").get()
            organization = user.css("div.gs_ai_aff *::text").get()
            if verified_at=="ntu" or "Nanyang Technological University" in organization or "Singapore" in organization or "NTU" in organization:
                cited_by = user.css("div.gs_ai_cby::text").get()
                output["organization"] = organization
                output["verified_at"] = verified_at
                output["cited_by"] = cited_by
                yield output
                
                
                
    def get_researcher_names(self, path):
        with open(path, "rb") as f:
            names = pickle.load(f)   
        return names
    # add an addition "ntu" for greater accuracy
    def get_google_scholar_link(self, name, additional_keywords = []):
        keywords = name.strip().split()+additional_keywords 
        return "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors="+"+".join(keywords)

