# Imports

In [9]:
import nltk
import requests
import numpy as np
import pandas as pd

# Activity 3

## 3.1 Sub-activity: Loading and pre-processing of text data

In [10]:
# Global variables

# URL of the Wikidata MediaWiki Action API that the request helpers query.
WIKIDATA_API_ENDPOINT = "https://www.wikidata.org/w/api.php"

# Default query parameters: JSON output, format version 2, and a site filter
# restricted to English Wikipedia ("enwiki").
PARAMS = dict(format="json", formatversion="2", sitefilter="enwiki")

### Task 1

In [11]:
def get_turing_award_recipients():
    """Return the Wikidata entity IDs of the Turing Award recipients.

    Searches Wikidata for entities carrying the statement ``P166=Q185667``
    (via the ``haswbstatement`` search keyword) and collects the title of
    every hit, which is the entity ID (e.g. ``"Q80"``).

    The search API returns at most ``srlimit`` hits per request, so the
    ``continue`` token of each response is followed until the result set is
    exhausted instead of silently truncating the list at 100 entries.

    Returns:
        list[str]: entity IDs in the order the search API returned them.

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srprop": "sectiontitle",
        "srsearch": "haswbstatement:P166=Q185667",
        "formatversion": "2",
        "srlimit": 100,
    }

    wikidata_IDs = []
    while True:
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(WIKIDATA_API_ENDPOINT, params=search_params, timeout=30)
        response.raise_for_status()
        data = response.json()

        wikidata_IDs.extend(hit["title"] for hit in data["query"]["search"])

        # The API adds a "continue" object while more results are available;
        # its fields must be sent back verbatim with the next request.
        if "continue" not in data:
            break
        search_params.update(data["continue"])

    return wikidata_IDs

In [12]:
# Fetch the recipient entity IDs once; the cells below print and iterate over them.
wikidata_IDs = get_turing_award_recipients()

### Task 2

In [13]:
# Show the fetched entity IDs as a quick sanity check.
print(wikidata_IDs)

['Q80', 'Q3572699', 'Q92894', 'Q17457', 'Q92612', 'Q92638', 'Q92743', 'Q92824', 'Q181529', 'Q204815', 'Q578036', 'Q92794', 'Q92739', 'Q49823', 'Q92602', 'Q3571662', 'Q92626', 'Q92758', 'Q16080922', 'Q62870', 'Q8556', 'Q92604', 'Q357965', 'Q11609', 'Q92609', 'Q439245', 'Q92670', 'Q92819', 'Q92851', 'Q92613', 'Q62874', 'Q92854', 'Q92628', 'Q7143512', 'Q62861', 'Q320624', 'Q45575', 'Q1107006', 'Q92614', 'Q62888', 'Q93080', 'Q476466', 'Q92820', 'Q92649', 'Q62898', 'Q92641', 'Q92742', 'Q93154', 'Q62843', 'Q92643', 'Q92823', 'Q462089', 'Q62866', 'Q92629', 'Q92618', 'Q92822', 'Q92596', 'Q92746', 'Q918650', 'Q62857', 'Q92619', 'Q92821', 'Q62877', 'Q92782', 'Q92632', 'Q93161', 'Q92744', 'Q92606', 'Q92781', 'Q9602', 'Q92625', 'Q62894', 'Q92644', 'Q92745', 'Q92828']


In [14]:
def get_wikipedia_content(wikidata_ID):
    """Return the plain-text content of the English Wikipedia page of an entity.

    The Wikidata entity is looked up first to obtain the title of its
    ``enwiki`` sitelink; that title is then used to request the page extract
    from the English Wikipedia API.

    Args:
        wikidata_ID (str): Wikidata entity ID, e.g. ``"Q80"``.

    Returns:
        str: the page extract as plain text (``explaintext``), with section
        headings in wiki format.

    Raises:
        KeyError: if the entity has no ``enwiki`` sitelink or the page
            response carries no extract.
        requests.HTTPError: if either API answers with an HTTP error status.
        requests.Timeout: if either API does not answer within the timeout.
    """
    wikipedia_API_endpoint = "https://en.wikipedia.org/w/api.php"

    wikidata_entity_params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidata_ID,
        "formatversion": "2",
        "sitefilter": "enwiki",
    }

    wikidata_response = requests.get(WIKIDATA_API_ENDPOINT, params=wikidata_entity_params, timeout=30)
    wikidata_response.raise_for_status()
    # Decode the body exactly once: calling .json() on the decoded dict raises AttributeError.
    wikidata_response_data = wikidata_response.json()

    # To extract content from the Wikipedia page we use the title obtained from
    # the Wikidata ID, since the titles of Wikipedia pages are unique.
    wikidata_title = wikidata_response_data['entities'][wikidata_ID]['sitelinks']['enwiki']['title']

    wikipedia_content_params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": wikidata_title,
        "formatversion": "2",
        "explaintext": 1,
        "exsectionformat": "wiki",
    }

    wikipedia_content_response = requests.get(wikipedia_API_endpoint, params=wikipedia_content_params, timeout=30)
    wikipedia_content_response.raise_for_status()
    wikipedia_content_data = wikipedia_content_response.json()

    # With formatversion 2 "pages" is a list; a single title was requested, so take the first entry.
    wikipedia_content = wikipedia_content_data['query']['pages'][0]['extract']
    return wikipedia_content


### Task 3

In [15]:
def _first_claim_value(claims, property_ID, field):
    """Return ``field`` of the first statement of ``property_ID``, or NaN if unavailable."""
    try:
        return claims[property_ID][0]['mainsnak']['datavalue']['value'][field]
    except (KeyError, IndexError, TypeError):
        return np.nan


def _claim_entity_IDs(claims, property_ID):
    """Return the entity IDs of all statements of ``property_ID``, or NaN if the property is absent."""
    try:
        statements = claims[property_ID]
    except (KeyError, TypeError):
        return np.nan

    entity_IDs = []
    for statement in statements:
        try:
            entity_IDs.append(statement['mainsnak']['datavalue']['value']['id'])
        except (KeyError, TypeError):
            # A statement without a concrete value has no 'datavalue' entry;
            # skip it instead of failing the whole lookup.
            continue
    return entity_IDs


def get_award_winners_info(wikidata_ID):
    """Fetch name, gender, birth data, employers and education of one entity.

    Requests the labels and claims of ``wikidata_ID`` from the Wikidata API
    and extracts the values below. Properties that are missing are reported
    as ``np.nan`` rather than raising.

    Args:
        wikidata_ID (str): Wikidata entity ID, e.g. ``"Q80"``.

    Returns:
        tuple: ``(name, gender_ID, birth_date, birth_place_ID, employer_IDs,
        educated_at_IDs)`` where

        * ``name`` is the English label,
        * ``gender_ID`` is the entity ID from "sex or gender (P21)",
        * ``birth_date`` is the time string from "date of birth (P569)",
        * ``birth_place_ID`` is the entity ID from "place of birth (P19)",
        * ``employer_IDs`` is the list of entity IDs from "employer (P108)",
        * ``educated_at_IDs`` is the list of entity IDs from P69.

    Raises:
        KeyError: if the entity is missing from the response or has no
            English label.
        requests.HTTPError: if the API answers with an HTTP error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    wikidata_title_params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidata_ID,
        "props": "labels|claims",
        "formatversion": "2",
        "sitefilter": "enwiki",
    }
    wikidata_title_response = requests.get(WIKIDATA_API_ENDPOINT, params=wikidata_title_params, timeout=30)
    wikidata_title_response.raise_for_status()
    entity = wikidata_title_response.json()['entities'][wikidata_ID]
    claims = entity.get('claims', {})

    # Extract name
    wikidata_name = entity['labels']['en']['value']

    # Extract gender ID to get gender from "sex or gender (P21)"
    wikidata_gender_id = _first_claim_value(claims, 'P21', 'id')

    # Get birth date from "date of birth (P569)"
    wikidata_birth_date = _first_claim_value(claims, 'P569', 'time')

    # Extract birth place ID to get birth place from "place of birth (P19)"
    wikidata_birth_place = _first_claim_value(claims, 'P19', 'id')

    # Extract employer IDs from "employer (P108)"; each ID sits under the "mainsnak" key
    wikidata_employers_IDs = _claim_entity_IDs(claims, 'P108')

    # Extract the IDs of the institutions listed under P69
    wikidata_educated_at_IDs = _claim_entity_IDs(claims, 'P69')

    return wikidata_name, wikidata_gender_id, wikidata_birth_date, wikidata_birth_place, wikidata_employers_IDs, wikidata_educated_at_IDs

# award_winners = {}

In [16]:
# Print the extracted details of every recipient, one tuple per line.
for wikidata_ID in wikidata_IDs:
    print(get_award_winners_info(wikidata_ID))

('Tim Berners-Lee', 'Q6581097', '+1955-06-08T00:00:00Z', 'Q84', ['Q37033', 'Q7432436', 'Q49108', 'Q7204713', 'Q42944', 'Q42944', 'Q7095994'], ['Q73094', 'Q5369138'])
('Yoshua Bengio', 'Q6581097', '+1964-03-05T00:00:00Z', 'Q90', ['Q392189'], ['Q201492', 'Q201492', 'Q201492', 'Q49108'])
('Geoffrey Hinton', 'Q6581097', '+1947-12-06T00:00:00Z', 'Q736742', ['Q180865', 'Q95', 'Q190080'], ['Q160302'])
('Donald Knuth', 'Q6581097', '+1938-01-10T00:00:00Z', 'Q37836', ['Q41506', 'Q256593', 'Q1665138', 'Q161562'], ['Q1047060', 'Q161562', 'Q6861763'])
('Richard M. Karp', 'Q6581097', '+1935-01-03T00:00:00Z', 'Q100', ['Q168756', 'Q219563'], ['Q13371', 'Q5676553', 'Q168756'])
('Robert Tarjan', 'Q6581097', '+1948-04-30T00:00:00Z', 'Q486868', ['Q21578', 'Q49108', 'Q49210', 'Q49115', 'Q168756', 'Q41506', 'Q217365', 'Q80978'], ['Q161562', 'Q41506'])
('Vint Cerf', 'Q6581097', '+1943-06-23T00:00:00Z', 'Q49145', ['Q95', 'Q37156', 'Q41506', 'Q207361'], ['Q41506', 'Q174710', 'Q174710', 'Q4008476'])
('Judea Pea

### Task 4

### Task 5

## 3.2 Sub-activity: Applying NLP operations on the corpus

### 3.2.1 Stemming

### Task 3

### Task 4

### 3.2.2 Lemmatization

### Task 5

### 3.2.3 Finding synonyms and antonyms

### Task 6

### 3.2.4 Bigrams and trigrams

### Task 7

### Task 8

### Task 9

### Task 10

## 3.3 Sub-section: Visualisation

### 3.3.1 Barplots

### Task 11

### Task 12

### Task 13

### 3.3.2 Heatmap

### Task 14