# Imports

In [111]:
import nltk
import json
import requests
import numpy as np
import pandas as pd

# Activity 3

## 3.1 Sub-activity: Loading and pre-processing of text data

In [112]:
# Global variables

# Base URL of the Wikidata API used by the request helpers in this notebook.
WIKIDATA_API_ENDPOINT = "https://www.wikidata.org/w/api.php"

# Generic request parameters.
# NOTE(review): not referenced by the functions visible in this section, which
# each build their own parameter dict -- confirm whether it is still needed.
PARAMS = dict(
    format="json",
    formatversion="2",
    sitefilter="enwiki",
)

### Task 1

In [113]:
def get_turing_award_recipients():
    """Return the Wikidata IDs of all items matching ``haswbstatement:P166=Q185667``.

    The search is run against ``WIKIDATA_API_ENDPOINT`` (presumably
    "award received (P166)" = Turing Award, per the function name). Result
    pages are followed through the API's ``continue`` block, so the list is
    complete even when there are more hits than fit in one response.

    Returns
    -------
    list of str
        The ``title`` of every search hit (e.g. ``"Q80"``), in response order.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    wikidata_ID_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srprop": "sectiontitle",
        "srsearch": "haswbstatement:P166=Q185667",
        "formatversion": "2",
        "srlimit": 100,
    }

    wikidata_IDs = []
    while True:
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        wikidata_ID_response = requests.get(
            WIKIDATA_API_ENDPOINT, params=wikidata_ID_params, timeout=30
        )
        wikidata_ID_response.raise_for_status()
        wikidata_ID_data = wikidata_ID_response.json()

        wikidata_IDs.extend(
            entity_ID["title"] for entity_ID in wikidata_ID_data["query"]["search"]
        )

        # The API adds a "continue" block while more results are available;
        # its values are sent back unchanged to request the next page.
        continuation = wikidata_ID_data.get("continue")
        if not continuation:
            break
        wikidata_ID_params.update(continuation)

    return wikidata_IDs

In [114]:
# Fetch the recipients' Wikidata IDs once; later cells reuse this list.
wikidata_IDs = get_turing_award_recipients()

### Task 2

In [115]:
# Show the retrieved Wikidata IDs.
print(wikidata_IDs)

['Q80', 'Q3572699', 'Q92894', 'Q17457', 'Q92612', 'Q92638', 'Q92743', 'Q92824', 'Q181529', 'Q204815', 'Q578036', 'Q92794', 'Q92739', 'Q49823', 'Q92602', 'Q3571662', 'Q92626', 'Q92758', 'Q16080922', 'Q62870', 'Q8556', 'Q92604', 'Q357965', 'Q11609', 'Q92609', 'Q439245', 'Q92670', 'Q92819', 'Q92851', 'Q92613', 'Q62874', 'Q92854', 'Q92628', 'Q7143512', 'Q62861', 'Q320624', 'Q45575', 'Q1107006', 'Q92614', 'Q62888', 'Q93080', 'Q476466', 'Q92820', 'Q92649', 'Q62898', 'Q92641', 'Q92742', 'Q93154', 'Q62843', 'Q92643', 'Q92823', 'Q462089', 'Q62866', 'Q92629', 'Q92618', 'Q92822', 'Q92596', 'Q92746', 'Q918650', 'Q62857', 'Q92619', 'Q92821', 'Q62877', 'Q92782', 'Q92632', 'Q93161', 'Q92744', 'Q92606', 'Q92781', 'Q9602', 'Q92625', 'Q62894', 'Q92644', 'Q92745', 'Q92828']


In [116]:
def get_wikipedia_content(wikidata_ID):
    """Return the plain-text extract of the English Wikipedia page of a Wikidata entity.

    The entity's ``enwiki`` sitelink title is looked up on Wikidata first and
    is then used to request the page extract from the English Wikipedia API.

    Parameters
    ----------
    wikidata_ID : str
        Wikidata entity ID, e.g. ``"Q80"``.

    Returns
    -------
    str
        The ``extract`` field of the first page in the Wikipedia response.

    Raises
    ------
    KeyError
        If the entity has no ``enwiki`` sitelink or the page carries no extract.
    requests.HTTPError
        If either API answers with an HTTP error status.
    """
    wikipedia_API_endpoint = "https://en.wikipedia.org/w/api.php"

    # Only the sitelinks are needed here, so do not download labels/claims as well.
    wikidata_sitelink_params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidata_ID,
        "props": "sitelinks",
        "formatversion": "2",
        "sitefilter": "enwiki",
    }

    wikidata_response = requests.get(
        WIKIDATA_API_ENDPOINT, params=wikidata_sitelink_params, timeout=30
    )
    wikidata_response.raise_for_status()
    wikidata_response_data = wikidata_response.json()

    # Wikipedia content is requested by page title, so translate the Wikidata ID
    # into the title of the linked English Wikipedia page (titles are unique there).
    wikidata_title = wikidata_response_data["entities"][wikidata_ID]["sitelinks"]["enwiki"]["title"]

    wikipedia_content_params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": wikidata_title,
        "formatversion": "2",
        "explaintext": 1,
        "exsectionformat": "wiki",
    }

    wikipedia_content_response = requests.get(
        wikipedia_API_endpoint, params=wikipedia_content_params, timeout=30
    )
    wikipedia_content_response.raise_for_status()
    wikipedia_content_data = wikipedia_content_response.json()

    return wikipedia_content_data["query"]["pages"][0]["extract"]


### Task 3

In [119]:
def _first_claim_entity_id(claims, property_id):
    """Return the entity ID of the first statement of ``property_id``, or None."""
    try:
        return claims[property_id][0]["mainsnak"]["datavalue"]["value"]["id"]
    except (KeyError, IndexError, TypeError):
        return None


def _all_claim_entity_ids(claims, property_id):
    """Return the entity IDs of every statement of ``property_id`` that carries one."""
    entity_ids = []
    for statement in claims.get(property_id, []):
        try:
            entity_ids.append(statement["mainsnak"]["datavalue"]["value"]["id"])
        except (KeyError, TypeError):
            # Statement without a concrete entity value: skip it instead of failing.
            continue
    return entity_ids


def _claim_birth_date(claims):
    """Return the first "date of birth (P569)" value as e.g. '1955-06-08', or None."""
    try:
        timestamp = claims["P569"][0]["mainsnak"]["datavalue"]["value"]["time"]
        # Keep what sits between the leading "+" and the "T" of the timestamp.
        return timestamp.split("T")[0].split("+")[1]
    except (KeyError, IndexError, TypeError, AttributeError):
        return None


def get_award_winners_info(wikidata_IDs):
    """Collect the English name and the birth date of each given Wikidata entity.

    One ``wbgetentities`` request (labels and claims) is sent per ID.

    Parameters
    ----------
    wikidata_IDs : iterable of str
        Wikidata entity IDs, e.g. ``["Q80", "Q92894"]``.

    Returns
    -------
    tuple of (list, list)
        ``(names, birth_dates)``. Both lists have one entry per input ID, in
        input order, so they stay aligned with each other; a value that cannot
        be read is stored as ``None``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    As a side effect the English labels of the gender (P21), birth place (P19),
    employers (P108) and P69 ("educated at", judging by the variable names)
    entities of the *last* processed ID are printed.
    NOTE(review): this looks like unfinished work -- the labels are neither
    stored nor returned and only the last winner is covered. Confirm whether
    the lookup should run per winner and feed extra return values.
    """
    wikidata_names = []
    wikidata_birth_dates = []
    # IDs referenced by the most recently processed winner, grouped per property
    # and joined with "|" so each group needs only one label request.
    entity_info_IDs = []

    # TODO: the introduction paragraph (get_wikipedia_content) is not collected yet.
    for wikidata_ID in wikidata_IDs:

        wikidata_title_params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": wikidata_ID,
            "props": "labels|claims",
            "formatversion": "2",
            "languages": "en",
            "sitefilter": "enwiki",
        }
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        wikidata_title_response = requests.get(
            WIKIDATA_API_ENDPOINT, params=wikidata_title_params, timeout=30
        )
        wikidata_title_response.raise_for_status()
        wikidata_title_data = wikidata_title_response.json()

        try:
            entity = wikidata_title_data["entities"][wikidata_ID]
        except (KeyError, TypeError):
            entity = {}
        claims = entity.get("claims")
        if not isinstance(claims, dict):
            claims = {}

        # Extract name; append even when missing so both result lists stay aligned.
        try:
            wikidata_name = entity["labels"]["en"]["value"]
        except (KeyError, TypeError):
            wikidata_name = None
        wikidata_names.append(wikidata_name)

        # Get birth date from "date of birth (P569)"
        wikidata_birth_dates.append(_claim_birth_date(claims))

        # Extract gender ID from "sex or gender (P21)"
        wikidata_gender_id = _first_claim_entity_id(claims, "P21")
        # Extract birth place ID from "place of birth (P19)"
        wikidata_birth_place = _first_claim_entity_id(claims, "P19")
        # Extract employer IDs from "employer (P108)"
        wikidata_employers_IDs = _all_claim_entity_ids(claims, "P108")
        wikidata_educated_at_IDs = _all_claim_entity_ids(claims, "P69")

        id_groups = [
            [wikidata_gender_id] if wikidata_gender_id else [],
            [wikidata_birth_place] if wikidata_birth_place else [],
            wikidata_employers_IDs,
            wikidata_educated_at_IDs,
        ]
        # Empty groups are dropped: there is nothing to look up for them.
        entity_info_IDs = ["|".join(id_group) for id_group in id_groups if id_group]

    for entity_info_ID in entity_info_IDs:
        wikidata_title_params_2 = {
            "action": "wbgetentities",
            "format": "json",
            "ids": entity_info_ID,
            "props": "labels",
            "formatversion": "2",
            "languages": "en",
            "sitefilter": "enwiki",
        }

        entity_info_response = requests.get(
            WIKIDATA_API_ENDPOINT, params=wikidata_title_params_2, timeout=30
        )
        entity_info_response.raise_for_status()
        entity_info_data = entity_info_response.json()

        for entity_ID in entity_info_ID.split("|"):
            try:
                entity_info = entity_info_data["entities"][entity_ID]["labels"]["en"]["value"]
            except (KeyError, TypeError):
                # No English label available for this entity.
                continue
            print(entity_info)

    return wikidata_names, wikidata_birth_dates

In [121]:
# Store the two lists returned by get_award_winners_info under their column names.
award_winners = dict(zip(("name", "birth_date"), get_award_winners_info(wikidata_IDs)))
print(award_winners)

{'name': ['Tim Berners-Lee', 'Yoshua Bengio', 'Geoffrey Hinton', 'Donald Knuth', 'Richard M. Karp', 'Robert Tarjan', 'Vint Cerf', 'Judea Pearl', 'Herbert Simon', 'Marvin Minsky', 'Ron Rivest', 'Jeffrey David Ullman', 'John McCarthy', 'Dana Scott', 'Tony Hoare', 'Yann LeCun', 'Manuel Blum', 'Michael Stonebraker', 'Barbara Liskov', 'Stephen Cook', 'Edsger W. Dijkstra', 'Niklaus Wirth', 'Michael O. Rabin', 'Shafrira Goldwasser', 'Fred Brooks', 'Allen Newell', 'Jack Dongarra', 'Edmund M. Clarke', 'David A. Patterson', 'Leslie Lamport', 'John Edward Hopcroft', 'John L. Hennessy', 'Juris Hartmanis', 'Pat Hanrahan', 'Alan Perlis', 'Adi Shamir', 'Dennis M. Ritchie', 'Ken Thompson', 'Douglas Engelbart', 'Andrew Yao', 'Silvio Micali', 'Martin Edward Hellman', 'Raj Reddy', 'Amir Pnueli', 'Alfred Aho', 'Robert W. Floyd', 'Alan Kay', 'Leslie Valiant', 'Bob Kahn', 'Robin Milner', 'Edward Feigenbaum', 'Whitfield Diffie', 'Ivan Sutherland', 'Kenneth E. Iverson', 'Peter Naur', 'Richard E. Stearns', 'Ed

### Task 4

In [134]:
# Entries without a name (None) can be neither sorted against strings nor joined,
# so they are left out of the listing.
print(
    "The names of all award winners are (alphabetical order): \n\n{}.".format(
        ", ".join(sorted(name for name in award_winners["name"] if name))
    )
)

The names of all award winners are (alphabetical order): 

Adi Shamir, Alan Kay, Alan Perlis, Alfred Aho, Allen Newell, Amir Pnueli, Andrew Yao, Barbara Liskov, Bob Kahn, Butler Lampson, Charles Bachman, Charles P. Thacker, Dana Scott, David A. Patterson, Dennis M. Ritchie, Donald Knuth, Douglas Engelbart, E. Allen Emerson, Edgar F. Codd, Edmund M. Clarke, Edsger W. Dijkstra, Edward Feigenbaum, Edwin Catmull, Fernando J. Corbató, Frances E. Allen, Fred Brooks, Geoffrey Hinton, Herbert Simon, Iosif Sifakis, Ivan Sutherland, Jack Dongarra, James H. Wilkinson, Jeffrey David Ullman, Jim Gray, John Backus, John Cocke, John Edward Hopcroft, John L. Hennessy, John McCarthy, Judea Pearl, Juris Hartmanis, Ken Thompson, Kenneth E. Iverson, Kristen Nygaard, Leonard Adleman, Leslie Lamport, Leslie Valiant, Manuel Blum, Martin Edward Hellman, Marvin Minsky, Maurice Wilkes, Michael O. Rabin, Michael Stonebraker, Niklaus Wirth, Ole-Johan Dahl, Pat Hanrahan, Peter Naur, Raj Reddy, Richard E. Stearns, 

### Task 5

In [136]:
# Build the frame from the collected lists and append placeholder columns,
# each initialised to NaN.
award_winners_intro = pd.DataFrame(award_winners).assign(
    winner_name=np.nan,
    count_words=np.nan,
    count_sentences=np.nan,
    count_paragraphs=np.nan,
    common_words=np.nan,
)

print(award_winners_intro)

                   name  birth_date  winner_name  count_words  \
0       Tim Berners-Lee  1955-06-08          NaN          NaN   
1         Yoshua Bengio  1964-03-05          NaN          NaN   
2       Geoffrey Hinton  1947-12-06          NaN          NaN   
3          Donald Knuth  1938-01-10          NaN          NaN   
4       Richard M. Karp  1935-01-03          NaN          NaN   
..                  ...         ...          ...          ...   
70  Fernando J. Corbató  1926-07-01          NaN          NaN   
71      Charles Bachman  1924-12-11          NaN          NaN   
72       Butler Lampson  1943-12-23          NaN          NaN   
73       Ole-Johan Dahl  1931-10-12          NaN          NaN   
74   Charles P. Thacker  1943-02-26          NaN          NaN   

    count_sentences  count_paragraphs  common_words  
0               NaN               NaN           NaN  
1               NaN               NaN           NaN  
2               NaN               NaN           NaN  
3  

## 3.2 Sub-activity: Applying NLP operations on the corpus

### 3.2.1 Stemming

### Task 3

### Task 4

### 3.2.2 Lemmatization

### Task 5

### 3.2.3 Finding synonyms and antonyms

### Task 6

### 3.2.4 Bigrams and trigrams

### Task 7

### Task 8

### Task 9

### Task 10

## 3.3 Sub-activity: Visualisation

### 3.3.1 Barplots

### Task 11

### Task 12

### Task 13

### 3.3.2 Heatmap

### Task 14