# Imports

In [446]:
import re
import nltk
import json
import requests
import numpy as np
import pandas as pd
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize


# Activity 3

## 3.1 Sub-activity: Loading and pre-processing of text data

In [447]:
# Global variables

# URL of the Wikidata API; every Wikidata request in this notebook is sent here.
WIKIDATA_API_ENDPOINT = "https://www.wikidata.org/w/api.php"

# Query-string defaults (JSON output, format version 2, English Wikipedia
# sitelinks only). NOTE(review): not referenced by the cells visible in this
# notebook -- the request functions build their own parameter dicts.
PARAMS = dict(
    format="json",
    formatversion="2",
    sitefilter="enwiki",
)

### Task 1

In [448]:
def get_turing_award_recipients(timeout=30):
    """Return the Wikidata entity IDs of the Turing Award recipients.

    Searches Wikidata for items matching ``haswbstatement:P166=Q185667`` and
    collects the ``title`` of every search hit (for Wikidata items the title
    is the entity ID, e.g. ``"Q80"``).

    The search is requested in pages of 100 hits. Whenever the response
    carries a ``continue`` object, its values are merged into the next
    request, so hits beyond the first page are not silently dropped.

    Args:
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        list: Entity IDs in the order the API returned them.
    """
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srprop": "sectiontitle",
        "srsearch": "haswbstatement:P166=Q185667",
        "formatversion": "2",
        "srlimit": 100,
    }

    wikidata_IDs = []
    continuation = {}
    while True:
        # The continuation values (empty on the first request) tell the API
        # where the previous page stopped.
        response = requests.get(
            WIKIDATA_API_ENDPOINT,
            params={**search_params, **continuation},
            timeout=timeout,
        )
        data = response.json()
        wikidata_IDs.extend(hit["title"] for hit in data["query"]["search"])

        continuation = data.get("continue")
        if not continuation:
            break

    return wikidata_IDs

In [449]:
# Fetch the recipients' Wikidata IDs once; later cells print and iterate over this list.
wikidata_IDs = get_turing_award_recipients()

### Task 2

In [450]:
# Show the Wikidata entity IDs returned by the search.
print(wikidata_IDs)

['Q80', 'Q3572699', 'Q92894', 'Q17457', 'Q92612', 'Q92638', 'Q92743', 'Q92824', 'Q181529', 'Q204815', 'Q578036', 'Q92794', 'Q92739', 'Q49823', 'Q92602', 'Q3571662', 'Q92626', 'Q92758', 'Q16080922', 'Q62870', 'Q8556', 'Q92604', 'Q357965', 'Q11609', 'Q92609', 'Q439245', 'Q92670', 'Q92819', 'Q92851', 'Q92613', 'Q62874', 'Q92854', 'Q92628', 'Q7143512', 'Q62861', 'Q320624', 'Q45575', 'Q1107006', 'Q92614', 'Q62888', 'Q93080', 'Q476466', 'Q92820', 'Q92649', 'Q62898', 'Q92641', 'Q92742', 'Q93154', 'Q62843', 'Q92643', 'Q92823', 'Q462089', 'Q62866', 'Q92629', 'Q92618', 'Q92822', 'Q92596', 'Q92746', 'Q918650', 'Q62857', 'Q92619', 'Q92821', 'Q62877', 'Q92782', 'Q92632', 'Q93161', 'Q92744', 'Q92606', 'Q92781', 'Q9602', 'Q92625', 'Q62894', 'Q92644', 'Q92745', 'Q92828']


In [451]:
def get_wikipedia_content(wikidata_ID, timeout=30):
    """Return the English Wikipedia extract for a Wikidata entity.

    Two requests are made: the first asks Wikidata for the entity's ``enwiki``
    sitelink to learn the page title, the second asks the English Wikipedia
    API for that page's ``extracts`` property.

    Args:
        wikidata_ID: Wikidata entity ID, e.g. ``"Q80"``.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        The ``extract`` value of the first page in the Wikipedia response.

    Raises:
        KeyError: If the entity has no ``enwiki`` sitelink, or the Wikipedia
            response contains no extract for the page.
    """
    wikipedia_API_endpoint = "https://en.wikipedia.org/w/api.php"

    sitelink_params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidata_ID,
        # Only the sitelink is read below, so do not download the whole entity.
        "props": "sitelinks",
        "formatversion": "2",
        "sitefilter": "enwiki",
    }

    wikidata_response = requests.get(
        WIKIDATA_API_ENDPOINT, params=sitelink_params, timeout=timeout
    )
    wikidata_response_data = wikidata_response.json()

    # The Wikipedia API is queried by page title, so the Wikidata ID is first
    # translated into the title of the linked English Wikipedia page.
    wikidata_title = wikidata_response_data["entities"][wikidata_ID]["sitelinks"]["enwiki"]["title"]

    wikipedia_content_params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": wikidata_title,
        "formatversion": "2",
        "exsectionformat": "wiki",
    }

    wikipedia_content_response = requests.get(
        wikipedia_API_endpoint, params=wikipedia_content_params, timeout=timeout
    )
    wikipedia_content_data = wikipedia_content_response.json()
    return wikipedia_content_data["query"]["pages"][0]["extract"]


In [452]:
# Q92743 = get_wikipedia_content("Q92743")
# Q80 = get_wikipedia_content("Q80")

In [453]:
# def get_paragraphs(content):
#     cleaned_content = re.sub(r"\\|\n","", content)
#     content_with_p_tag = re.sub(r"<\/?(?!p)\w*\b[^>]*>", "", cleaned_content.split("<h2>")[0])
#     paragraphs = re.findall(r'<p>(.+?)</p>', content_with_p_tag)
#     paragraphs = " \n".join(paragraphs)
#     return paragraphs

In [454]:
# for wikidata_ID in wikidata_IDs:
#     content = get_wikipedia_content(wikidata_ID)
#     paragraphs = get_paragraphs(content)
#     print(paragraphs)
#     print("lol")

### Task 3

In [466]:
def _dig(container, *path):
    """Follow ``path`` through nested dicts/lists; return None if any step is missing."""
    for step in path:
        try:
            container = container[step]
        except (KeyError, IndexError, TypeError):
            return None
    return container


def _claim_entity_ids(claims, property_ID, first_only=False):
    """Return the entity IDs that the ``property_ID`` statements in ``claims`` point to."""
    statements = claims.get(property_ID, [])
    if first_only:
        statements = statements[:1]

    entity_IDs = []
    for statement in statements:
        # Statements without a datavalue carry no entity ID and are skipped.
        entity_ID = _dig(statement, "mainsnak", "datavalue", "value", "id")
        if entity_ID is not None:
            entity_IDs.append(entity_ID)
    return entity_IDs


def _fetch_english_labels(entity_IDs, timeout, batch_size=50):
    """Map Wikidata entity IDs to their English labels (IDs without one are omitted)."""
    unique_IDs = list(dict.fromkeys(entity_IDs))
    labels = {}
    # Nothing is requested when there are no IDs; larger sets go out in batches.
    for start in range(0, len(unique_IDs), batch_size):
        batch = unique_IDs[start:start + batch_size]
        label_params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": "|".join(batch),
            "props": "labels",
            "formatversion": "2",
            "languages": "en",
            "sitefilter": "enwiki",
        }
        response = requests.get(WIKIDATA_API_ENDPOINT, params=label_params, timeout=timeout)
        data = response.json()
        for entity_ID in batch:
            label = _dig(data, "entities", entity_ID, "labels", "en", "value")
            if label is not None:
                labels[entity_ID] = label
    return labels


def _extract_intro(wikipedia_content):
    """Return the text of the ``<p>`` paragraphs that precede the first ``<h2>``."""
    cleaned_content = re.sub(r"\\|\n","", wikipedia_content)
    content_with_p_tag = re.sub(r"<\/?(?!p)\w*\b[^>]*>", "", cleaned_content.split("<h2>")[0])
    paragraphs = re.findall(r'<p>(.+?)</p>', content_with_p_tag)
    return " \n".join(paragraphs)


def get_award_winners_info(wikidata_ID, timeout=30):
    """Collect name, intro and biographical facts for one award winner.

    Reads the entity's claims and sitelinks from Wikidata, the intro from
    ``get_wikipedia_content``, and resolves the referenced entity IDs
    (gender, birth place, employers, schools) to English labels with a
    single batched label lookup.

    A missing piece of information never raises: scalar fields become
    ``None`` and label lists become ``[]``. (Previously a winner without a
    P108 or P69 claim crashed with ``TypeError`` on ``"|".join(None)``.)

    Args:
        wikidata_ID: Wikidata entity ID of the winner, e.g. ``"Q80"``.
        timeout: Seconds to wait for each Wikidata HTTP response made here.

    Returns:
        tuple: ``(name, intro, birth_date, gender, birth_place, employer,
        educated_at)``.

        * ``name``: title of the ``enwiki`` sitelink, or ``None``.
        * ``intro``: paragraphs before the first ``<h2>`` of the Wikipedia
          extract joined with ``" \\n"``, or ``None`` when
          ``get_wikipedia_content`` raises ``KeyError``.
        * ``birth_date``: date part of the first P569 time value without the
          leading ``+``, or ``None``.
        * ``gender`` / ``birth_place``: list with the English label of the
          first P21 / P19 statement (empty when absent).
        * ``employer`` / ``educated_at``: list with one English label per
          P108 / P69 statement that references an entity. An entry is
          ``None`` when the referenced entity has no English label.
    """
    wikidata_params = {
        "action":"wbgetentities",
        "format":"json",
        "ids": wikidata_ID,
        "props": "claims|sitelinks",
        "formatversion": "2",
        "languages": "en",
        "sitefilter": "enwiki"
    }

    wikidata_response = requests.get(WIKIDATA_API_ENDPOINT, params=wikidata_params, timeout=timeout)
    entity = _dig(wikidata_response.json(), "entities", wikidata_ID)
    if not isinstance(entity, dict):
        entity = {}

    # Be defensive about the shape: only a dict of claims can be looked up by property.
    claims = entity.get("claims")
    if not isinstance(claims, dict):
        claims = {}

    # Extract name
    wikidata_name = _dig(entity, "sitelinks", "enwiki", "title")

    # Extract intro from wikipedia page
    try:
        wikipedia_content = get_wikipedia_content(wikidata_ID)
    except KeyError:
        wikipedia_intro = None
    else:
        wikipedia_intro = _extract_intro(wikipedia_content)

    # Get birth date from "date of birth (P569)"; the time looks like "+1955-06-08T00:00:00Z".
    birth_time = _dig(claims, "P569", 0, "mainsnak", "datavalue", "value", "time")
    wikidata_birth_date = birth_time.split("T")[0].lstrip("+") if birth_time else None

    # Entity IDs per output column, in the order they are returned:
    # "sex or gender (P21)", "place of birth (P19)", "employer (P108)", "educated at (P69)".
    ID_groups = [
        _claim_entity_ids(claims, "P21", first_only=True),
        _claim_entity_ids(claims, "P19", first_only=True),
        _claim_entity_ids(claims, "P108"),
        _claim_entity_ids(claims, "P69"),
    ]

    # One batched lookup resolves every referenced ID instead of one request per column.
    labels = _fetch_english_labels(
        [entity_ID for group in ID_groups for entity_ID in group], timeout
    )
    gender, birth_place, employer, educated_at = (
        [labels.get(entity_ID) for entity_ID in group] for group in ID_groups
    )

    return wikidata_name, wikipedia_intro, wikidata_birth_date, gender, birth_place, employer, educated_at


In [467]:
# Column-oriented store: one list per attribute, all filled in parallel so the
# i-th entry of every list describes the same winner.
award_winners = {
    column: []
    for column in (
        "name",
        "intro",
        "birth_date",
        "gender",
        "birth_place",
        "employer",
        "educated_at",
    )
}

for wikidata_ID in wikidata_IDs:
    (
        wikidata_name,
        wikipedia_intro,
        wikidata_birth_date,
        wikidata_gender,
        wikidata_birth_place,
        wikidata_employer,
        wikidata_educated_at,
    ) = get_award_winners_info(wikidata_ID)

    winner_row = {
        "name": wikidata_name,
        "intro": wikipedia_intro,
        "birth_date": wikidata_birth_date,
        "gender": wikidata_gender,
        "birth_place": wikidata_birth_place,
        "employer": wikidata_employer,
        "educated_at": wikidata_educated_at,
    }
    # Append each value to the list of its column.
    for column_name, column_value in winner_row.items():
        award_winners[column_name].append(column_value)

KeyboardInterrupt: 

In [None]:
# Build a DataFrame from the collected lists (one column per key of
# `award_winners`) and display it as the cell output.
a = pd.DataFrame(award_winners)
a

Unnamed: 0,name,intro,birth_date,gender,birth_place,employer,educated_at
0,Tim Berners-Lee,Sir Timothy John Berners-Lee (born 8 June 195...,1955-06-08,[male],[London],"[World Wide Web Consortium, School of Electron...","[The Queen's College, Emanuel School]"
1,Yoshua Bengio,"Yoshua Bengio (born March 5, 1964) is a Canad...",1964-03-05,[male],[Paris],[Université de Montréal],"[McGill University, McGill University, McGill ..."
2,Geoffrey Hinton,Geoffrey Everest Hinton (born 6 December 1947...,1947-12-06,[male],[Wimbledon],"[University of Toronto, Google, Carnegie Mello...",[University of Edinburgh]
3,Donald Knuth,Donald Ervin Knuth ( kə-NOOTH; born January 10...,1938-01-10,[male],[Milwaukee],"[Stanford University, Burroughs Corporation, I...","[Case Western Reserve University, California I..."
4,Richard M. Karp,"Richard Manning Karp (born January 3, 1935) is...",1935-01-03,[male],[Boston],"[University of California, Berkeley, Universit...","[Harvard University, Harvard School of Enginee..."
...,...,...,...,...,...,...,...
70,Fernando J. Corbató,"Fernando José ""Corby"" Corbató (July 1, 1926 – ...",1926-07-01,[male],[Oakland],[Massachusetts Institute of Technology],"[California Institute of Technology, Massachus..."
71,Charles Bachman,"Charles William Bachman III (December 11, 1924...",1924-12-11,[male],[Manhattan],"[Dow Chemical Company, General Electric, Honey...","[Michigan State University, University of Penn..."
72,Butler Lampson,"Butler W. Lampson, ForMemRS, (born December 23...",1943-12-23,[male],"[Washington, D.C.]","[PARC, Massachusetts Institute of Technology, ...","[Harvard University, University of California,..."
73,Ole-Johan Dahl,Ole-Johan Dahl (12 October 1931 – 29 June 2002...,1931-10-12,[male],[Mandal],[University of Oslo],[University of Oslo]


### Task 4

In [457]:
# A winner without an English Wikipedia title is stored as None; leave those
# out so that sorting never has to compare None with a string.
sorted_winner_names = sorted(name for name in award_winners["name"] if name is not None)
print("The names of all award winners are (alphabetical order): \n\n{}.".format(", ".join(sorted_winner_names)))

The names of all award winners are (alphabetical order): 

Adi Shamir, Alan Kay, Alan Perlis, Alfred Aho, Allen Newell, Amir Pnueli, Andrew Yao, Barbara Liskov, Bob Kahn, Butler Lampson, Charles Bachman, Charles P. Thacker, Dana Scott, David Patterson (computer scientist), Dennis Ritchie, Donald Knuth, Douglas Engelbart, E. Allen Emerson, Edgar F. Codd, Edmund M. Clarke, Edsger W. Dijkstra, Edward Feigenbaum, Edwin Catmull, Fernando J. Corbató, Frances Allen, Fred Brooks, Geoffrey Hinton, Herbert A. Simon, Ivan Sutherland, Jack Dongarra, James H. Wilkinson, Jeffrey Ullman, Jim Gray (computer scientist), John Backus, John Cocke (computer scientist), John Hopcroft, John L. Hennessy, John McCarthy (computer scientist), Joseph Sifakis, Judea Pearl, Juris Hartmanis, Ken Thompson, Kenneth E. Iverson, Kristen Nygaard, Leonard Adleman, Leslie Lamport, Leslie Valiant, Manuel Blum, Martin Hellman, Marvin Minsky, Maurice Wilkes, Michael O. Rabin, Michael Stonebraker, Niklaus Wirth, Ole-Johan Dahl

### Task 5

#### (a)

In [458]:
# Start from the intro texts and add the remaining columns as NaN placeholders;
# they are filled in by the next cell.
award_winners_intro = pd.DataFrame(award_winners["intro"], columns = ["intro"])
for placeholder_column in (
    "winner_name",
    "count_words",
    "count_sentences",
    "count_paragraphs",
    "common_words",
):
    award_winners_intro[placeholder_column] = np.nan

#### (b)

In [None]:
# An intro is None when no Wikipedia extract could be retrieved for a winner.
# Treat those as empty text so the tokenizers below do not raise on them.
intro_text = award_winners_intro["intro"].fillna("")

award_winners_intro["winner_name"] = award_winners["name"]
award_winners_intro["count_words"] = intro_text.apply(lambda text: len(word_tokenize(text)))
award_winners_intro["count_sentences"] = intro_text.apply(lambda text: len(sent_tokenize(text)))
# Paragraphs are separated by newlines in the intro; an empty intro has none.
award_winners_intro["count_paragraphs"] = intro_text.apply(
    lambda text: len(text.split("\n")) if text else 0
)
# Ten most frequent whitespace-separated tokens; only the words are kept, not their counts.
award_winners_intro["common_words"] = intro_text.apply(
    lambda text: [word for word, _count in FreqDist(text.split()).most_common(10)]
)

In [443]:
# Scratch check: render (word, count) pairs as "(word, count), (word, count)".
a = [("a", 2), ("a", 2)]
formatted_pairs = ["({}, {})".format(word, count) for word, count in a]
print(", ".join(formatted_pairs))

(a, 2), (a, 2)


In [461]:
# Display the intro table with the columns computed above.
award_winners_intro

Unnamed: 0,intro,winner_name,count_words,count_sentences,count_paragraphs,common_words
0,Sir Timothy John Berners-Lee (born 8 June 195...,Tim Berners-Lee,359,17,4,"(the, 36), (of, 21), (and, 14), (He, 11), (a, ..."
1,"Yoshua Bengio (born March 5, 1964) is a Canad...",Yoshua Bengio,91,4,2,"(and, 6), (the, 5), (of, 4), (for, 3), (Bengio..."
2,Geoffrey Everest Hinton (born 6 December 1947...,Geoffrey Hinton,181,8,3,"(the, 12), (and, 8), (of, 7), (for, 5), (in, 5..."
3,Donald Ervin Knuth ( kə-NOOTH; born January 10...,Donald Knuth,184,8,3,"(the, 20), (of, 13), (and, 11), (Knuth, 5), (c..."
4,"Richard Manning Karp (born January 3, 1935) is...",Richard M. Karp,92,3,2,"(in, 6), (and, 5), (the, 5), (of, 5), (for, 3)..."
...,...,...,...,...,...,...
70,"Fernando José ""Corby"" Corbató (July 1, 1926 – ...",Fernando J. Corbató,28,1,1,"(a, 2), (Fernando, 1), (José, 1), (""Corby"", 1)..."
71,"Charles William Bachman III (December 11, 1924...",Charles Bachman,56,2,1,"(his, 3), (Bachman, 2), (was, 2), (an, 2), (in..."
72,"Butler W. Lampson, ForMemRS, (born December 23...",Butler Lampson,27,1,1,"(Butler, 1), (W., 1), (Lampson,, 1), (ForMemRS..."
73,Ole-Johan Dahl (12 October 1931 – 29 June 2002...,Ole-Johan Dahl,44,2,1,"(of, 4), (Dahl, 2), (was, 2), (a, 2), (compute..."


In [445]:
# Inspect the row of a single winner.
is_joseph_sifakis = award_winners_intro["winner_name"] == "Joseph Sifakis"
award_winners_intro.loc[is_joseph_sifakis]

Unnamed: 0,intro,winner_name,count_words,count_sentences,count_paragraphs,common_words
68,Joseph Sifakis (Greek: Ιωσήφ Σηφάκης) is a Gre...,Joseph Sifakis,31,2,1,"[Joseph, Sifakis, (Greek:, Ιωσήφ, Σηφάκης), is..."


#### (c)

## 3.2 Sub-activity: Applying NLP operations on the corpus

### 3.2.1 Stemming

### Task 3

### Task 4

### 3.2.2 Lemmatization

### Task 5

### 3.2.3 Finding synonyms and antonyms

### Task 6

### 3.2.4 Bigrams and trigrams

### Task 7

### Task 8

### Task 9

### Task 10

## 3.3 Sub-section: Visualisation

### 3.3.1 Barplots

### Task 11

### Task 12

### Task 13

### 3.3.2 Heatmap

### Task 14