# Imports

In [1131]:
import re
import requests
import numpy as np
import pandas as pd
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer


# Activity 3

## 3.1 Sub-activity: Loading and pre-processing of text data

In [1132]:
# Global variables
WIKIDATA_API_ENDPOINT = "https://www.wikidata.org/w/api.php"

WIKIDATA_COMMON_PARAMS = {
    "ids":"",
    "props": "",
    "format":"json",
    "languages": "en",
    "formatversion": "2",
    "sitefilter": "enwiki",
    "action": "wbgetentities"
}

### Task 1

In [1133]:
def get_turing_award_recipients():

    wikidata_entity_params = {
        "srlimit":100,
        "format":"json",
        "list":"search",
        "action":"query",
        "formatversion": "2",
        "srprop":"sectiontitle",
        "srsearch": "haswbstatement:P166=Q185667",
    }
    
    wikidata_ID_response = requests.get(WIKIDATA_API_ENDPOINT, params = wikidata_entity_params)
    wikidata_ID_data = wikidata_ID_response.json()    
    wikidata_IDs = [entity_ID["title"] for entity_ID in wikidata_ID_data["query"]["search"]]

    return wikidata_IDs

In [1134]:
wikidata_IDs = get_turing_award_recipients()

### Task 2

In [1135]:
def get_wikipedia_content(wikidata_ID):
    wikipedia_API_endpoint = "https://en.wikipedia.org/w/api.php"
    
    WIKIDATA_COMMON_PARAMS["ids"] = wikidata_ID
    WIKIDATA_COMMON_PARAMS["props"] = "sitelinks"
    
    wikidata_response = requests.get(WIKIDATA_API_ENDPOINT, params = WIKIDATA_COMMON_PARAMS)
    wikidata_response_data = wikidata_response.json()

    # To extract content from the wikipidia page, we have to use titles gained the wikidata IDs, since the titles of wikipedia pages are unique.
    wikidata_title = wikidata_response_data["entities"][wikidata_ID]["sitelinks"]["enwiki"]["title"]    

    wikipedia_params = {
        "titles": "",
        "format": "json",
        "action": "query",
        "prop": "extracts",
        "formatversion": "2",
        "titles": wikidata_title,
        "exsectionformat": "wiki"
    }
    
    wikipedia_content_response = requests.get(wikipedia_API_endpoint, params = wikipedia_params)
    wikipedia_content_data = wikipedia_content_response.json()
    wikipedia_content = wikipedia_content_data["query"]["pages"][0]["extract"]
    return wikipedia_content


### Task 3

In [1136]:
def get_award_winners_info(wikidata_ID):
    
    WIKIDATA_COMMON_PARAMS["ids"] = wikidata_ID
    WIKIDATA_COMMON_PARAMS["props"] = "claims|sitelinks"

    wikidata_title_response = requests.get(WIKIDATA_API_ENDPOINT, params = WIKIDATA_COMMON_PARAMS)
    wikidata_title_data = wikidata_title_response.json()

    # Extract name
    try:
        wikidata_name = wikidata_title_data["entities"][wikidata_ID]["sitelinks"]["enwiki"]["title"]
    except KeyError:
        wikidata_name = None

    # Extract intro from wikipedia page
    try:
        wikipedia_content = get_wikipedia_content(wikidata_ID)
    except KeyError:
        wikipedia_intro = None
    else:
        content_remove_newline_to_space = wikipedia_content.replace("\n", " ")
        content_with_p_tag = re.sub(r"<\/?(?!p)\w*\b[^>]*>", "", content_remove_newline_to_space.split("<h2>")[0])
        paragraphs = re.findall(r'<p>(.+?)</p>', content_with_p_tag)
        wikipedia_intro = "\n".join(paragraphs)

    # Extract gender ID to get gender from "sex or gender (P21)"
    try:
        wikidata_gender_id = wikidata_title_data["entities"][wikidata_ID]["claims"]["P21"][0]["mainsnak"]["datavalue"]["value"]["id"]
    except KeyError:
        wikidata_gender_id = None
        
    # Get birth date from "date of birth (P569)"
    try:
        wikidata_birth_date = wikidata_title_data["entities"][wikidata_ID]["claims"]["P569"][0]["mainsnak"]["datavalue"]["value"]["time"].split("T")[0].split("+")[1]
    except KeyError:
        wikidata_birth_date = None

    # Extract birth place ID to get birth place from "place of birth (P19)"
    try:
        wikidata_birth_place = wikidata_title_data["entities"][wikidata_ID]["claims"]["P19"][0]["mainsnak"]["datavalue"]["value"]["id"]
    except KeyError:
        wikidata_birth_place = None

    # Extract employer ID to get employer from "employer (P108)"
    # employer ID is inside of "mainsnak" key
    try:
        wikidata_employer_mainsnaks = wikidata_title_data["entities"][wikidata_ID]["claims"]["P108"]
    except KeyError:
        wikidata_employers_IDs = None
    else:
        wikidata_employers_IDs = [wikidata_employer_ID["mainsnak"]["datavalue"]["value"]["id"] for wikidata_employer_ID in wikidata_employer_mainsnaks]
        
    # Extract educated at ID to get educated at from "educated at (P69)"
    # educated at ID is inside of "mainsnak" key
    try:
        wikidata_educated_at_mainsnaks = wikidata_title_data["entities"][wikidata_ID]["claims"]["P69"]
    except KeyError:
        wikidata_educated_at_IDs = None
    else:
        wikidata_educated_at_IDs = [wikidata_educated_at_ID["mainsnak"]["datavalue"]["value"]["id"] for wikidata_educated_at_ID in wikidata_educated_at_mainsnaks]

    # Join the IDs per person that we want to get info from into a list
    entity_info_request_IDs = [wikidata_gender_id, wikidata_birth_place, "|".join(wikidata_employers_IDs), "|".join(wikidata_educated_at_IDs)]
    
    # A list that contains all the info of the person
    
    entity_info = [wikidata_name, wikipedia_intro, wikidata_birth_date]

    WIKIDATA_COMMON_PARAMS["props"] = "labels"
    for i in range(len(entity_info_request_IDs)):
        entity_values = []
        
        WIKIDATA_COMMON_PARAMS["ids"] = entity_info_request_IDs[i]
        
        response = requests.get(WIKIDATA_API_ENDPOINT, params = WIKIDATA_COMMON_PARAMS)
        data = response.json()

        if i == 0 or i == 1:
            try:
                entity_value = data["entities"][entity_info_request_IDs[i]]["labels"]["en"]["value"]
            except KeyError:
                entity_info.append(None)
            else:
                entity_info.append(entity_value)

        else:
            try:
                entity_IDs = entity_info_request_IDs[i].split("|")
            except AttributeError:
                entity_IDs = []

            for entity_ID in entity_IDs:
                try:
                    entity_value = data["entities"][entity_ID]["labels"]["en"]["value"]
                except KeyError:
                    entity_values.append(None)
                else:
                    entity_values.append(entity_value)
            
            entity_info.append(entity_values)
    
    return entity_info[0], entity_info[1], entity_info[2], entity_info[3], entity_info[4], entity_info[5], entity_info[6]

In [1137]:
award_winners = {
    "name": [],
    "intro": [],
    "birth_date": [],
    "gender": [],
    "birth_place": [],
    "employer": [],
    "educated_at": []
}

for wikidata_ID in wikidata_IDs:
    wikidata_name, wikipedia_intro, wikidata_birth_date, wikidata_gender, wikidata_birth_place, wikidata_employer, wikidata_educated_at = get_award_winners_info(wikidata_ID)
    award_winners["name"].append(wikidata_name)
    award_winners["intro"].append(wikipedia_intro)
    award_winners["birth_date"].append(wikidata_birth_date)
    award_winners["gender"].append(wikidata_gender)
    award_winners["birth_place"].append(wikidata_birth_place)
    award_winners["employer"].append(wikidata_employer)
    award_winners["educated_at"].append(wikidata_educated_at)

In [1138]:
print(award_winners["intro"][0])

Sir Timothy John Berners-Lee  (born 8 June 1955), also known as TimBL, is an English computer scientist best known as the inventor of the World Wide Web. He is a Professorial Fellow of Computer Science at the University of Oxford and a professor at the Massachusetts Institute of Technology (MIT). Berners-Lee proposed an information management system on 12 March 1989, then implemented the first successful communication between a Hypertext Transfer Protocol (HTTP) client and server via the Internet in mid-November.
Berners-Lee is the director of the World Wide Web Consortium (W3C), which oversees the continued development of the Web. He co-founded (with his then wife-to-be Rosemary Leith) the World Wide Web Foundation. He is a senior researcher and holder of the 3Com founder's chair at the MIT Computer Science and Artificial Intelligence Laboratory (CSAIL). He is a director of the Web Science Research Initiative (WSRI) and a member of the advisory board of the MIT Center for Collective I

### Task 4

In [1139]:
print("The names of all award winners are (alphabetical order): \n\n{}.".format(", ".join(sorted(award_winners["name"]))))

The names of all award winners are (alphabetical order): 

Adi Shamir, Alan Kay, Alan Perlis, Alfred Aho, Allen Newell, Amir Pnueli, Andrew Yao, Barbara Liskov, Bob Kahn, Butler Lampson, Charles Bachman, Charles P. Thacker, Dana Scott, David Patterson (computer scientist), Dennis Ritchie, Donald Knuth, Douglas Engelbart, E. Allen Emerson, Edgar F. Codd, Edmund M. Clarke, Edsger W. Dijkstra, Edward Feigenbaum, Edwin Catmull, Fernando J. Corbató, Frances Allen, Fred Brooks, Geoffrey Hinton, Herbert A. Simon, Ivan Sutherland, Jack Dongarra, James H. Wilkinson, Jeffrey Ullman, Jim Gray (computer scientist), John Backus, John Cocke (computer scientist), John Hopcroft, John L. Hennessy, John McCarthy (computer scientist), Joseph Sifakis, Judea Pearl, Juris Hartmanis, Ken Thompson, Kenneth E. Iverson, Kristen Nygaard, Leonard Adleman, Leslie Lamport, Leslie Valiant, Manuel Blum, Martin Hellman, Marvin Minsky, Maurice Wilkes, Michael O. Rabin, Michael Stonebraker, Niklaus Wirth, Ole-Johan Dahl

### Task 5

#### (a)

In [1140]:
award_winners_intro = pd.DataFrame(columns = ["winner_name", "count_words", "count_sentences", "count_paragraphs", "common_words"])

#### (b)

In [1141]:
award_winners_intro["winner_name"] = award_winners["name"]
award_winners_intro["count_words"] = pd.DataFrame(award_winners["intro"]).apply(lambda x: x[0].split(), axis = 1).apply(lambda x: len(x))
award_winners_intro["count_sentences"] = pd.DataFrame(award_winners["intro"]).apply(lambda x: sent_tokenize(x[0]), axis = 1).apply(lambda x: len(x))
award_winners_intro["count_paragraphs"] = pd.DataFrame(award_winners["intro"]).apply(lambda x: x[0].split("\n"), axis = 1).apply(lambda x: len(x))
award_winners_intro["common_words"] = pd.DataFrame(award_winners["intro"]).apply(lambda x: FreqDist(x[0].split()).most_common(10), axis = 1).apply(lambda x: [i[0] for i in x]).apply(lambda x: ", ".join(x))
award_winners_intro

Unnamed: 0,winner_name,count_words,count_sentences,count_paragraphs,common_words
0,Tim Berners-Lee,359,17,4,"the, of, and, He, a, is, Web, as, World, Wide"
1,Yoshua Bengio,91,4,2,"and, the, of, for, Bengio, is, a, work, deep, ..."
2,Geoffrey Hinton,181,8,3,"the, and, of, for, in, Hinton, a, his, to, is"
3,Donald Knuth,184,8,3,"the, of, and, Knuth, computer, is, to, He, ana..."
4,Richard M. Karp,92,3,2,"in, and, the, of, for, Karp, is, computer, the..."
...,...,...,...,...,...
70,Fernando J. Corbató,28,1,1,"a, Fernando, José, ""Corby"", Corbató, (July, 1,..."
71,Charles Bachman,57,3,1,"his, Bachman, was, an, in, of, Charles, Willia..."
72,Butler Lampson,27,1,1,"Butler, W., Lampson,, ForMemRS,, (born, Decemb..."
73,Ole-Johan Dahl,44,2,1,"of, Dahl, was, a, computer, the, and, Ole-Joha..."


#### (c)

In [1142]:
en_stopwords = stopwords.words("english")
award_winners_intro_remove_stopwords = pd.DataFrame(award_winners["intro"]).apply(lambda x: " ".join([word for word in word_tokenize(x[0]) if word not in (en_stopwords)]), axis = 1)
award_winners_intro_remove_stopwords_and_punctuation = award_winners_intro_remove_stopwords.apply(lambda x: RegexpTokenizer(r'\w+').tokenize(x)).apply(lambda x: " ".join(x))
award_winners_intro["common_words_after_preprocessing"] = award_winners_intro_remove_stopwords_and_punctuation.apply(lambda x: FreqDist(x.split()).most_common(10)).apply(lambda x: [i[0] for i in x]).apply(lambda x: ", ".join(x))
award_winners_intro

Unnamed: 0,winner_name,count_words,count_sentences,count_paragraphs,common_words,common_words_after_preprocessing
0,Tim Berners-Lee,359,17,4,"the, of, and, He, a, is, Web, as, World, Wide","Web, He, World, Wide, Berners, Lee, s, Compute..."
1,Yoshua Bengio,91,4,2,"and, the, of, for, Bengio, is, a, work, deep, ...","Bengio, work, deep, learning, Learning, Hinton..."
2,Geoffrey Hinton,181,8,3,"the, and, of, for, in, Hinton, a, his, to, is","Hinton, computer, work, neural, networks, Goog..."
3,Donald Knuth,184,8,3,"the, of, and, Knuth, computer, is, to, He, ana...","Knuth, computer, He, science, analysis, algori..."
4,Richard M. Karp,92,3,2,"in, and, the, of, for, Karp, is, computer, the...","Karp, computer, theory, algorithms, Richard, M..."
...,...,...,...,...,...,...
70,Fernando J. Corbató,28,1,1,"a, Fernando, José, ""Corby"", Corbató, (July, 1,...","July, Fernando, José, Corby, Corbató, 1, 1926,..."
71,Charles Bachman,57,3,1,"his, Bachman, was, an, in, of, Charles, Willia...","Bachman, Charles, William, III, December, 11, ..."
72,Butler Lampson,27,1,1,"Butler, W., Lampson,, ForMemRS,, (born, Decemb...","Butler, W, Lampson, ForMemRS, born, December, ..."
73,Ole-Johan Dahl,44,2,1,"of, Dahl, was, a, computer, the, and, Ole-Joha...","Dahl, computer, Ole, Johan, 12, October, 1931,..."


#### (d)

In [1143]:
print(award_winners_intro.iloc[0:10,:])

        winner_name  count_words  count_sentences  count_paragraphs  \
0   Tim Berners-Lee          359               17                 4   
1     Yoshua Bengio           91                4                 2   
2   Geoffrey Hinton          181                8                 3   
3      Donald Knuth          184                8                 3   
4   Richard M. Karp           92                3                 2   
5     Robert Tarjan           62                3                 1   
6         Vint Cerf           65                2                 1   
7       Judea Pearl          156                5                 2   
8  Herbert A. Simon          181                7                 2   
9     Marvin Minsky           54                2                 2   

                                        common_words  \
0      the, of, and, He, a, is, Web, as, World, Wide   
1  and, the, of, for, Bengio, is, a, work, deep, ...   
2      the, and, of, for, in, Hinton, a, his, to, 

## 3.2 Sub-activity: Applying NLP operations on the corpus

### 3.2.1 Stemming

### Task 3

In [1144]:
for intro in award_winners["intro"]:
    tokens = word_tokenize(intro)
    words = [word for word in tokens if word not in en_stopwords]
    RegexpTokenizer(r'\w+').tokenize(words)
    
# porter = PorterStemmer()
# stemmed = [porter.stem(word) for word in filtered_words]
# print(stemmed)

TypeError: expected string or bytes-like object

### Task 4

### 3.2.2 Lemmatization

### Task 5

### 3.2.3 Finding synonyms and antonyms

### Task 6

### 3.2.4 Bigrams and trigrams

### Task 7

### Task 8

### Task 9

### Task 10

## 3.3 Sub-section: Visualisation

### 3.3.1 Barplots

### Task 11

### Task 12

### Task 13

### 3.3.2 Heatmap

### Task 14