# Assignment 2: Information Extraction

In [1]:
import nltk
import re
import pandas as pd

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk import ne_chunk
from nltk.tree import Tree

#nltk.download('all')


---

### Use file `football_players.txt` to perform various information extraction tasks below.

In [2]:
# Download the text file into the working directory (comment out the line below if football_players.txt has already been downloaded from Blackboard)
!curl "https://ideone.com/plain/OvwDXZ" > football_players.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 24172  100 24172    0     0  17118      0  0:00:01  0:00:01 --:--:-- 17118


 Read all the documents from `football_players.txt` into a list called `docs`.

In [3]:
# Every line of the corpus file is one document. Each document is stored as
# the list obtained by splitting that line on '\n', which is the list-of-strings
# shape the extraction functions below iterate over.
with open("football_players.txt", "r", encoding="utf-8") as corpus_file:
    docs = [raw_line.split('\n') for raw_line in corpus_file]

## Task 2 
Function that takes a document and returns a list of sentences with part-of-speech tags.

In [4]:
def ie_preprocess(document):
    """Split a document into sentences and POS-tag every sentence.

    Args:
        document: iterable of text strings; each string is sentence-tokenized
            on its own and the resulting sentences are concatenated in order.

    Returns:
        A list with one entry per sentence, each entry being the list of
        (token, tag) pairs returned by ``nltk.pos_tag``.
    """
    sentences = [
        sentence
        for text in document
        for sentence in sent_tokenize(text)
    ]
    return [nltk.pos_tag(word_tokenize(sentence)) for sentence in sentences]

In [5]:
# Tag the first document and display the tagged tokens of its second sentence (index 1).
first_doc = docs[0]
tagged_sentences = ie_preprocess(first_doc)
tagged_sentences[1]

[('He', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('forward', 'NN'),
 ('and', 'CC'),
 ('serves', 'NNS'),
 ('as', 'IN'),
 ('captain', 'NN'),
 ('for', 'IN'),
 ('Portugal', 'NNP'),
 ('.', '.')]

## Task 3
Function that takes a list of tokens with POS tags for a sentence and returns a list of named entities (NE). 


In [6]:
def find_named_entities(sent):
    """Return the named entities found in one POS-tagged sentence.

    Args:
        sent: list of (token, tag) pairs for a single sentence.

    Returns:
        A list of strings, one per subtree labelled 'NE' in the tree produced
        by ``nltk.ne_chunk(sent, binary=True)``; the tokens of each entity are
        joined with single spaces.
    """
    chunk_tree = nltk.ne_chunk(sent, binary=True)
    return [
        " ".join(leaf[0] for leaf in ne_subtree.leaves()).strip()
        for ne_subtree in chunk_tree.subtrees()
        if ne_subtree.label() == 'NE'
    ]


In [7]:
# Re-tag the first document and display the named entities of its first sentence.
tagged_sentences = ie_preprocess(docs[0])
find_named_entities(tagged_sentences[0])

['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal']

## Task 4

Implement the `find_all_named_entities` function below to find **all** NEs in a given document.


In [8]:
def find_all_named_entities(doc):
    """Collect the named entities of every sentence in a document.

    Args:
        doc: document, passed unchanged to ``ie_preprocess``.

    Returns:
        A flat list of entity strings in sentence order (not a list of lists).
    """
    entities_per_sentence = [
        find_named_entities(tagged_sentence)
        for tagged_sentence in ie_preprocess(doc)
    ]
    return [
        entity
        for sentence_entities in entities_per_sentence
        for entity in sentence_entities
    ]

In [9]:
# Count the named entities found in the first document.
NE_Doc1 = find_all_named_entities(docs[0])
print("Named Entities in 1st document: ", len(NE_Doc1))

Named Entities in 1st document:  56


## Task 5

Find named entities across **all** documents in `football_players.txt`, and save the result into a single flat list.

In [10]:
# One entity list per document, then flattened into a single list.
all_named_list = [find_all_named_entities(doc) for doc in docs]
all_named_entities = [
    entity
    for doc_entities in all_named_list
    for entity in doc_entities
]

In [11]:
# Report how many named entities were collected across all documents.
print("Named Entities in all documents: ", len(all_named_entities))

Named Entities in all documents:  380


## Task 6

Functions to extract the name of the player, country of origin and date of birth as well as the following relations: team(s) of the player and position(s) of the player.


In [12]:
def name_of_the_player(doc):
    """Extract the player's name from the first sentence of a document.

    The name is the text at the very start of the first sentence, up to (but
    not including) the first ',' or '('.

    Args:
        doc: list of text strings making up one document.

    Returns:
        The name with surrounding whitespace removed, or None when the
        document contains no sentence or the first sentence does not start
        with a name (e.g. it begins with ',' or '(').
    """
    name_exp = re.compile(r'^[^,(]+')   # leading run of characters other than ',' and '('

    # Only the first sentence is needed, so stop tokenizing as soon as it is found.
    first_sentence = next(
        (sentence for text in doc for sentence in sent_tokenize(text)),
        None,
    )
    if first_sentence is None:
        return None

    match = name_exp.search(first_sentence)
    if match is None:
        return None

    # The match usually ends with the space that precedes '(' -- drop it.
    return match.group().strip() or None

# Show the extracted name for each of the first ten documents.
for doc_number in range(1, 11):
    print(f"Doc {doc_number}: ", name_of_the_player(docs[doc_number - 1]))

Doc 1:  Cristiano Ronaldo dos Santos Aveiro
Doc 2:  Lionel Andrés "Leo" Messi 
Doc 3:  Neymar da Silva Santos Júnior 
Doc 4:  Ronaldo de Assis Moreira 
Doc 5:  Wayne Mark Rooney 
Doc 6:  Zlatan Ibrahimović 
Doc 7:  David Robert Joseph Beckham
Doc 8:  Mesut Özil 
Doc 9:  Gareth Frank Bale 
Doc 10:  Andrés Iniesta Luján 


In [13]:
def country_of_origin(doc):
    """Find the country a player represents from a "the X national team" phrase.

    Sentences are scanned in order. In each one, the text between "the " and
    " national team" is captured (greedily), and it is accepted only if it is
    exactly one of the named entities found in the document.

    NOTE(review): the captured entity is whatever precedes "national team" in
    the text, so it can be a nationality adjective instead of a country name
    (the sample output for one document is 'German').

    Args:
        doc: list of text strings making up one document.

    Returns:
        The first accepted capture, or None when no sentence yields one.
    """
    named_entities = set(find_all_named_entities(doc))
    # Raw string: the pattern is unchanged, but '\s' and '\w' no longer rely on
    # Python passing unknown string escapes through.
    pattern = re.compile(r"the\s(.*\w.*)\snational\steam")

    for text in doc:
        for sentence in sent_tokenize(text):
            match = pattern.search(sentence)
            if match and match.group(1) in named_entities:
                return match.group(1)
    return None

# Show the extracted country for each of the first ten documents.
for doc_number in range(1, 11):
    print(f"Doc {doc_number}: ", country_of_origin(docs[doc_number - 1]))

Doc 1:  Portugal
Doc 2:  Argentina
Doc 3:  Brazil
Doc 4:  Brazil
Doc 5:  England
Doc 6:  Sweden
Doc 7:  England
Doc 8:  German
Doc 9:  Wales
Doc 10:  Spain


In [14]:
def date_of_birth(doc):
    """Extract the date that follows "born" inside a parenthesis.

    Sentences are scanned in order for "born <date>)", where <date> starts and
    ends with a digit and contains no ')'. Excluding ')' from the date keeps
    the match inside the parenthesis that contains "born", even when the
    sentence has further parenthesised numbers later on.

    Args:
        doc: list of text strings making up one document.

    Returns:
        The date text exactly as written (e.g. '5 February 1985'), or None
        when no sentence contains such a phrase.
    """
    pattern = re.compile(r"born\s(\d[^)]*\d)\)")

    for text in doc:
        for sentence in sent_tokenize(text):
            match = pattern.search(sentence)
            if match:
                return match.group(1)
    return None
# Show the extracted date of birth for each of the first ten documents.
for doc_number in range(1, 11):
    print(f"Doc {doc_number}: ", date_of_birth(docs[doc_number - 1]))

Doc 1:  5 February 1985
Doc 2:  24 June 1987
Doc 3:  5 February 1992
Doc 4:  21 March 1980
Doc 5:  24 October 1985
Doc 6:  3 October 1981
Doc 7:  2 May 1975
Doc 8:  15 October 1988
Doc 9:  16 July 1989
Doc 10:  11 May 1984


In [16]:
def team_of_the_player(doc):
    """Extract team names from the first sentence that mentions the player's club.

    Sentences are scanned in order for "club", "plays" or "played for"
    followed by whitespace. The text after that keyword is POS-tagged and every
    run of consecutive proper nouns (NNP) in it is taken as a team name. Only
    the first sentence that yields at least one such run is used.

    NOTE(review): any proper-noun run after the keyword is returned, so the
    result can contain non-team names (the sample output includes 'September'
    and 'Wayne Rooney').

    Args:
        doc: list of text strings making up one document.

    Returns:
        A list of team-name strings in order of appearance, or an empty list
        when no sentence yields one.
    """
    grammar =  r"""
        NBAR:
            {<NNP>+}  # Captures only nouns
      """
    chunker = nltk.RegexpParser(grammar)
    # Raw string so that '\b' is a regex word boundary rather than a backspace
    # character. The second alternative can only match where the first one
    # already does, so matching results are unchanged.
    pattern = re.compile(r"(club|plays|played\sfor)\s(.*\w\s\w*|\w\snational\wteam\s\w\b)")

    for text in doc:
        for sentence in sent_tokenize(text):
            match = pattern.search(sentence)
            if match is None:
                continue
            tagged_words = nltk.pos_tag(word_tokenize(match.group(2)))
            team_list = [
                " ".join(leaf[0] for leaf in subtree.leaves()).strip()
                for subtree in chunker.parse(tagged_words).subtrees()
                if subtree.label() == 'NBAR'
            ]
            if team_list:
                # The first productive sentence decides the result; later
                # sentences are not tokenized or tagged at all.
                return team_list
    return []


In [17]:
# Show the extracted teams for each of the first ten documents.
for doc_number in range(1, 11):
    print(f"Doc {doc_number}: ", team_of_the_player(docs[doc_number - 1]), "\n")

Doc 1:  ['Real Madrid', 'Portugal'] 

Doc 2:  ['FC Barcelona', 'Argentina'] 

Doc 3:  ['FC Barcelona', 'Brazil'] 

Doc 4:  ['FC Barcelona', 'September'] 

Doc 5:  ['Manchester United', 'England'] 

Doc 6:  ['Manchester United'] 

Doc 7:  ['Manchester United', 'Preston North End', 'Real Madrid', 'Milan', 'LA Galaxy', 'Paris Saint-Germain', 'England', 'Wayne Rooney'] 

Doc 8:  ['English', 'Arsenal'] 

Doc 9:  ['Real Madrid', 'Wales'] 

Doc 10:  ['FC Barcelona', 'Spain'] 



In [18]:
def position_of_the_player(doc):
    """Collect the playing positions mentioned for a player.

    In every sentence, the text following the first "who play", "is " or
    "as a " (up to and including the next full stop) is POS-tagged and chunked.
    Each known position name that occurs as a substring of an 'NP' chunk is
    recorded.

    Args:
        doc: list of text strings making up one document.

    Returns:
        A set of position names drawn from: midfielder, winger, forward,
        striker, goalkeeper, defender. The set is empty when none is found.
    """
    grammar =  r"""
        NBAR:
            {<NN*>}                            # to find noun phrases
        NP:
            {<VBZ>?<IN>?<DT>*(<.*>)*<NBAR>*?}  # Above NBAR connected with verb, perposition or determinent

      """
    chunker = nltk.RegexpParser(grammar)
    # Text after "who play", "is " or "as a ", lazily up to the first full stop.
    pattern = re.compile(r"(who\splay|is\s|as\sa\s)(.*?\.)")
    all_positions = ["midfielder","winger","forward","striker","goalkeeper","defender"] #list of all positions in a football team

    positions = set()
    for text in doc:
        for sentence in sent_tokenize(text):
            match = pattern.search(sentence)
            if match is None:
                continue
            tagged_words = nltk.pos_tag(word_tokenize(match.group(2)))
            for subtree in chunker.parse(tagged_words).subtrees():
                if subtree.label() != 'NP':
                    continue
                # Checking the complete phrase once finds the same positions as
                # checking every prefix of it while the phrase is being built.
                phrase = " ".join(leaf[0] for leaf in subtree.leaves())
                positions.update(name for name in all_positions if name in phrase)
    return positions

In [19]:
# Show the extracted positions for each of the first ten documents.
for doc_number in range(1, 11):
    print(f"Doc {doc_number}: ", position_of_the_player(docs[doc_number - 1]), "\n")

Doc 1:  {'forward'} 

Doc 2:  {'forward'} 

Doc 3:  {'forward'} 

Doc 4:  {'midfielder', 'forward'} 

Doc 5:  {'forward'} 

Doc 6:  {'striker'} 

Doc 7:  {'winger'} 

Doc 8:  {'midfielder'} 

Doc 9:  {'defender', 'winger'} 

Doc 10:  {'midfielder'} 



## Task 7
Identify and extract one other relation (besides team and position).

In [20]:
## This function is to extract various awards/titles won by players

# NOTE: No awards/title has been mentioned for Player in Doc 8 

def player_awards(doc):
    """Extract award/title phrases from sentences about winning.

    In every sentence containing "won", "winning" or "win ", the text captured
    by the second group of the pattern (the part following that keyword) is
    POS-tagged and chunked with the NBAR grammar below. Every NBAR chunk is
    recorded as an award phrase.

    Args:
        doc: list of text strings making up one document.

    Returns:
        A set of award phrase strings (duplicates removed); empty when the
        document yields none.
    """
    grammar =  r"""
        NBAR:
            {<NNP>+<NN>}             #captures all words between noun phrases 
            {<DT><NNP>+}             #captures all words noun phrases after determinants 
            {<NNP>+<IN><DT><NN>}     #captures all words after noun phrases and ends with preposition, determinant, noun
            {<JJ><NNP>+}             #captures all words starting with adjective and has one or more noun phrases 
      """
    chunker = nltk.RegexpParser(grammar)
    # Keyword, then the text that is chunked (group 2), then the remainder up
    # to the first full stop.
    pattern = re.compile(r"(won|winning|win\s)(.*\w\s|\w\sin)(.*?\.)")

    award_list = set()
    for text in doc:
        for sentence in sent_tokenize(text):
            match = pattern.search(sentence)
            if match is None:
                continue
            tagged_words = nltk.pos_tag(word_tokenize(match.group(2)))
            for subtree in chunker.parse(tagged_words).subtrees():
                if subtree.label() == 'NBAR':
                    award_list.add(" ".join(leaf[0] for leaf in subtree.leaves()).strip())
    return award_list

In [22]:
# Show the extracted awards for each of the first ten documents.
for doc_number in range(1, 11):
    print(f"Doc {doc_number}: ", player_awards(docs[doc_number - 1]), "\n")


Doc 1:  {'a Club World', 'successive Premier League', "FIFA Ballon d'Or", 'UEFA Champions League title', 'FIFA World Player of the Year', 'European Golden Shoe', 'La Liga title', 'a FIFA Club World', "Ballon d'Or", 'the FA Cup'} 


Doc 2:  {"FIFA Ballons d'Or", 'the Golden Ball', 'Copas del', "Ballon d'Or", 'Year award', 'European Golden', 'Olympic gold'} 


Doc 3:  {'the Copa', 'the Golden Ball', 'successive Campeonato Paulista', 'the UEFA Champions League', 'a Copa'} 


Doc 4:  {'Rivaldo in an attacking', 'the FIFA World Player', 'the FIFA World Cup All-Star', 'the UEFA Champions League', "Ballon d'Or", 'Year award', 'alongside Ronaldo'} 


Doc 5:  {'the Premier League', 'the Premier League Goal', 'Day poll', 'Season award', 'the Football League Cup', 'the England Player', 'the Month', 'the FIFA Club World Cup', 'the UEFA Champions League', 'the Goal', 'the FA Cup', 'the Premier League Player', 'the FA Community Shield', 'the Year', 'the BBC'} 


Doc 6:  {'consecutive Ligue', 'anoth