In [None]:
#Unzipping the data
!unzip data.zip

Archive:  data.zip
  inflating: training_data/training_set.csv  
  inflating: training_data/Evaluator/dummy_detection.csv  
  inflating: training_data/Evaluator/gold_detection.csv  
  inflating: training_data/README.pdf  
  inflating: training_data/Evaluator/dummy_resolution.csv  
  inflating: training_data/detection_answers_file.xlsx  
  inflating: training_data/Evaluator/gold_resolution.csv  
  inflating: training_data/Evaluator/evaluate.jar  
  inflating: training_data/disambiguation_answers_file.csv  
  inflating: training_data/detection_answers_file.csv  
  inflating: training_data/training_set.xlsx  
  inflating: training_data/disambiguation_answers_file.xlsx  


In [None]:
#Importing required libraries.
import pandas as pd
from bs4 import BeautifulSoup
import spacy
import re
import csv
import nltk
#Fetching the NLTK resources one after another, in a fixed order.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'tagsets'):
  nltk.download(nltk_resource)
#Loading the small English spaCy pipeline that the later cells use as `nlp`.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


In [None]:
#Loading training data and labels.
#Paths are relative to the working directory, which is where `!unzip data.zip` extracted the training_data folder,
#so the notebook no longer depends on the Colab-specific /content prefix.
DATA_DIR = "training_data"
data = pd.read_excel(f"{DATA_DIR}/training_set.xlsx")
labels = pd.read_excel(f"{DATA_DIR}/detection_answers_file.xlsx")

In [None]:
#Encoding labels as 0 and 1 ("UNAMBIGUOUS" -> 1, anything else -> 0).
#The guard keeps the cell idempotent: re-encoding an already numeric column would turn every label into 0.
if not pd.api.types.is_numeric_dtype(labels["decision"]):
  labels["decision"] = labels.decision.eq("UNAMBIGUOUS", fill_value=1).mul(1)
#Merging training data with labels.
df = pd.merge(data,labels,on="sent_id")

#Patterns of the opening and closing tags that mark a candidate pronoun.
OPENING_TAG = re.compile(r'<referential.{0,8}>')
CLOSING_TAG = re.compile('</referential>')

def _encode_pronouns(sent):
  """Encode the tagged pronouns of one raw sentence.

  Each pronoun is encoded as PRONOUN#START_INDEX$END_INDEX and the encodings
  are joined with ','. The indices refer to the sentence with all referential
  tags removed, so the length of every tag seen so far is subtracted from the
  raw match position.

  Raises:
    ValueError: if the sentence contains no opening/closing tag pair.
  """
  opening_matches = list(OPENING_TAG.finditer(sent))
  closing_matches = list(CLOSING_TAG.finditer(sent))
  encoded = []
  removed_chars = 0
  #zip stops at the shorter list, so an unpaired tag is ignored.
  for opening, closing in zip(opening_matches, closing_matches):
    pronoun = sent[opening.end():closing.start()]
    start = opening.start() - removed_chars
    encoded.append(pronoun + "#" + str(start) + "$" + str(start + len(pronoun)))
    removed_chars += (opening.end() - opening.start()) + (closing.end() - closing.start())
  if not encoded:
    raise ValueError("No <referential> tag pair found in sentence: " + repr(sent))
  return ",".join(encoded)

sentences = []
pronouns = []
#Extracting candidate ambiguous pronouns and their starting and ending indices.
for sent in df.sent.values:
  pronouns.append(_encode_pronouns(sent))
  #Using BeautifulSoup for parsing; get_text() drops the tags and keeps the plain sentence.
  sentences.append(BeautifulSoup(sent, "html.parser").get_text())

#Constructing data frame of training set.
#sent_id follows the pattern CATEGORY#NUMBER; it is split by column name instead of by position in the row.
sent_id_parts = df["sent_id"].str.split("#")
df["category"] = sent_id_parts.str[0]
df["pronoun"] = pronouns
df["sent_id_in_category"] = sent_id_parts.str[1].astype(int)
df["sent"] = sentences

In [None]:
#Displaying the constructed training frame; the rendered output elides the middle rows.
df

Unnamed: 0,sent_id,sent,decision,category,pronoun,sent_id_in_category
0,library#01,All material that is stored in the repository ...,0,library,it#57$59,1
1,library#02,The library may want to accept important digit...,1,library,them#114$118,2
2,library#03,"Once material has arrived, it must undergo sev...",1,library,it#27$29,3
3,library#04,Allows resources to be reviewed before a decis...,1,library,they#66$70,4
4,library#05,Allows metadata to be stored in a database in ...,1,library,their#110$115,5
...,...,...,...,...,...,...
125,space#16,Pipes may be opened by multiple receiving task...,0,space,"they#51$55,they#109$113",16
126,space#17,Communication services shall provide the capab...,0,space,their#86$91,17
127,space#18,"If the broadcast option is used, each virtual ...",0,space,it#87$89,18
128,space#19,The application software shall have the capabi...,0,space,its#92$95,19


In [None]:
#Pronoun inventories used to pick the rule set for a candidate pronoun.
#All entries are lower-case; "you", "your" and "yours" are listed in both inventories.
singular_suffixes = [
  "i", "you", "he", "she", "it",
  "me", "him", "her",
  "my", "your", "his", "its",
  "mine", "yours", "hers",
  "myself", "yourself", "himself", "herself", "itself",
]
plural_suffixes = [
  "we", "you", "they",
  "us", "them",
  "our", "your", "their",
  "ours", "yours", "theirs",
  "ourselves", "yourselves", "themselves",
]

In [None]:
#Pronouns that denote a gender and the only nouns they are allowed to resolve to (Rule 5).
GENDERED_PRONOUNS = ("he", "him", "she", "her")
PERSON_NOUNS = ("driver", "user", "leader")
#Nouns and special names accepted as plural antecedents regardless of their tag (Rule 2, plural).
PLURAL_EXCEPTIONS = ("metadata", "scheduler", "user", "NLM", "ETCS")

def _find_antecedents(doc, sent, pronoun, start_index, plural):
  """Return the noun chunks in front of a pronoun that pass the antecedent rules.

  Args:
    doc: the spaCy document built from `sent`.
    sent: the plain sentence text (tags already removed).
    pronoun: the pronoun text taken from the PRONOUN#START$END encoding.
    start_index: character offset of the pronoun inside `sent`.
    plural: True to apply the plural rule set, False for the singular one.

  Returns:
    List of accepted noun chunks, in sentence order.

  Every inspected chunk text is printed, and every accepted chunk is printed
  a second time, to keep the trace of the rule evaluation.
  """
  accepted = []
  for chunk in doc.noun_chunks:
    text = chunk.text
    print(text)
    #If we find a noun chunk that starts at or after the given pronoun, end the loop.
    #The chunk's own offset is used: searching the sentence for the chunk text would
    #hit an earlier occurrence of the same characters (e.g. "it" inside "repository").
    if chunk.start_char >= start_index:
      break
    #Rule 1: the token that ends the chunk must equal the last space-separated word of the chunk.
    i = chunk.end - 1
    token = doc[i]
    if token.text != text.split(" ")[-1]:
      continue
    if plural:
      #Rule 2 (plural): token is tagged "NNS", is one of the listed exceptions or ends with "(s".
      number_matches = token.tag_ == "NNS" or token.text in PLURAL_EXCEPTIONS or token.text[-2:] == "(s"
    else:
      #Rule 2 (singular): token is tagged "NNP" or "NN", is the word "data",
      #or the chunk is directly preceded by the phrase "each of".
      #chunk.start >= 2 keeps the two look-behind indices from wrapping around to the end of the sentence.
      preceded_by_each_of = chunk.start >= 2 and "each of" == (doc[chunk.start - 2].text + " " + doc[chunk.start - 1].text).lower()
      number_matches = token.tag_ == "NNP" or token.tag_ == "NN" or token.text == "data" or preceded_by_each_of
    if not number_matches:
      continue
    #Look-ahead tokens; None when the sentence ends before them.
    following = doc[i + 1] if i + 1 < len(doc) else None
    after_following = doc[i + 2] if i + 2 < len(doc) else None
    if following is not None:
      #Rule 3: reject when an adposition follows the token and the token text
      #(without its last three characters for the plural rule set) occurs again from the pronoun onwards.
      #For plural tokens of three characters or fewer the probe is empty and is therefore always found.
      probe = token.text[:-3] if plural else token.text
      if following.pos_ == "ADP" and sent[start_index:].find(probe) != -1:
        continue
      #Rule 4: reject when the token is followed by "of" and then a noun or the pronoun itself.
      if following.text == "of" and after_following is not None and (after_following.pos_ == "NOUN" or after_following.text == pronoun):
        continue
    #Rule 5 (singular only): a gendered pronoun may only resolve to one of the listed person nouns.
    if not plural and pronoun in GENDERED_PRONOUNS and token.text not in PERSON_NOUNS:
      continue
    print(chunk)
    accepted.append(chunk)
  return accepted

predictions = []
replaces = []
for sent, pronouns_with_indices in zip(df.sent.values, df.pronoun.values):
  doc = nlp(sent)
  print(sent)
  unambiguous = True
  print(pronouns_with_indices)

  for pronoun_splitted in pronouns_with_indices.split(','):
    #Retrieving the pronoun and its start index.
    encoded_parts = pronoun_splitted.split('#')
    pronoun = encoded_parts[0]
    indices = encoded_parts[1]
    start_index = int(indices.split('$')[0])
    correct_chunks = []
    #The membership tests are case-sensitive. A pronoun listed in both inventories
    #(e.g. "you") is evaluated with both rule sets and the results are added up.
    if pronoun in singular_suffixes:
      correct_chunks += _find_antecedents(doc, sent, pronoun, start_index, plural=False)
    if pronoun in plural_suffixes:
      correct_chunks += _find_antecedents(doc, sent, pronoun, start_index, plural=True)
    #If exactly one chunk satisfies the rules, the pronoun is unambiguous.
    counter = len(correct_chunks)
    print(counter)
    if counter != 1:
      unambiguous = False
      break
  #Printing the decision for the sentence.
  print(unambiguous)
  #Asserting that unambiguous sentences only contain one correct resolution and classifying them as unambiguous after assertion.
  if unambiguous:
    assert len(correct_chunks) == 1
    predictions.append(1)
  else:
    predictions.append(0)
  #Appending resolution answers (those of the last evaluated pronoun) to a list.
  replaces.append(correct_chunks)
  print(" ")

All material that is stored in the repository will enter it via the Ingest function.
it#57$59
All material
All material
the repository
the repository
it
the Ingest function
2
False
 
The library may want to accept important digital materials in non-standard formats in case we are able to migrate them to a more usable format in the future.
them#114$118
The library
important digital materials
important digital materials
non-standard formats
case
we
them
1
True
 
Once material has arrived, it must undergo several reviews, including virus checking, format compliance and anticipated content and file type.
it#27$29
material
material
it
1
True
 
Allows resources to be reviewed before a decision is made whether they should be retained.
they#66$70
Allows
resources
resources
a decision
they
1
True
 
Allows metadata to be stored in a database in a manner that conforms to repository reformatting and linked to their corresponding objects via an identifier.
their#110$115
Allows
metadata
metadata
a d

In [None]:
#Comparing predictions with correct annotations; asd counts the matching ones.
truths = df.decision.values
asd = 0
for idx, pair in enumerate(zip(predictions, truths)):
  if pair[0] != pair[1]:
    #Printing misclassified samples: position first, then the sentence.
    print(idx)
    print(df.sent.values[idx])
    continue
  asd += 1

#Printing total number of misclassified samples and accuracy.
misclassified = len(predictions) - asd
print(misclassified)
print(asd / len(predictions))

10
The migration strategy must select a storage medium, taking into consideration the expected and actual rates of errors encountered in various media types, their performance, and their costs of ownership.
15
Audits data in SIPs or AIPs to ensure that they meet specified requirements.
16
The Activate Requests function maintains a record of event-driven requests and periodically compares it to the contents of the archive to determine if all needed data is available.
17
Maintains a record of event-driven events and compares it to the contents of the repository.
33
The Clarus system shall be able to communicate with environmental sensor stations through its collector using the NTCIP ESS 1204.
45
The logon screen shall activate command control for the user if the user requests it and has authorization.
46
When a device status has been overridden, on the screen it shall appear with different color from the normal and alarm status colors.
49
Any change in device state shall be reported on t

In [None]:
#Printing SENT_ID-RESOLUTION for every sentence that ended up with exactly one resolution.
for sent_id, found_chunks in zip(df.sent_id.values, replaces):
  if len(found_chunks) != 1:
    continue
  print(sent_id + "-" + found_chunks[0].text)

library#02-important digital materials
library#03-material
library#04-resources
library#05-metadata
library#07-Archival Storage functions
library#12-the requested AIP(s
library#16-SIPs
library#17-The Activate Requests function
library#19-changes
library#21-This function
weather#01-data
weather#02-The Clarus system
weather#03-RWIS databases
weather#05-new observation types
weather#07-CS
weather#08-environmental data
weather#09-The Clarus system
weather#12-new environmental data services
weather#13-The DOG
weather#14-observations
weather#15-The QEDC
weather#17-QChS
weather#18-The contributors
weather#19-The Clarus program
weather#20-contributors
railway#08-The devices
railway#10-the driver
railway#15-the driver
railway#18-A train
railway#19-the driver
railway#20-The ETCS trainborne equipment
railway#22-The train
railway#24-the text message facility
railway#25-fax functionality
railway#29-the driver
railway#38-leading drivers
railway#39-equipment
railway#41-the user
railway#42-functional 

### Evaluation

In [None]:
#Creating csv file to evaluate detection performance
#Working on a copy: assigning through an alias of df would overwrite the ground-truth labels
#in df["decision"] with the predictions and break a later re-run of the accuracy cell.
df__ = df.copy()
#Mapping the numeric predictions to their label names; any other value is kept unchanged.
decision_names = {0: "AMBIGUOUS", 1: "UNAMBIGUOUS"}
df__["decision"] = [decision_names.get(pred, pred) for pred in predictions]
df__[["sent_id","decision"]].to_csv('detection.csv',index=False, quotechar='"',quoting=csv.QUOTE_NONNUMERIC)

In [None]:
#Creating csv file to evaluate resolution performance
idxs = []
resl = []
#Storing sentences that contain multiple candidate pronouns (their pronoun column holds a ',').
double = list(df[df['pronoun'].str.contains(",")]["sent_id"])
#Suffixes appended to the sentence id, one per pronoun.
#NOTE(review): only two suffixes are produced; presumably no sentence has more than two pronouns - confirm against the gold file.
lst__ = ["a","b"]
for repl, idx in zip(replaces, df.sent_id.values):
  #Only sentences with exactly one resolution are reported.
  if len(repl) != 1:
    continue
  if idx in double:
    #If a given sentence contains multiple pronouns, we assume that the resolution of both pronouns corresponds to the same noun.
    for letter in lst__:
      idxs.append(idx+"-"+letter)
      resl.append(repl[0].text)
  else:
    #If a given sentence contains a single pronoun, append its resolution to the list.
    idxs.append(idx)
    resl.append(repl[0].text)

#Creating a dataset with our resolving nouns, this data set will be referred as test set.
df_resolution = pd.DataFrame(data={"sent_id":idxs,"resolution":resl})
#Since we did not achieve 100% accuracy, we form our test set with ids that are present in the golden resolution set.
#The path is relative to the working directory, where `!unzip data.zip` extracted the training_data folder.
df_golden = pd.read_csv("training_data/Evaluator/gold_resolution.csv")
#Printing the ids we resolved that are missing from the golden set.
print("Not on golden records: ",set(list(df_resolution["sent_id"])).difference(df_golden["sent_id"]))
#Left joining golden set with test set.
df_resolution_final = df_golden.merge(df_resolution, how="left")
#Dropping the second column of the merged frame (presumably the golden answer column - verify against the gold file).
df_resolution_final = df_resolution_final.drop(df_resolution_final.columns[1], axis=1)
#Filling the golden ids for which we produced no resolution with a single space.
df_resolution_final = df_resolution_final.fillna(" ")
#Creating csv file to evaluate resolution performance of our system.
df_resolution_final.to_csv('resolution.csv',index=False, quotechar='"',quoting=csv.QUOTE_NONNUMERIC)

Not on golden records:  {'library#17', 'railway#42', 'railway#49', 'space#03', 'railway#25', 'weather#09', 'library#16'}


In [None]:
#Since we did not achieve 100% accuracy, we create a hypothetical state in which all samples are classified correctly:
#the rule-based pronoun resolver is evaluated only on the samples that exist in our resolution answers.
#Rows of our answers that carry a real resolution (not the single-space filler).
answered_rows = df_resolution_final[df_resolution_final["resolution"] != " "]
answered_ids = list(answered_rows["sent_id"])
#Golden rows restricted to the answered ids.
df_golden_reduced = df_golden[df_golden.sent_id.isin(answered_ids)]
df_golden_reduced.to_csv('golden_resolution_reduced.csv',index=False, quotechar='"',quoting=csv.QUOTE_NONNUMERIC)
answered_rows.to_csv('resolution_reduced.csv',index=False, quotechar='"',quoting=csv.QUOTE_NONNUMERIC)