In [23]:
# TODO
# - Add subjects and objects (see getSubjects / getObjects) to identify nouns not otherwise identified
# - Remove duplicated searchTerms (keep the first occurrence)

In [24]:
import pandas as pd
import spacy
from string import punctuation, digits
import re

In [25]:
# Widen pandas' display limits so long question rows are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 1000

In [26]:
# Question data (de-duplicated master table), loaded from a pickle file.
# NOTE(review): this is an absolute, machine-specific path, whereas the result
# is later written to a relative "../workproduct-files/" path - consider making
# both relative to one configurable data directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
t_data = pd.read_pickle('C:/Users/Fredi/Kodningsprojekt/data-analysis/workproduct-files/cleaned-dataframes/t_dataMaster-duplicatesRemoved.pkl')

In [27]:
# Word frequency data: frequencyCount() looks words up in the "word" column
# and reads their frequency from the "count" column.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
word_frequencies = pd.read_csv("F:/Word frequencies/unigram_freq.csv")

In [28]:
# Load the spaCy "en_core_web_sm" pipeline once; the helper functions below
# use this module-level `nlp` object to parse question text.
nlp = spacy.load("en_core_web_sm")

#### Functions

In [29]:
def namedEntities(a):
    """Return the named entities spaCy finds in a question.

    Parameters
    ----------
    a : str
        The question text.

    Returns
    -------
    list of (str, str)
        One ``(entity text, entity label)`` tuple per entity, in document order.
    """
    # If input is title-case, convert to sentence case. This has to happen
    # BEFORE the text is parsed; previously the text was parsed first, so the
    # conversion never influenced the result (getNouns already normalises
    # before parsing).
    if a.istitle():
        a = a.capitalize()

    doc = nlp(a)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [30]:
def getNouns(a):
    """Collect quoted phrases and noun lemmas from a question.

    Quoted substrings (double-quoted first, then single-quoted) are returned
    verbatim, without their surrounding quotes. Tokens that start inside a
    quoted span are skipped so their words are not reported a second time.
    Remaining tokens tagged ``NOUN`` are returned as lemmas; stop words and
    punctuation are ignored.

    Parameters
    ----------
    a : str
        The question text.

    Returns
    -------
    list of str
        Quoted phrases followed by noun lemmas in document order.
    """
    # Identify substrings between quotes.
    # NOTE(review): the single-quote pattern also matches text lying between
    # two apostrophes (e.g. two possessives), which is then treated as a
    # quoted phrase.
    quotePatterns = (re.compile('"[^"]*"'), re.compile("'[^']*'"))
    quoteRanges = []
    toReturn = []

    for pattern in quotePatterns:
        for value in pattern.finditer(a):
            quoteRanges.append(value.span())
            toReturn.append(value.group()[1:-1])

    # If input is title-case, convert to sentence
    if a.istitle():
        a = a.capitalize()

    doc = nlp(a)

    for token in doc:
        if token.text in nlp.Defaults.stop_words or token.text in punctuation:
            continue

        # Exclude words within quotes. A span's end offset is exclusive, so a
        # token that starts exactly at `end` lies just after the closing quote
        # and must be kept (hence `<`, not `<=`).
        if any(start <= token.idx < end for (start, end) in quoteRanges):
            continue

        if token.pos_ == 'NOUN':
            toReturn.append(token.lemma_)

    return toReturn

In [31]:
def getSubjects(a):
    """Return the lemmas of the subject tokens of a question.

    The text is lower-cased before parsing. A token is kept when its
    dependency label is ``csubj`` or ``nsubj`` and its text is neither a
    stop word nor punctuation.
    """
    subject_deps = ('csubj', 'nsubj')
    stop_words = nlp.Defaults.stop_words
    return [
        tok.lemma_
        for tok in nlp(a.lower())
        if tok.text not in stop_words
        and tok.text not in punctuation
        and tok.dep_ in subject_deps
    ]

In [32]:
def getObjects(a):
    """Return the lemmas of the object tokens of a question.

    The text is lower-cased before parsing. A token is kept when its
    dependency label is one of ``iobj``, ``obj``, ``dobj`` or ``pobj`` and
    its text is neither a stop word nor punctuation.
    """
    object_deps = ('iobj', 'obj', 'dobj', 'pobj')
    stop_words = nlp.Defaults.stop_words
    return [
        tok.lemma_
        for tok in nlp(a.lower())
        if tok.text not in stop_words
        and tok.text not in punctuation
        and tok.dep_ in object_deps
    ]

In [33]:
def frequencyCount(a):
    """Pair every word with its count from the ``word_frequencies`` table.

    Parameters
    ----------
    a : iterable of str
        Words (or phrases) to look up.

    Returns
    -------
    list of [word, count]
        One ``[word, count]`` pair per input word, in input order. The count
        is taken from the first row whose "word" equals the input; words not
        present in the table get 0 so that they sort first in searchTerms().
    """
    words = list(a)
    if not words:
        return []

    # Scan the frequency table once per call instead of once per word.
    matches = word_frequencies.loc[
        word_frequencies["word"].isin(words), ["word", "count"]
    ]

    # Keep only the first count seen for each word (the first matching row).
    first_count = {}
    for known_word, count in zip(matches["word"].to_numpy(), matches["count"].to_numpy()):
        first_count.setdefault(known_word, count)

    # If no count is found, add 0 as value to ensure being sorted first in searchTerms()
    return [[word, first_count.get(word, 0)] for word in words]
    

In [34]:
# FUNCTION: Identify Wikipedia redirect terms to look for (in priority order)
# 1. Add named entities, if any, in order of priority (WORK_OF_ART, then PERSON, then FAC / LOC)
# 2. Add nouns in ascending frequency order (rarest first)
# 3. Clean up every term: remove a leading "the" / "a" / "an", surrounding quotes and a trailing "'s"

# Takes a question row as input


def _cleanSearchTerm(term):
    """Strip leading articles, surrounding quotes and a trailing "'s" from one term."""
    # Best effort: values that are not strings are passed through untouched.
    if not isinstance(term, str):
        return term

    # Each article is checked once, in this order, case-insensitively.
    for article in ("the ", "a ", "an "):
        if term[:len(article)].lower() == article:
            term = term[len(article):]

    # startswith/endswith are safe on an empty string, so no exception
    # handling is needed when a term is (or becomes) empty.
    if term.startswith(("'", '"')):
        term = term[1:]
    if term.endswith(("'", '"')):
        term = term[:-1]
    if term[-2:].lower() == "'s":
        term = term[:-2]
    return term


def searchTerms(a):
    """Build the prioritised list of search terms for one question row.

    Parameters
    ----------
    a : mapping (e.g. a DataFrame row)
        Must provide ``a["namedEntities"]``, a sequence of ``(text, label)``
        pairs, and ``a["nouns"]``, a sequence of ``[noun, frequency]`` pairs.

    Returns
    -------
    list
        Entity texts labelled WORK_OF_ART, then PERSON, then FAC / LOC (each
        group in its original order), followed by the nouns in ascending
        frequency order (rarest first). Every term is cleaned of a leading
        "the " / "a " / "an ", surrounding quote characters and a trailing
        "'s".
    """
    result = []

    # Named entities, highest-priority label group first.
    for labels in (("WORK_OF_ART",), ("PERSON",), ("FAC", "LOC")):
        result.extend(ent[0] for ent in a["namedEntities"] if ent[1] in labels)

    # Nouns, rarest first; sorted() is stable, so ties keep their input order.
    result.extend(noun[0] for noun in sorted(a["nouns"], key=lambda x: x[1]))

    return [_cleanSearchTerm(term) for term in result]

#### Identifying key words

In [10]:
# Work on the question data under the name `questions`.
# Note: this binds a second name to the same object (no copy is made), so the
# columns added to `questions` below are also visible through `t_data`.
questions = t_data

In [11]:
%%time
# Named entities found in each question
questions["namedEntities"] = questions["CONS_question"].apply(namedEntities)

Wall time: 6min 43s


In [12]:
%%time
# Quoted phrases and noun lemmas found in each question
questions["nouns"] = questions["CONS_question"].apply(getNouns)

Wall time: 7min 1s


In [13]:
%%time
# Replace each noun with a [noun, frequency count] pair
questions["nouns"] = questions["nouns"].apply(frequencyCount)

Wall time: 39min 47s


In [14]:
%%time
# Build the prioritised search terms row by row
questions["searchTerms"] = questions.apply(searchTerms, axis="columns")

Wall time: 4.79 s


In [15]:
# Persist the questions, including the namedEntities / nouns / searchTerms
# columns added above, for later use.
questions.to_pickle("../workproduct-files/t_dataMaster-keywordsIdentified.pkl")

In [20]:
# Spot-check the results: take every 100th row starting from the 8th,
# then show rows 300-399 of that sample.
questions[7::100][300:400]

Unnamed: 0,CONS_id,CONS_question,CONS_answer,CONS_alt answers,CONS_category,CONS_alt categories - NOT USED,CONS_type-formulation,CONS_type-multipleChoice,ORIG_id,ORIG_question,ORIG_answer,ORIG_alt answers,ORIG_category,ORIG_alt categories,ORIG_difficulty,ORIG_type,Source,Duplicate_removed,namedEntities,nouns,searchTerms
30007,tdb_0x007cdb,What is Ordune?,Arousal by nude pics,,Uncategorized,,Question,False,tdb_0x007cdb,What is Ordune,0,[Arousal by nude pics],UNCATEGORIZED,,,,tdb,,"[(Ordune, PERSON)]",[],[Ordune]
30107,tdb_0x007d44,What is the 1983 bryan adams album which features the hit 'cuts like a knife'?,Cuts like a knife,,Uncategorized,,Question,False,tdb_0x007d44,What is the 1983 bryan adams album which features the hit 'cuts like a knife',0,[Cuts like a knife],UNCATEGORIZED,,,,tdb,,"[(1983, DATE)]","[[cuts like a knife, 0], [hit, 53171478]]","[cuts like a knife, hit]"
30207,tdb_0x007db1,What is the bundestag?,German parliament,,Uncategorized,,Question,False,tdb_0x007db1,What is the bundestag,0,[German parliament],UNCATEGORIZED,,,,tdb,,[],"[[bundestag, 116336]]",[bundestag]
30307,tdb_0x007eb5,What is the capital of Sao Tome and Principe?,Sao tome,,Geography and places,,Question,False,tdb_0x007eb5,What is the Capital of: Sao Tome and Principe,0,[Sao tome],UNCATEGORIZED,,,,tdb,[rtg_0x000309],"[(Sao Tome, ORG)]","[[capital, 68991999]]",[capital]
30407,tdb_0x007f43,What is the commonest item traded internationally?,Petroleum and its by products,,Uncategorized,,Question,False,tdb_0x007f43,What is the commonest item traded internationally,0,[Petroleum and its by products],UNCATEGORIZED,,,,tdb,,[],"[[item, 296534935]]",[item]
30507,tdb_0x007fb0,What is the equivalent RAF rank to Sub-lieutenant RN and Lieutenant in the army?,Flying officer,,Uncategorized,,Question,False,tdb_0x007fb0,What is the equivalent RAF rank to Sub-lieutenant RN and Lieutenant in the army,0,[Flying officer],UNCATEGORIZED,,,,tdb,,[],"[[rank, 36408196], [lieutenant, 3946806], [army, 44670729]]","[lieutenant, rank, army]"
30607,tdb_0x008015,What is the fear of lawsuits known as?,Liticaphobia,,Uncategorized,,Question,False,tdb_0x008015,What is the fear of lawsuits known as,0,[Liticaphobia],UNCATEGORIZED,,,,tdb,,[],"[[fear, 28042145], [lawsuit, 6381221]]","[lawsuit, fear]"
30707,tdb_0x008079,What is the fear of teeth or dental surgery known as?,Odontophobia,,Uncategorized,,Question,False,tdb_0x008079,What is the fear of teeth or dental surgery known as,0,[Odontophobia],UNCATEGORIZED,,,,tdb,,[],"[[fear, 28042145], [tooth, 5551049], [surgery, 40026119]]","[tooth, fear, surgery]"
30807,tdb_0x0080df,What is the flower that stands for: argument?,Fig,,Uncategorized,,Question,False,tdb_0x0080df,What is the flower that stands for: argument,0,[Fig],UNCATEGORIZED,,,,tdb,,[],"[[flower, 33364539], [argument, 38902162]]","[flower, argument]"
30907,tdb_0x008143,What is the flower that stands for: poverty?,Evergreen clematis,,Uncategorized,,Question,False,tdb_0x008143,What is the flower that stands for: poverty,0,[Evergreen clematis],UNCATEGORIZED,,,,tdb,,[],"[[flower, 33364539], [poverty, 19430836]]","[poverty, flower]"


# TEST

In [37]:
# Sample question for a quick manual check of the dependency-based helpers
text = "Who is the roman counterpart of aphrodite?"

In [38]:
# The stored output for this call is ['aphrodite'].
getObjects(text)

['aphrodite']