# Key word identification

In this notebook we identify key words in sentences that are likely to correspond to the Wikipedia article(s) that are most relevant.

In [1]:
import pandas as pd
import spacy
from string import punctuation, digits
import re

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

In [3]:
# Question data
t_data = pd.read_pickle('C:/Users/Fredi/Kodningsprojekt/data-analysis/workproduct-files/cleaned-dataframes/t_dataMaster-duplicatesRemoved.pkl')

In [4]:
# word frequency data
word_frequencies = pd.read_csv("F:/Word frequencies/unigram_freq.csv")

In [5]:
nlp = spacy.load("en_core_web_sm")

#### Functions

In [6]:
def namedEntities(a):
    doc = nlp(a)
    toReturn = []
    
    #If input is title-case, convert to sentence
    if a.istitle():
        a = a.capitalize()
    
    for ent in doc.ents:
        toReturn.append((ent.text, ent.label_))

    return toReturn

In [7]:
def getNouns(a):
    
    #Identify substrings between quotes
    
    quotes1 = re.compile('"[^"]*"')
    quotes2 = re.compile("'[^']*'")
    quoteRanges = []
    toReturn = []
    
    for value in quotes1.finditer(a.replace("'s", "!s")):
        quoteRanges.append(value.span())
        toReturn.append(value.group().replace("!s", "'s")[1:-1])
    for value in quotes2.finditer(a.replace("'s", "!s")):
        quoteRanges.append(value.span())
        toReturn.append(value.group().replace("!s", "'s")[1:-1])
    
    #If input is title-case, convert to sentence
    if a.istitle():
        a = a.capitalize()
    
    doc = nlp(a)
    pos_tag = ['NOUN']
    
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        
        #Exclude words within quotes
        if any(start <= token.idx <= end for (start, end) in quoteRanges):
            continue
        
        if(token.pos_ in pos_tag):
            toReturn.append(token.lemma_)
    
    return toReturn

In [8]:
def getSubjects(a):
    doc = nlp(a.lower())
    toReturn = []
    dep_tag = ['csubj', 'nsubj']
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        if(token.dep_ in dep_tag):
            toReturn.append(token.lemma_)
    
    return toReturn

In [9]:
def getObjects(a):
    doc = nlp(a.lower())
    toReturn = []
    dep_tag = ['iobj', 'obj', 'dobj', 'pobj']
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        if(token.dep_ in dep_tag):
            toReturn.append(token.lemma_)
    
    return toReturn

In [10]:
def frequencyCount(a):
    result = []
    for word in a:
        try:
            result.append([word, word_frequencies.loc[word_frequencies["word"] == word, "count"].iloc[0] ])
        except IndexError:
            # If no count is found, add 0 as value to ensure being sorted first in searchTerms()
            result.append([word, 0])
    return result    
    

In [11]:
# FUNCTION: Identify wikipedia redirect terms to look for (in priority order)
# 1. Check if named entities exist and add in order of priority (WORK_OF_ART, PERSON, FAC, LOC)
# 2. Add nouns in ascending frequency order (rarest first)
# 3. Remove "the" / "a" / "an" / quotes from beginning of named entities

# Takes a question row as input


def searchTerms(a):
    result = []
    
    for ent in a["namedEntities"]:
        if ent[1] == "WORK_OF_ART":
            result.append(ent[0])
    for ent in a["namedEntities"]:
        if ent[1] == "PERSON":
            result.append(ent[0])    
    for ent in a["namedEntities"]:
        if ent[1] == "FAC" or ent[1] == "LOC":
            result.append(ent[0])
    
    # Update to account for objects and subjects
    
    # sort nounsObjectsSubjects by nounFrequencies
    # https://www.kite.com/python/answers/how-to-sort-a-list-of-lists-by-an-index-of-each-inner-list-in-python
    # sorted_list = sorted(nested_list, key=lambda x: x[1])
    wordsSorted = sorted(a["nounsObjectsSubjects"], key=lambda x: x[1])
    for word in wordsSorted:
        result.append(word[0])
        
    # Clean-up, iterate over each value in result list          
    for i in range(len(result)):
        try:
            if result[i][:4].lower() == "the ":
                result[i] = result[i][4:]
            if result[i][:2].lower() == "a ":
                result[i] = result[i][2:]
            if result[i][:3].lower() == "an ":
                result[i] = result[i][3:]
            if result[i][0].lower() == "'" or result[i][0].lower() == '"':
                result[i] = result[i][1:]
            if result[i][-1].lower() == "'" or result[i][-1].lower() == '"':
                result[i] = result[i][:-1]
            if result[i][-2:].lower() == "'s":
                result[i] = result[i][:-2]
        except:
            continue
    return result

#### Identifying key words

In [12]:
questions = t_data

In [13]:
%%time
questions["namedEntities"] = questions["CONS_question"].apply(lambda x: namedEntities(x))

Wall time: 7min 1s


In [14]:
%%time
# Extract namedEntities from answer string
questions["namedEntities_answer"] = questions["CONS_answer"].apply(lambda x: namedEntities(x))

# Combine namedEntities from question and answer into same list
questions["namedEntities"] = questions["namedEntities"] + questions["namedEntities_answer"]

# Drop namedEntities_answer column
questions = questions.drop("namedEntities_answer", axis = 1)

Wall time: 6min 34s


In [15]:
%%time
questions["nouns"] = questions["CONS_question"].apply(lambda x: getNouns(x))

Wall time: 7min 40s


In [16]:
%%time
questions["objects"] = questions["CONS_question"].apply(lambda x: getObjects(x))

Wall time: 7min 14s


In [17]:
%%time
questions["subjects"] = questions["CONS_question"].apply(lambda x: getSubjects(x))

Wall time: 7min 46s


In [18]:
%%time
# Combine nouns, objects and subjects into same list
questions["nounsObjectsSubjects"] = questions["nouns"] + questions["objects"] + questions["subjects"]
# Remove duplicates
questions["nounsObjectsSubjects"] = questions["nounsObjectsSubjects"].apply(lambda x: list(dict.fromkeys(x)))

Wall time: 203 ms


In [19]:
%%time
# Add frequency count to nounsObjectsSubjects
questions["nounsObjectsSubjects"] = questions["nounsObjectsSubjects"].apply(lambda x: frequencyCount(x))

Wall time: 54min 55s


In [20]:
%%time
questions["searchTerms"] = questions.apply(searchTerms, axis = 1)
# Remove duplicates
questions["searchTerms"] = questions["searchTerms"].apply(lambda x: list(dict.fromkeys(x)))

Wall time: 5.04 s


In [21]:
questions.to_pickle("../workproduct-files/t_dataMaster-keywordsIdentified.pkl")