In [1]:
import pandas as pd
import spacy
from spacy import displacy
from string import punctuation

In [None]:
'''
flow
1) load data and spacy
2) run NER
3) identify nouns
4) order nouns ascending by frequency
5) create list with search terms and preference per row
6) load redirects and article mapping
7) identify best article
8) get category tree
'''

In [2]:
# Raise pandas' display limits (rows, columns, column width) for notebook output.
display_limits = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.max_colwidth': 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [3]:
# Load the question table from a local pickle (de-duplicated, per the file name).
# NOTE(review): the path is absolute and specific to one machine, and unpickling
# can execute arbitrary code - only load this file from a trusted source.
t_data = pd.read_pickle('C:/Users/Fredi/Kodningsprojekt/data-analysis/workproduct-files/cleaned-dataframes/t_dataMaster-duplicatesRemoved.pkl')


In [4]:
# Load the spaCy pipeline once; the helper functions in this notebook use this global `nlp`.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Ask spaCy for the human-readable description of the "obj" label.
spacy.explain("obj")

'object'

In [82]:
# Sample question that the token-inspection cells parse.
text = "Which Egyptian queen was married to king Akhenaten?"

In [7]:
# https://spacy.io/usage/spacy-101
# Parse the sample text and print each token next to its lemma.
# The pipeline loaded earlier in the notebook is reused; loading the model a
# second time in this cell only repeated that work.
doc = nlp(text)

for token in doc:
    print(token.text, token.lemma_)

Processing process
raw raw
text text
intelligently intelligently
is be
difficult difficult
: :
most most
words word
are be
rare rare
, ,
and and
it -PRON-
’s ’
common common
for for
words word
that that
look look
completely completely
different different
to to
mean mean
almost almost
the the
same same
thing thing
. .


In [8]:
#https://spacy.io/api/annotation#pos-tagging

In [83]:
# Parse the sample text and print, per token: text, part-of-speech tag,
# dependency label, stop-word flag and lemma; then render the dependency view.
doc = nlp(text)
for token in doc:
    token_row = (token.text, token.pos_, token.dep_, token.is_stop, token.lemma_)
    print(*token_row)
displacy.render(doc, style="dep")

Which DET det True which
Egyptian ADJ amod False egyptian
queen NOUN nsubj False queen
was AUX ROOT True be
married ADJ acomp False married
to ADP prep True to
king NOUN pobj False king
Akhenaten PROPN pobj False Akhenaten
? PUNCT punct False ?


In [10]:
def namedEntities(a):
    """Return the named entities found in ``a``.

    The text is run through the module-level ``nlp`` pipeline and every entry
    of ``doc.ents`` is reported as a ``(text, label)`` tuple, in the order the
    pipeline yields them.
    """
    return [(entity.text, entity.label_) for entity in nlp(a).ents]

In [11]:
def getSubjects(a):
    """Return the lemmas of the subject tokens in ``a``.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    Every token whose dependency label is ``csubj`` or ``nsubj`` contributes
    its lemma; stop words and punctuation tokens are ignored.

    Parameters
    ----------
    a : str
        Text to analyse.

    Returns
    -------
    list of str
        Lemmas in token order; empty when no subject token is found.
    """
    subject_deps = ('csubj', 'nsubj')
    stop_words = nlp.Defaults.stop_words
    lemmas = []
    for token in nlp(a.lower()):
        # `token.text in punctuation` is a substring test against the
        # punctuation string, so multi-character tokens such as "--" or "..."
        # slipped through. Check every character instead.
        is_punctuation = all(char in punctuation for char in token.text)
        if token.text in stop_words or is_punctuation:
            continue
        if token.dep_ in subject_deps:
            lemmas.append(token.lemma_)

    return lemmas

In [12]:
def getObjects(a):
    """Return the lemmas of the object tokens in ``a``.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    Every token whose dependency label is ``iobj``, ``obj``, ``dobj`` or
    ``pobj`` contributes its lemma; stop words and punctuation tokens are
    ignored.

    Parameters
    ----------
    a : str
        Text to analyse.

    Returns
    -------
    list of str
        Lemmas in token order; empty when no object token is found.
    """
    object_deps = ('iobj', 'obj', 'dobj', 'pobj')
    stop_words = nlp.Defaults.stop_words
    lemmas = []
    for token in nlp(a.lower()):
        # `token.text in punctuation` is a substring test against the
        # punctuation string, so multi-character tokens such as "--" or "..."
        # slipped through. Check every character instead.
        is_punctuation = all(char in punctuation for char in token.text)
        if token.text in stop_words or is_punctuation:
            continue
        if token.dep_ in object_deps:
            lemmas.append(token.lemma_)

    return lemmas

In [75]:
def getNouns(a):
    """Return the lemmas of the common nouns in ``a``.

    The text is parsed as given (original casing) with the module-level
    ``nlp`` pipeline. Every token tagged ``NOUN`` contributes its lemma; stop
    words and punctuation tokens are ignored.

    Parameters
    ----------
    a : str
        Text to analyse.

    Returns
    -------
    list of str
        Lemmas in token order; empty when no noun is found.
    """
    noun_tags = ('NOUN',)
    stop_words = nlp.Defaults.stop_words
    lemmas = []
    for token in nlp(a):
        # The text is no longer lower-cased before parsing, so compare the
        # lower-cased token; otherwise a capitalised stop word is not filtered,
        # unlike in getSubjects/getObjects, which lower-case the whole text.
        is_stop_word = token.text.lower() in stop_words
        # `token.text in punctuation` is a substring test against the
        # punctuation string, so multi-character tokens such as "--" or "..."
        # slipped through. Check every character instead.
        is_punctuation = all(char in punctuation for char in token.text)
        if is_stop_word or is_punctuation:
            continue
        if token.pos_ in noun_tags:
            lemmas.append(token.lemma_)

    return lemmas

In [76]:
# Quick check on a single question; getNouns only returns tokens tagged NOUN.
getNouns("Which composer wrote the classical piece Messiah?")

['composer', 'piece']

In [14]:
# Work on a sample: slice the index from label 2 in steps of 100 and keep only
# the question text, wrapped in its own DataFrame so columns can be added.
questions = pd.DataFrame(t_data.loc[2::100, "CONS_question"])

In [15]:
# Named entities per question, as (text, label) tuples.
questions["namedEntities"] = questions["CONS_question"].apply(namedEntities)

In [16]:
# Subject lemmas per question.
questions["subjects"] = questions["CONS_question"].apply(getSubjects)

In [17]:
# Object lemmas per question.
questions["objects"] = questions["CONS_question"].apply(getObjects)

In [77]:
# Noun lemmas per question.
questions["nouns"] = questions["CONS_question"].apply(getNouns)

In [19]:
# Preview the first ten sampled questions with their extracted features.
questions.head(10)

Unnamed: 0,CONS_question,namedEntities,subjects,objects,nouns
2,"""The Diary of Anne Frank"" was first published in English under what title?","[(The Diary of Anne Frank, WORK_OF_ART), (first, ORDINAL), (English, LANGUAGE)]",[],"[frank, english, title]","[diary, title]"
102,"In one of Donald Horne's novels, which was ""the lucky country""?","[(one, CARDINAL), (Donald Horne's, PERSON)]",[],[novel],"[novel, country]"
202,What Dr Seuss character steals Christmas?,"[(Seuss, PERSON), (Christmas, DATE)]",[character],[christmas],[]
302,"Which famous book begins with the line ""The mole had been working very hard all the morning, spring-cleaning his little home""?","[(all the morning, TIME)]","[book, mole]",[line],"[book, line, mole, morning, spring, home]"
402,Who was Winnie the Pooh's neighbour?,"[(Winnie the Pooh's, PERSON)]",[],[],[neighbour]
502,As who is Terry Bollea known?,"[(Terry Bollea, PERSON)]",[],[],[]
602,"In the opera 'Don Giovanni', what was Leporello?","[(Don Giovanni', PERSON), (Leporello, PERSON)]",[],[giovanni],[leporello]
702,Secret Identities: Wally West,[],[],[],[identity]
802,What film featured a cat named Mr. Bigglesworth?,"[(Bigglesworth, PERSON)]",[film],[cat],"[film, cat]"
902,What was Kevin Bacon's first big hit?,"[(Kevin Bacon's, PERSON), (first, ORDINAL)]",[],[],[hit]


In [20]:
# Load the unigram frequency table (columns "word" and "count").
# keep_default_na=False stops pandas from converting words that look like NA
# markers (for example "null" or "nan") into missing values, so such words can
# still be matched by the frequency lookups.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
word_frequencies = pd.read_csv("F:/Word frequencies/unigram_freq.csv", keep_default_na=False)

In [21]:
# Peek at the last column (the noun lists at this point) for the first two sampled questions.
questions.iloc[0:2,-1]

2        [diary, title]
102    [novel, country]
Name: nouns, dtype: object

In [22]:
# Look up the corpus counts of a hand-picked word list in the frequency table.
to_find1 = ["state", "cheyenne", "capital", "city"]
to_find = questions.iloc[1,-1]  # not used further in this cell
word_frequencies[word_frequencies["word"].isin(to_find1)]

Unnamed: 0,word,count
110,state,453104133
138,city,390564835
1162,capital,68991999
16472,cheyenne,2224473


In [23]:
# For each of the first two rows, select the frequency-table rows whose word
# appears in that row's last-column list; each element of `a` is a small DataFrame.
a = questions.iloc[0:2,-1].apply(lambda x: word_frequencies[word_frequencies["word"].isin(x)])
a

2                   word      count
377   title  196676017
4585  diary   14922472
102             word      count
395   country  188691168
3455    novel   21821189
Name: nouns, dtype: object

In [38]:
def frequencyCount(a, frequencies=None, missing="noValueFound"):
    """Look up the corpus frequency of every word in ``a``.

    Parameters
    ----------
    a : iterable of str
        Words to look up, e.g. one entry of the ``nouns`` column.
    frequencies : pandas.DataFrame, optional
        Table with a ``word`` and a ``count`` column. Defaults to the
        module-level ``word_frequencies`` frame.
    missing : object, optional
        Placeholder returned for words that are not in the table. The default
        keeps the ``"noValueFound"`` marker; pass a numeric value such as
        ``float("inf")`` when the result has to stay comparable, because
        ``min``/sorting fail on a mix of numbers and strings.

    Returns
    -------
    list
        One entry per input word, in input order: the ``count`` of the first
        matching row, or ``missing`` when the word is absent.
    """
    if frequencies is None:
        frequencies = word_frequencies
    words = list(a)
    if not words:
        return []
    # Scan the table once for all requested words instead of once per word.
    matches = frequencies.loc[frequencies["word"].isin(words), ["word", "count"]]
    # Keep the first row per word, which is what a per-word `.iloc[0]` returns.
    first_counts = matches.drop_duplicates(subset="word").set_index("word")["count"]
    return [
        first_counts.at[word] if word in first_counts.index else missing
        for word in words
    ]
    

In [78]:
%%time
questions["nounFrequencies"] = questions["nouns"].apply(lambda x: frequencyCount(x))

Wall time: 21 s


In [79]:
# Spot-check a later slice of the sample (row positions 470-499).
questions.iloc[470:500]

Unnamed: 0,CONS_question,namedEntities,subjects,objects,nouns,nounFrequencies
47002,"When found on a vehicle’s speedometer, what do the letters MPH stand for?","[(MPH, ORG)]",[mph],[vehicle],"[vehicle, speedometer, letter]","[54079229, 348124, 67339854]"
47102,Which planet in our solar system has an axis that is tilted by 98 degrees?,"[(98 degrees, QUANTITY)]",[planet],"[system, axis, degree]","[planet, system, axis, degree]","[26405032, 396975018, 12399723, 67029686]"
47202,“Being and Time” is an ontological treatise written by which German philosopher?,"[(Being and Time, WORK_OF_ART), (German, NORP)]",[],[philosopher],"[time, treatise, philosopher]","[908705570, 987068, 2718992]"
47302,"According to Guiness World Records, which author has the most published works?","[(Guiness World Records, ORG)]",[author],"[records, work]","[author, work]","[179813446, 419483948]"
47402,"In 2012, who was declared the fastest man alive?","[(2012, DATE)]",[man],[2012],[man],[181445531]
47502,Crusiverbalism refers to what activity?,[],[crusiverbalism],[activity],"[crusiverbalism, activity]","[noValueFound, 79633609]"
47602,"Who wrote the novel ""Across the River and into the Trees""?","[(Across the River, WORK_OF_ART), (Trees, GPE)]",[],"[novel, river, tree]",[novel],[21821189]
47702,"Which composer wrote the classical piece ""Messiah""?",[],[composer],[piece],"[composer, piece]","[8839743, 44298807]"
47802,"""One Giant Leap"" was the first biography ever written of whom?","[(One, CARDINAL), (first, ORDINAL)]",[leap],[],[biography],[23134870]
47902,"What band released the landmark album ""London Calling""?","[(London Calling, WORK_OF_ART)]",[band],"[album, call]","[band, landmark, album]","[68569061, 4645821, 77914316]"


In [51]:
# Position of the smallest entry in the last-column list of row 3
# (presumably the rarest noun, assuming the last column is nounFrequencies - confirm).
# NOTE(review): min() cannot compare numbers with the "noValueFound" string
# placeholder, so this fails for rows that contain an unknown word.
values = questions.iloc[3,-1]
values.index(min(values))

2

In [25]:
# Select the noun list by column name: the positional column -1 is
# "nounFrequencies" once that column has been added, so a top-to-bottom run
# would otherwise look up numbers instead of words.
test = frequencyCount(questions["nouns"].iloc[1])

In [31]:
# Show the noun list of the second sampled question. Selected by column name,
# because the positional column -1 changes once "nounFrequencies" is added.
questions["nouns"].iloc[1]

['novel', 'country']

In [26]:
# Display the frequency lookup result stored in `test`.
test

[21821189, 188691168]

In [37]:
# Demonstrates why frequencyCount needs a fallback: a word that is absent from
# the table yields an empty selection, and `.iloc[0]` on it raises IndexError.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    word_frequencies.loc[word_frequencies["word"] == "alskdugjasökdg", "count"].iloc[0]
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: single positional indexer is out-of-bounds

In [28]:
# Value in row 0, column 1 of the frequency table (the count of the first listed word).
word_frequencies.iloc[0,1]

23135851162

In [29]:
#t_data.CONS_category.unique()

In [30]:
#t_data.loc[22502]