In [114]:
import nltk
import string
from nltk import pos_tag
import spacy
import numpy as np
from nltk import grammar, parse
from collections import Counter 
from cleantext import clean
import re

In [115]:
nlp = spacy.load("en")

In [116]:
filename = '../data/test2.txt'
f = open(filename, "r")
contents = f.read()

clean("some input",
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # fully remove punctuation
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="",
    replace_with_currency_symbol="",
    lang="en"                       # set to 'de' for German special handling
)

contents = re.sub(r'\d+', '', contents)
contents = re.sub(r"""
               [,;@#?!&$\"\'\[\]]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               contents, flags=re.VERBOSE)
contents = re.sub(' +', ' ', contents)

In [117]:
utterances = []
split_utterances = []
sentences = contents.strip().split('.')
for sentence in sentences:
    if sentence != '':
        utterances.append(sentence.strip())

In [118]:
## pos tags
pos_tags_utterances = []

# for utterance in utterances:
#     pos_tags_utterances.append(pos_tag(utterance.split(' ')))

for utterance in utterances:
    doc = nlp(utterance)
    temp_list = []
    for token in doc:
        temp_list.append((token.text, token.tag_))
    pos_tags_utterances.append(temp_list)
print(pos_tags_utterances)

[[('John', 'NNP'), ('went', 'VBD'), ('to', 'IN'), ('his', 'PRP$'), ('favorite', 'JJ'), ('music', 'NN'), ('store', 'NN'), ('to', 'TO'), ('buy', 'VB'), ('a', 'DT'), ('piano', 'NN')], [('He', 'PRP'), ('was', 'VBD'), ('excited', 'JJ'), ('that', 'IN'), ('he', 'PRP'), ('could', 'MD'), ('finally', 'RB'), ('buy', 'VB'), ('a', 'DT'), ('piano', 'NN')], [('He', 'PRP'), ('arrived', 'VBD'), ('just', 'RB'), ('as', 'IN'), ('the', 'DT'), ('store', 'NN'), ('was', 'VBD'), ('closing', 'VBG'), ('for', 'IN'), ('the', 'DT'), ('day', 'NN')], [('It', 'PRP'), ('was', 'VBD'), ('closing', 'VBG'), ('just', 'RB'), ('as', 'IN'), ('John', 'NNP'), ('arrived', 'VBD')]]


In [119]:
# topic of first sentence
processed_utterance = nlp(utterances[0])
init_topic = ''
for token in processed_utterance:
    if "NN" in token.tag_:
        init_topic = token.text
        break

In [120]:
# Centers of 
total_utterances = len(utterances)
tagged_utterances = utterances

Cb = [None] * total_utterances

Cf = []
for i in range(total_utterances):
    Cf.append([])

Cb[0] = "undefined"
for i in range(total_utterances):
    for token in pos_tags_utterances[i]:
        if "NN" in token[1]:
            Cf[i].append((token[0].lower(), token[1]))
        if i!=0:
            if "PRP" == token[1]:
                if token[0].lower() == "he" or token[0].lower() == "she":
                    for pos in Cf[i-1]:
                        if pos[1] == "NNP":
                            if (pos[0].lower(), "NNP") not in Cf[i]:
                                Cf[i].append((pos[0].lower(), "NNP"))
                                break
                elif token[0].lower() == "it":
                    for pos in Cf[i-1]:
                        if pos[1] == "NN":
                            if (pos[0].lower(), "NN") not in Cf[i]:
                                Cf[i].append((pos[0].lower(), "NN"))
                                break
                elif token[0].lower() == 'they':
                    for pos in Cf[i-1]:
                        if pos[1] == "NNS":
                            if (pos[0].lower(), "NN") not in Cf[i]:
                                Cf[i].append((pos[0].lower(), "NNS"))
                                break

    if i != 0:
        try:
            Cb[i] = Cf[i][0]
        except:
            Cb[i] = Cb[i-1]
print(Cb)

['undefined', ('john', 'NNP'), ('john', 'NNP'), ('store', 'NN')]


In [121]:
topics_utterances = []
focus_utterances = []
topics_utterances.append(init_topic)
for tuple_val in Cb:
    if tuple_val != "undefined":
        if tuple_val[1] == "NNP":
            topics_utterances.append(tuple_val[0].capitalize())
        else:
            topics_utterances.append(tuple_val[0])
print(topics_utterances)

['John', 'John', 'John', 'store']


In [122]:
topic_dict = {}
for topic in topics_utterances:
    if len(topic) != 1:
        if topic.lower() not in topic_dict.keys():
            topic_dict[topic.lower()] = 1
        else:
            topic_dict[topic.lower()] += 1
        
k  = Counter(topic_dict)
if len(topic_dict)//2 < 3:
    top_topics_num = len(topic_dict)//2 + 1
else:
    top_topics_num = 3
top_topics_list = k.most_common(top_topics_num)


print("Topics present in the discourse are:")
for topic in top_topics_list:
    print(topic[0])

Topics present in the discourse are:
john
store


In [123]:
print("Topic of every sentence:")
for utterance, utterance_topic in zip(utterances, topics_utterances):
    print(utterance, ":", utterance_topic)

Topic of every sentence:
John went to his favorite music store to buy a piano : John
He was excited that he could finally buy a piano : John
He arrived just as the store was closing for the day : John
It was closing just as John arrived : store


In [124]:
i = 0
topic_sentence = []
focus_sentence = []
for utterance, utterance_topic in zip(utterances, topics_utterances):
    processed_utterance = nlp(utterance)
    topic_chunk = ''
    focus_chunk = ''
    for chunk in processed_utterance.noun_chunks:
        if chunk.root.dep_ == "nsubj":
            if utterance_topic in chunk.text:
                topic_chunk = chunk.text
                focus_chunk = utterance.lower().replace(topic_chunk.lower(), "")
                break
            elif "he" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("he", "")
                break
            elif "she" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("she", "")
                break
            elif "it" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("it", "")
                break
            elif "they" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("they", "")
                break
    i += 1
    topic_sentence.append(topic_chunk)
    focus_sentence.append(focus_chunk)

for utterance, topic, focus in zip(utterances, topic_sentence, focus_sentence):
    print(utterance, ": [", topic, ";", focus, "]")

John went to his favorite music store to buy a piano : [ John ;  went to his favorite music store to buy a piano ]
He was excited that he could finally buy a piano : [ john ;  was excited that  could finally buy a piano ]
He arrived just as the store was closing for the day : [ john ;  arrived just as t store was closing for t day ]
It was closing just as John arrived : [ store ;  was closing just as john arrived ]
