In [169]:
import nltk
import string
from nltk import pos_tag
import spacy
import numpy as np
from nltk import grammar, parse
from collections import Counter 
from cleantext import clean
import re

In [170]:
nlp = spacy.load("en")

In [171]:
# load and clean data
filename = '../data/test7.txt'
f = open(filename, "r")
contents = f.read()

clean("some input",
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # fully remove punctuation
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="",
    replace_with_currency_symbol="",
    lang="en"                       # set to 'de' for German special handling
)

contents = re.sub(r'\d+', '', contents)
contents = re.sub(r"""
               [,;@#?!&$\"\'\[\]]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               contents, flags=re.VERBOSE)
contents = re.sub(' +', ' ', contents)

utterances = []
split_utterances = []
sentences = contents.strip().split('.')
for sentence in sentences:
    if sentence != '':
        utterances.append(sentence.strip())

In [173]:
# POS tagging of each sentence using spacy
pos_tags_utterances = []

# for utterance in utterances:
#     pos_tags_utterances.append(pos_tag(utterance.split(' ')))

for utterance in utterances:
    doc = nlp(utterance)
    temp_list = []
    for token in doc:
        temp_list.append((token.text, token.tag_))
    pos_tags_utterances.append(temp_list)
print(pos_tags_utterances)

[[('Devendra', 'NNP'), ('Fadnavis', 'NNP'), ('whose', 'WP$'), ('government', 'NN'), ('is', 'VBZ'), ('fighting', 'VBG'), ('a', 'DT'), ('challenge', 'NN'), ('over', 'IN'), ('his', 'PRP$'), ('swearing', 'NN'), ('in', 'RP'), ('from', 'IN'), ('Sena', 'NNP'), ('-', 'HYPH'), ('NCP', 'NNP'), ('-', 'HYPH'), ('Cong', 'NNP'), ('-', 'HYPH'), ('combine', 'NNP'), ('got', 'VBD'), ('busy', 'JJ'), ('with', 'IN'), ('work', 'NN'), ('in', 'IN'), ('his', 'PRP$'), ('second', 'JJ'), ('stint', 'NN'), ('as', 'IN'), ('chief', 'JJ'), ('minister', 'NN'), ('of', 'IN'), ('Maharashtra', 'NNP'), ('by', 'IN'), ('signing', 'VBG'), ('his', 'PRP$'), ('first', 'JJ'), ('cheque', 'NN'), ('for', 'IN'), ('a', 'DT'), ('relief', 'NN'), ('fund', 'NN')], [('The', 'DT'), ('CMO', 'NNP'), ('Maharastra', 'NNP'), ('tweeted', 'VBD'), ('pictures', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('CM', 'NNP'), ('signing', 'VBG'), ('his', 'PRP$'), ('first', 'JJ'), ('cheque', 'NN'), ('for', 'IN'), ('the', 'DT'), ('Chief', 'NNP'), ('Minister', 'NNP'),

In [174]:
# topic of first sentence
processed_utterance = nlp(utterances[0])
init_topic = ''
for token in processed_utterance:
    if "NN" in token.tag_:
        init_topic = token.text
        break

In [175]:
# Centers of all utterances
total_utterances = len(utterances)
tagged_utterances = utterances

Cb = [None] * total_utterances

Cf = []
for i in range(total_utterances):
    Cf.append([])

Cb[0] = "undefined"
for i in range(total_utterances):
    for token in pos_tags_utterances[i]:
        if "NN" in token[1]:
            Cf[i].append((token[0].lower(), token[1]))
        if i!=0:
            if "PRP" == token[1]:
                if token[0].lower() == "he" or token[0].lower() == "she":
                    for pos in Cf[i-1]:
                        if pos[1] == "NNP":
                            if (pos[0].lower(), "NNP") not in Cf[i]:
                                Cf[i].append((pos[0].lower(), "NNP"))
                                break
                elif token[0].lower() == "it":
                    for pos in Cf[i-1]:
                        if pos[1] == "NN":
                            if (pos[0].lower(), "NN") not in Cf[i]:
                                Cf[i].append((pos[0].lower(), "NN"))
                                break
                elif token[0].lower() == 'they':
                    for pos in Cf[i-1]:
                        if pos[1] == "NNS":
                            if (pos[0].lower(), "NN") not in Cf[i]:
                                Cf[i].append((pos[0].lower(), "NNS"))
                                break

    if i != 0:
        try:
            Cb[i] = Cf[i][0]
        except:
            Cb[i] = Cb[i-1]
print(Cb)

['undefined', ('cmo', 'NNP'), ('minister', 'NN'), ('cm', 'NNP'), ('bjp', 'NNP'), ('tuesday', 'NNP'), ('centre', 'NNP'), ('bench', 'NN'), ('bjp', 'NNP'), ('sena', 'NNP'), ('advocate', 'NN'), ('mumbai', 'NNP'), ('drama', 'NN'), ('congress', 'NNP'), ('congress', 'NNP'), ('week', 'NN'), ('india', 'NNP'), ('india', 'NNP'), ('match', 'NN'), ('match', 'NN'), ('match', 'NN'), ('eyebrows', 'NNS'), ('anomaly', 'NNP'), ('format', 'NN'), ('series', 'NN'), ('world', 'NNP'), ('captain', 'NN'), ('format', 'NN'), ('cricket', 'NN')]


In [176]:
topics_utterances = []
focus_utterances = []
topics_utterances.append(init_topic)
for tuple_val in Cb:
    if tuple_val != "undefined":
        if tuple_val[1] == "NNP":
            topics_utterances.append(tuple_val[0].capitalize())
        else:
            topics_utterances.append(tuple_val[0])
print(topics_utterances)

['Devendra', 'Cmo', 'minister', 'Cm', 'Bjp', 'Tuesday', 'Centre', 'bench', 'Bjp', 'Sena', 'advocate', 'Mumbai', 'drama', 'Congress', 'Congress', 'week', 'India', 'India', 'match', 'match', 'match', 'eyebrows', 'Anomaly', 'format', 'series', 'World', 'captain', 'format', 'cricket']


In [177]:
topic_dict = {}
for topic in topics_utterances:
    if len(topic) != 1:
        if topic.lower() not in topic_dict.keys():
            topic_dict[topic.lower()] = 1
        else:
            topic_dict[topic.lower()] += 1
        
k  = Counter(topic_dict)
if len(topic_dict)//2 < 3:
    top_topics_num = len(topic_dict)//2 + 1
else:
    top_topics_num = 3
top_topics_list = k.most_common(top_topics_num)


print("Topics present in the discourse are:")
for topic in top_topics_list:
    print(topic[0])

Topics present in the discourse are:
match
bjp
congress


In [178]:
print("Topic of every sentence:")
for utterance, utterance_topic in zip(utterances, topics_utterances):
    print(utterance, ":", utterance_topic)

Topic of every sentence:
Devendra Fadnavis whose government is fighting a challenge over his swearing in from Sena-NCP-Cong-combine got busy with work in his second stint as chief minister of Maharashtra by signing his first cheque for a relief fund : Devendra
The CMO Maharastra tweeted pictures of the CM signing his first cheque for the Chief Minister’s Relief Fund : Cmo
The new chief minister was then seen handing over the cheque to a woman : minister
“CM Devendra Fadnavis’ first signature of this tenure was done on a CMReliefFund cheque on reaching Mantralaya which was handed over to Kusum Vengurlekar by CM ” the Chief Minister’s Office tweeted : Cm
The BJP chief minister got a breather as the Supreme Court on Monday said it will pass an order at : Bjp
am on Tuesday on the legality of the chief minister’s swearing in on Saturday which the opposition alliance allege was conducted in a “hurried and makeshift” ceremony : Tuesday
The Centre told a bench comprising Justices N V Ramana As

In [179]:
i = 0
topic_sentence = []
focus_sentence = []
for utterance, utterance_topic in zip(utterances, topics_utterances):
    processed_utterance = nlp(utterance)
    topic_chunk = ''
    focus_chunk = ''
    for chunk in processed_utterance.noun_chunks:
        if chunk.root.dep_ == "nsubj":
            if utterance_topic in chunk.text:
                topic_chunk = chunk.text
                focus_chunk = utterance.lower().replace(topic_chunk.lower(), "")
                break
            elif "he" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("he", "")
                break
            elif "she" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("she", "")
                break
            elif "it" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("it", "")
                break
            elif "they" in chunk.text.lower():
                topic_chunk = Cb[i][0]
                focus_chunk = utterance.lower().replace("they", "")
                break
    i += 1
    topic_sentence.append(topic_chunk)
    focus_sentence.append(focus_chunk)

for utterance, topic, focus in zip(utterances, topic_sentence, focus_sentence):
    print(utterance, ": [", topic, ";", focus, "]")

Devendra Fadnavis whose government is fighting a challenge over his swearing in from Sena-NCP-Cong-combine got busy with work in his second stint as chief minister of Maharashtra by signing his first cheque for a relief fund : [ Devendra Fadnavis ;  whose government is fighting a challenge over his swearing in from sena-ncp-cong-combine got busy with work in his second stint as chief minister of maharashtra by signing his first cheque for a relief fund ]
The CMO Maharastra tweeted pictures of the CM signing his first cheque for the Chief Minister’s Relief Fund : [ cmo ; t cmo maharastra tweeted pictures of t cm signing his first cque for t chief minister’s relief fund ]
The new chief minister was then seen handing over the cheque to a woman : [  ;  ]
“CM Devendra Fadnavis’ first signature of this tenure was done on a CMReliefFund cheque on reaching Mantralaya which was handed over to Kusum Vengurlekar by CM ” the Chief Minister’s Office tweeted : [ cm ; “cm devendra fadnavis’ first sig