In [1]:
from tkinter import * 
from tkinter.ttk import *
import pandas as pd
import numpy as np
from tkinter import scrolledtext
import nltk
import string
import math
import json
import os
import sys
import datetime
import random
from collections import Counter
from numpy import dot
from nltk.corpus import sentiwordnet as swn
from nltk import word_tokenize, pos_tag

In [2]:
# Fetch the NLTK resources this notebook relies on: the perceptron POS tagger
# (nltk.pos_tag is called when sentences are cleaned) and the SentiWordNet
# corpus (queried through swn.senti_synsets).
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP.YO\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\HP.YO\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [3]:
# Name of the folder (under ROOT_PATH) that holds the per-hotel review JSON files.
dataset = 'small_dataset'
# sys.path[0] resolved to an absolute path; presumably this is the notebook's
# working directory when run under Jupyter -- TODO confirm for other launchers.
ROOT_PATH = os.path.abspath(sys.path[0])
# Directory scanned by create_dataframe().
DATA_PATH = os.path.join(ROOT_PATH,dataset)

In [4]:
def read_words(filename):
    """Return every whitespace-separated token of a text file as a flat list.

    The file is opened with the platform default encoding and tokens are
    returned in file order.
    """
    words = []
    with open(filename) as handle:
        for line in handle:
            words.extend(line.split())
    return words

In [5]:
# Word lists used throughout the notebook. All three files are resolved against
# ROOT_PATH so that loading does not depend on the current working directory
# (adjectives.txt used to be opened with a bare relative path).
stop_words = read_words(os.path.join(ROOT_PATH,'stopwords.txt'))
english_words = read_words(os.path.join(ROOT_PATH,'english.txt'))
adj = read_words(os.path.join(ROOT_PATH,'adjectives.txt'))
# Seed words with known positive / negative orientation, used by the SO-PMI
# sentiment scoring further down.
pwords = ["good","nice","excellent","positive","fortunate","correct","superior"]
nwords = ["bad","nasty","poor","negative","unfortunate","wrong","inferior"]

In [6]:
def string_to_time(dataframe):
    """Parse the 'Date' column in place from text such as 'January 5, 2010'.

    Rows are addressed by the labels 0..n-1, so the frame is expected to carry
    a default integer index. The same (mutated) frame is returned.
    """
    date_format = '%B %d, %Y'
    for row_label in range(len(dataframe)):
        raw_date = dataframe['Date'][row_label]
        dataframe.at[row_label,'Date'] = datetime.datetime.strptime(raw_date, date_format)
    return dataframe

In [10]:
def json_to_dataframe(filename):
    """Load one hotel JSON file into a review DataFrame.

    The file must contain a 'Reviews' list and a 'HotelInfo' object with a
    'HotelID'. If the review table does not have exactly seven columns, an
    empty frame with the eight expected columns is returned instead.

    NOTE(review): the seven columns are renamed purely by position, so the
    result depends on the column order pandas derives from the JSON keys --
    confirm that order against the dataset and the pandas version in use.
    """
    review_columns = ['Username','UserLocation','ReviewText','Date','ReviewRating','ReviewID','ReviewTitle']
    with open(filename) as handle:
        payload = json.load(handle)
    reviews = pd.DataFrame(payload['Reviews'])
    if reviews.shape[1] != 7:
        return pd.DataFrame(columns = ['Key'] + review_columns)
    reviews.columns = review_columns
    reviews['Key'] = payload['HotelInfo']['HotelID']
    # 'Key' was appended last; move it to the front.
    ordered = ['Key'] + [name for name in reviews.columns.tolist() if name != 'Key']
    return reviews[ordered]
#json_to_dataframe('./small_dataset/72572.json')

In [11]:
def create_dataframe():
    """Build the master review table from every file in DATA_PATH.

    Each file is parsed with json_to_dataframe; the per-file frames are
    concatenated once (instead of growing a frame inside the loop, which is
    quadratic), re-indexed 0..n-1, given a 'Sentences' column holding the
    review text split on '.', and finally passed through string_to_time.

    Files are visited in sorted name order because os.listdir makes no
    ordering guarantee; this keeps row indices reproducible between runs.
    """
    columns = ['Key','Username','UserLocation','ReviewText','Date','ReviewRating','ReviewID','ReviewTitle']
    # The empty seed frame keeps the result well-formed when the folder is empty.
    frames = [pd.DataFrame(columns = columns)]
    for filename in sorted(os.listdir(DATA_PATH)):
        frames.append(json_to_dataframe(os.path.join(DATA_PATH,filename)))
    main_df = pd.concat(frames).reset_index()
    main_df.columns = ['Index'] + columns
    # Drop the empty / single-space fragments produced by consecutive or trailing dots.
    main_df['Sentences'] = [
        [part for part in text.split('.') if part != '' and part != ' ']
        for text in main_df['ReviewText']
    ]
    return string_to_time(main_df)

In [12]:
def av_rating(dataframe):
    """Compute each hotel's mean overall rating and list the distinct authors.

    Returns a pair (av_rate, author_list):
      * av_rate maps hotel id ('Key') to a tuple (mean rating, rating count);
      * author_list holds every 'Username' once, in order of first appearance.

    Authors are de-duplicated through a set, so the pass over the reviews stays
    linear instead of scanning the author list for every row.
    """
    av_rate = dict()
    author_list = []
    seen_authors = set()
    for _, row in dataframe.iterrows():
        author = row['Username']
        if author not in seen_authors:
            seen_authors.add(author)
            author_list.append(author)
        hotel = row['Key']#row key is hotel id
        rating = float(row['ReviewRating']['Overall'])
        if hotel in av_rate:
            mean, count = av_rate[hotel]
            # Running mean: rebuild the previous sum (mean*count), add the new
            # rating, and divide by the new count.
            av_rate[hotel] = ((mean*count + rating)/(count + 1), count + 1)
        else:
            av_rate[hotel] = (rating, 1)
    return (av_rate,author_list)

def author_credibility(dataframe):#Definition 1
    """Score every author by how closely their ratings track the hotel means.

    For each review the absolute gap between the author's overall rating and
    the hotel's mean rating is divided by 5; an author's credibility is
    1 minus the average of those scaled gaps over all of their reviews.
    Returns a dict mapping username to credibility.
    """
    (hotel_stats, authors) = av_rating(dataframe)
    # author -> (sum of scaled deviations, number of reviews written)
    totals = {author: (0, 0) for author in authors}
    for _, review in dataframe.iterrows():
        user = review['Username']
        hotel_mean = hotel_stats[review['Key']][0]
        deviation = abs((float(review['ReviewRating']['Overall']) - hotel_mean))/5
        running, written = totals[user]
        totals[user] = (running + deviation, written + 1)
    return {user: 1-(running/written) for user, (running, written) in totals.items()}

def review_recency(dataframe):#Definition 5
    """Return one recency score per review, aligned with the frame's row order.

    For a review written at time t of a hotel whose reviews span the interval
    dm (newest date minus oldest date), the score is exp(-(now - t)/dm).

    Compared with the earlier implementation this version
      * derives the newest/oldest date per hotel with min/max rather than
        assuming the rows are sorted newest-first,
      * does not require a hotel's rows to be contiguous, and
      * returns 0.0 (the limit of the formula as dm -> 0 for a past review)
        when all of a hotel's reviews share one date, instead of dividing by
        a zero interval.

    Returns a 1-D float numpy array with dataframe.shape[0] entries.
    """
    query_time = datetime.datetime.now()
    keys = list(dataframe['Key'])
    dates = list(dataframe['Date'])

    # hotel id -> (oldest review date, newest review date)
    span = {}
    for hotel_id, review_date in zip(keys, dates):
        if hotel_id in span:
            oldest, newest = span[hotel_id]
            span[hotel_id] = (min(oldest, review_date), max(newest, review_date))
        else:
            span[hotel_id] = (review_date, review_date)

    no_spread = datetime.timedelta(0)
    scores = []
    for hotel_id, review_date in zip(keys, dates):
        oldest, newest = span[hotel_id]
        dm = newest - oldest #Time interval between first and final review for a hotel
        if dm == no_spread:
            scores.append(0.0)
        else:
            scores.append(np.exp(-(query_time - review_date)/dm))
    return np.array(scores, dtype=float)

In [13]:
def review_sentence_score(dataframe,weights):#Definition 6
    """Score every sentence of every review as a weighted sum of three cues.

    Cues, in the order the weights are applied:
      0. location  -- 1 for the review's first sentence or a sentence equal to
                      the review title, else 0;
      1. indicator -- 1 if any token read from indicator_phrases.txt occurs as
                      a substring of the sentence, else 0;
      2. length    -- words in the sentence / words in the review's longest
                      sentence.

    Returns a DataFrame with one row per review and a single 'CSS' column
    holding a dict {sentence: score}. A review without sentences gets an empty
    dict, and a review whose sentences contain no words gets length ratio 0;
    both cases previously raised.
    """
    indicators = read_words(os.path.join(ROOT_PATH,'indicator_phrases.txt'))#list of indicator words

    def weighted_sum(features):
        # multiply corresponding elements and then compute sum
        return sum([a*b for a,b in zip(features,weights)])

    per_review = []#finally a list of one-element lists
    for _, row in dataframe.iterrows():
        sentences = row['Sentences']
        if not sentences:
            per_review.append([{}])
            continue
        max_words = max(len(sent.split()) for sent in sentences)
        features = dict()
        for sentence in sentences:
            location = 1 if sentence == row['ReviewTitle'] else 0
            has_indicator = 1 if any(ind in sentence for ind in indicators) else 0
            length_ratio = len(sentence.split())/max_words if max_words else 0
            features[sentence] = [location, has_indicator, length_ratio]
        features[sentences[0]][0] = 1 #LOC=1 for the first sentence of the review
        per_review.append([{sent: weighted_sum(vals) for sent, vals in features.items()}])
    return pd.DataFrame(per_review, columns = ['CSS'])

In [14]:
def cosine_similarity(sent1,sent2):
    """Cosine similarity of two POS-tagged sentences, based on word counts.

    Each sentence is a sequence of (word, tag) pairs; tags are ignored.
    Returns 0 when either sentence has no words.
    """
    freq_a = Counter(token for token, _ in sent1)
    freq_b = Counter(token for token, _ in sent2)
    shared = freq_a.keys() & freq_b.keys()
    numerator = sum(freq_a[token]*freq_b[token] for token in shared)
    norm_sq = sum(n**2 for n in freq_a.values()) * sum(n**2 for n in freq_b.values())
    if norm_sq == 0:
        return 0
    return math.sqrt((numerator**2)/norm_sq)

In [15]:
def get_sdf(dataframe):
    """Explode the review table into one row per sentence.

    Returns a frame with columns 'SentenceIndex' (position inside its review),
    'ReviewIndex' (row label of the review) and 'Sentence'.
    """
    rows = []
    for review_idx in range(dataframe.shape[0]):
        for sent_idx, sentence in enumerate(dataframe['Sentences'][review_idx]):
            rows.append([sent_idx, review_idx, sentence])
    return pd.DataFrame(rows, columns = ['SentenceIndex','ReviewIndex','Sentence'])

In [16]:
def remove_stopwords(text):
    """Return the tokens of `text` (a list of words) that are not stop words.

    The module-level `stop_words` list is turned into a set once per call, so
    the token list is filtered in a single pass instead of once per stop word.
    Token order is preserved and the input list is not modified.
    """
    blocked = set(stop_words)
    return [token for token in text if token not in blocked]

def remove_space_and_punctuation(text):
    """Strip ASCII punctuation from `text`, split it on whitespace and drop stop words.

    Returns the remaining tokens as a list.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return remove_stopwords(without_punctuation.split())
#print(remove_space_and_punctuation('This, is a bad: thing.'))

# POS-tag predicates. Each one looks only at the first two characters of the
# tag, so the comparative/superlative variants (JJR, JJS, RBR, RBS) and the
# noun/verb sub-tags (NNS, VBD, ...) are covered by the two-letter prefix.
# The former extra comparisons of a two-character slice against three-letter
# tags could never be true and have been dropped.
def is_noun(pos):
    """True when the tag starts with 'NN'."""
    return pos[:2] == 'NN'


def is_adjective(pos):
    """True when the tag starts with 'JJ' (JJ, JJR, JJS)."""
    return pos[:2] == 'JJ'


def is_adverb(pos):
    """True when the tag starts with 'RB' (RB, RBR, RBS)."""
    return pos[:2] == 'RB'


def is_verb(pos):
    """True when the tag starts with 'VB'."""
    return pos[:2] == 'VB'

def pos_tag(stripped):
    """Tag a token list with nltk.pos_tag and keep only content words.

    Only (word, tag) pairs whose tag is a noun, adjective, adverb or verb are
    returned. Note that this definition replaces the `pos_tag` name imported
    from nltk at the top of the file; the tagger itself is reached through
    `nltk.pos_tag`.
    """
    kept = []
    for word, tag in nltk.pos_tag(stripped):
        if is_noun(tag) or is_adjective(tag) or is_adverb(tag) or is_verb(tag):
            kept.append((word, tag))
    return kept

def clean_sentence(text):
    """Tokenise one raw sentence and return its content words as (word, tag) pairs."""
    return pos_tag(remove_space_and_punctuation(text))

def clean_sentences_list(sdataframe):
    """Return the untagged, stop-word-free token list of every row's 'Sentence'."""
    cleaned = []
    for sentence in sdataframe['Sentence']:
        cleaned.append(remove_space_and_punctuation(sentence))
    return cleaned

def clean_sentences_dataframe(sdataframe):
    """Add POS-tagged tokens to the sentence table and renumber the sentences.

    A 'Cleaned' column is written onto the frame that was passed in. The
    returned frame is a re-indexed copy in which 'SentenceIndex' no longer
    holds the position inside the review but the row number across all
    sentences.
    """
    tagged = []
    for sentence in sdataframe['Sentence']:
        tagged.append(clean_sentence(sentence))
    sdataframe['Cleaned'] = tagged
    # The old index becomes the first column and takes over the 'SentenceIndex' name.
    renumbered = sdataframe.reset_index().drop('SentenceIndex', axis=1)
    renumbered.columns = ['SentenceIndex','ReviewIndex','Sentence','Cleaned']
    return renumbered

def add_css(dataframe, sdataframe, cssdataframe):#Definition 7
    """Weight each sentence score by review recency and author credibility.

    For every row of `sdataframe` the sentence's base score (looked up in
    cssdataframe['CSS'] by review index and sentence text) is multiplied by the
    mean of the review's recency score and its author's credibility. The
    result is stored in a new 'CSS' column on `sdataframe`, which is returned.
    Review upvotes are not part of the score because the data has none.
    """
    recency = review_recency(dataframe)#one recency score per review
    credibility = author_credibility(dataframe)
    review_scores = cssdataframe['CSS']
    adjusted = []
    for _, sentence_row in sdataframe.iterrows():
        review = sentence_row['ReviewIndex']
        base = review_scores[review][sentence_row['Sentence']]
        author = dataframe['Username'][review]
        adjusted.append(base*(recency[review] + credibility[author])/2)
    sdataframe['CSS'] = adjusted
    return sdataframe

In [17]:
def get_pos_tag(tag):
    """Map a POS tag to the one-letter code by its first letter.

    N -> 'n', V -> 'v', J -> 'a', R -> 'r'; any other tag yields None.
    """
    for prefix, code in (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r')):
        if tag.startswith(prefix):
            return code
    return None

In [18]:
# This is using sentiwordnet
# http://www.nltk.org/howto/sentiwordnet.html
def get_senti_synsets(w,tag):
    """Return the first SentiWordNet synset for word `w` with the given POS tag.

    The tag is first converted with get_pos_tag. None is returned when the tag
    has no mapping or when SentiWordNet lists no synset for the word.
    """
    wordnet_tag = get_pos_tag(tag)
    if wordnet_tag is None:
        return None
    candidates = list(swn.senti_synsets(w,wordnet_tag))#synonym set
    return candidates[0] if candidates else None
    
def sentence_sentiment(sent1):
    """Average SentiWordNet polarity of an already POS-tagged sentence.

    `sent1` is a list of (word, tag) pairs. Each word with a synset contributes
    its positive score, or minus its negative score when the negative score is
    larger. The total is divided by the number of tagged words in the sentence
    (including words without a synset). An empty sentence scores 0.

    The per-sentence debug prints were removed: this function runs once per
    sentence on every clustering request and flooded the notebook output.
    """
    synsets = [get_senti_synsets(w,p) for (w,p) in sent1]
    synsets = [s for s in synsets if s]
    count = len(sent1)

    if count == 0:
        return 0

    score = 0.0
    for s in synsets:
        if s.neg_score() > s.pos_score():
            score = score - s.neg_score()
        else:
            score = score + s.pos_score()
    return score/count

def senti_sim(sdf,ls1):
    """Rescale a similarity matrix by the combined sentiment of each sentence pair.

    `ls1` is a square matrix (list of lists) of base similarities between the
    rows of `sdf`. Entry (i, j) of the result is ls1[i][j] multiplied by the
    absolute value of the sum of the two sentences' sentiment scores.
    """
    size = len(sdf['Cleaned'].tolist())
    polarity = [sentence_sentiment(sdf.iloc[pos]['Cleaned']) for pos in range(size)]
    return [
        [ls1[i][j]*abs(polarity[i]+polarity[j]) for j in range(size)]
        for i in range(size)
    ]

In [19]:
def randomise_medoids(k,n):
    """Pick k distinct integers from 0..n-1 with random.sample to serve as initial medoids."""
    candidates = range(n)
    return random.sample(candidates,k)

def assign_point_to_cluster(sdataframe,sentence_index,cluster,ls):
    """Append a sentence to the cluster whose medoid it is most similar to.

    `cluster` is a list of [medoid_index, member_list] entries and `ls` the
    similarity matrix. On ties the later entry wins. The sentence is appended
    to the first entry carrying the winning medoid index, and the (mutated)
    cluster list is returned.
    """
    best_similarity = 0
    winner = -1
    for candidate in cluster:
        similarity = ls[sentence_index][candidate[0]]#similarity to this cluster head
        if similarity >= best_similarity:
            best_similarity = similarity
            winner = candidate
    target = next((entry for entry in cluster if entry[0] == winner[0]), None)
    if target is not None:
        target[1].append(sentence_index)
    return cluster

def make_clusters(sdataframe,medoids,ls):
    """Create one empty cluster per medoid and assign every sentence to one of them.

    Returns a list of [medoid_index, member_indices] entries.
    """
    cluster = []
    for medoid in medoids:
        cluster.append([medoid, []])
    for sentence_index in sdataframe.index:
        cluster = assign_point_to_cluster(sdataframe,sentence_index,cluster,ls)
    return cluster

def find_similarity(sdf,similarity):
    """Build the full pairwise similarity matrix of the 'Cleaned' sentences.

    `similarity` is called as similarity(sentence_i, sentence_j) for every
    ordered pair; the result is a list of rows, one per sentence.
    """
    cleaned = sdf['Cleaned'].tolist()
    return [
        [similarity(row_sentence, col_sentence) for col_sentence in cleaned]
        for row_sentence in cleaned
    ]

    
def find_new_medoid_for_single_cluster(single_cluster,sdataframe,ls):
    """Return the member with the highest total similarity to the cluster's members.

    `single_cluster` is [medoid_index, member_indices]. The current medoid is
    kept unless some member's similarity sum is strictly larger than the
    best sum seen so far.
    """
    current_medoid, members = single_cluster[0], single_cluster[1]
    best_total = sum(ls[current_medoid][member] for member in members)
    best_medoid = current_medoid
    for candidate in members:
        candidate_total = sum(ls[candidate][member] for member in members)
        if candidate_total > best_total:
            best_total = candidate_total
            best_medoid = candidate
    return best_medoid

def find_new_medoids(cluster,sdataframe,ls):
    """Recompute the medoid of every cluster and return them in cluster order."""
    updated = []
    for single_cluster in cluster:
        updated.append(find_new_medoid_for_single_cluster(single_cluster,sdataframe,ls))
    return updated

def kmedoids(k,sdataframe,similarity,max_iter=10):
    """Cluster the sentences of `sdataframe` around k medoids.

    Parameters
    ----------
    k : number of clusters; the initial medoids are k random sentence indices.
    sdataframe : sentence table with a 'Cleaned' column.
    similarity : selects the similarity matrix. `cosine_similarity` uses plain
        cosine similarity, `sopmi_similarity` rescales it with SO-PMI
        polarity, and any other value falls through to `senti_sim`.
    max_iter : upper bound on assignment/update rounds (default 10, the value
        that used to be hard-coded). Must be at least 1.

    Returns the cluster list [[medoid, members], ...] of the last assignment
    round; the loop stops early once the medoids no longer change.
    """
    if max_iter < 1:
        # Without at least one round there is no cluster to return.
        raise ValueError("max_iter must be at least 1")
    medoids = randomise_medoids(k,sdataframe.shape[0])#returns a list of k random integers
    print(medoids)
    # Cosine similarity is always computed first; the other measures rescale it.
    ls = find_similarity(sdataframe,cosine_similarity)
    if similarity != cosine_similarity:
        if similarity == sopmi_similarity:
            ls = sopmi_similarity(sdataframe,ls)
        else:
            ls = senti_sim(sdataframe,ls)
    for i in range(max_iter):
        cluster = make_clusters(sdataframe,medoids,ls)
        medoids = find_new_medoids(cluster,sdataframe,ls)
        if medoids == [a for a,b in cluster]:#convergence point
            print(i,medoids)
            break
    return cluster

def attach_importance(cluster,sdataframe):# cluster is of form [[x,[]],[y,[]],....]
    """Attach sentence scores to clusters and rank clusters and members.

    Every member index is replaced by (index, CSS score). Each cluster becomes
    [medoid, total score, members] with members sorted by score, highest
    first, and the returned list is sorted by total score, highest first.
    The input `cluster` list is modified in the process.
    """
    scores = sdataframe['CSS']
    ranked = []
    for entry in cluster:
        members = entry[1]
        for position, sentence_index in enumerate(members):
            members[position] = (sentence_index, scores[sentence_index])
        # Total is taken before the members are reordered.
        summary = [entry[0], sum([score for _, score in members])]
        entry[0] = summary
        summary.append(members)
        members.sort(key=lambda pair: pair[1], reverse=True)
        ranked.append(summary)
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked

In [20]:
# Pipeline step 1: load every review under DATA_PATH, explode the reviews into
# one row per sentence, then POS-tag each sentence.
df = create_dataframe()
sdf1 = get_sdf(df)#get dataframe of sentences of review text for all reviews
# NOTE: clean_sentences_dataframe also writes the 'Cleaned' column onto sdf1 itself.
sdf2 = clean_sentences_dataframe(sdf1)#assigns pos tags in cleaned column

In [21]:
# Pipeline step 2: base sentence scores. The weights are applied in the order
# [location cue, indicator-phrase cue, relative sentence length].
sent_imp = review_sentence_score(df,[0.3, 0.6, 0.1])

In [22]:
# Pipeline step 3: fold review recency and author credibility into the
# sentence scores. add_css stores them in a 'CSS' column on sdf2 and returns
# that same frame, so sdf and sdf2 refer to one object.
sdf = add_css(df, sdf2, sent_imp)

In [23]:
# Positive / negative seed words for the SO-PMI scoring below.
# NOTE(review): these two lists repeat, with identical contents, the pwords /
# nwords assignments made in the earlier word-list cell; keep the two in sync
# or drop one of them.
pwords = ["good","nice","excellent","positive","fortunate","correct","superior"]
nwords = ["bad","nasty","poor","negative","unfortunate","wrong","inferior"]

def create_vocabulary(sdataframe):#Of adjectives
    """Index the known adjectives that occur in the cleaned sentences.

    A word is included when its tag starts with 'J' and it is listed in the
    module-level `adj` word list. Returns a dict mapping each word to its
    position in order of first appearance.

    Membership is checked against a set and the dict being built, which avoids
    rescanning two lists for every token.
    """
    known_adjectives = set(adj)
    vocab = {}
    for tagged_sentence in sdataframe["Cleaned"]:
        for word, tag in tagged_sentence:
            if tag.startswith("J") and word in known_adjectives and word not in vocab:
                vocab[word] = len(vocab)
    return vocab

def create_bow_of_adj(vocab,sdataframe):
    """Build a boolean word-by-sentence occurrence matrix for the vocabulary.

    Returns a list with one row per vocabulary word (row position = the word's
    index in `vocab`) and one column per sentence; an entry is 1 when the word
    occurs in that sentence with an adjective tag.

    Adjectives are recognised by any tag starting with 'J', matching
    create_vocabulary. The previous exact comparison with "JJ" left the rows
    of words that only occur as JJR/JJS entirely zero.
    """
    #boolean vector, built sentence-major and transposed at the end
    bag = [[0 for _ in range(len(vocab))] for _ in range(sdataframe.shape[0])]
    for position, tagged_sentence in enumerate(sdataframe["Cleaned"]):
        for word, tag in tagged_sentence:
            if tag.startswith("J") and word in vocab:
                bag[position][vocab[word]] = 1
    bag = np.array(bag).T.tolist()
    return bag

In [24]:
# Adjective vocabulary of the corpus and its occurrence matrix. Both are
# module-level names that the SO-PMI functions below read directly.
vocab = create_vocabulary(sdf)
bow = create_bow_of_adj(vocab,sdf)
#each row corresponds to a word of vocab and column corresponds to sentence

In [25]:
# hits_pwords / hits_nwords: product, over the seed words that are present in
# the vocabulary, of the number of sentences containing that seed word (the
# sum of its row in `bow`). Seed words missing from the vocabulary are skipped.
# NOTE(review): a seed word whose row sums to 0 makes the whole product 0.
hits_pwords=1 #product of all p(t_l) where t_l is positive words
for w in pwords:
    if w in vocab.keys():
        vec = bow[vocab[w]]
        hits_pwords = hits_pwords*sum(vec)
        # ultimately it will be product of all the observations in O(a_x)

hits_nwords=1
for w in nwords:#product of all p(t_l) where t_l is negative words
    if w in vocab.keys():
        vec = bow[vocab[w]]
        hits_nwords = hits_nwords*sum(vec)

def sopmi(word,hitsp,hitsn,bow,poswords,negwords):#returns O(a_x) sentiment strength score of an adjective
    """SO-PMI orientation of `word` relative to positive and negative seed words.

    For every seed word found in the module-level `vocab`, the number of
    sentences in which it co-occurs with `word` (plus one) is multiplied into a
    positive or a negative product. The score is
    log2((pos_product * hitsn) / (hitsp * neg_product)).

    Returns None when `word` is not in `vocab`, and also when `hitsp` or
    `hitsn` is 0, because the ratio is then undefined (this used to raise
    ZeroDivisionError or a math domain error).
    """
    if word not in vocab:
        return None
    if hitsp == 0 or hitsn == 0:
        return None

    vecw = bow[vocab[word]]
    hitspwords=1
    for w in poswords:
        if w in vocab:
            vecp = bow[vocab[w]]
            hitspwords = hitspwords*(sum(x[0]*x[1] for x in zip(vecw,vecp))+1)

    hitsnwords=1
    for w in negwords:
        if w in vocab:
            vecn = bow[vocab[w]]
            hitsnwords = hitsnwords*(sum(x[0]*x[1] for x in zip(vecw,vecn))+1)

    return math.log((hitspwords*hitsn)/(hitsp*hitsnwords))/math.log(2)

def sent_polarity(sentence,hitsp,hitsn,bag,poswords,negwords):
    """Classify a sentence as positive (1), neutral (0.5) or negative (0.0).

    `sentence` may be a list of plain words or of (word, tag) pairs as stored
    in the 'Cleaned' column. Previously the pairs were handed to sopmi
    unchanged, no tuple ever matched the vocabulary, and every sentence came
    out as 0.

    The mean SO-PMI score of the words known to sopmi is compared against the
    threshold r = 0.2. A sentence without any scored word returns 0.
    NOTE(review): that 0 compares equal to the negative label 0.0 in
    sopmi_similarity -- confirm whether neutral (0.5) was intended.
    """
    #returns SP(S_j) by calculating O(S_j)
    #here r is chosen to be 0.2
    sums = 0.0
    count = 0
    for token in sentence:
        word = token[0] if isinstance(token, tuple) else token
        k = sopmi(word,hitsp,hitsn,bag,poswords,negwords)
        if k is not None:
            count = count + 1
            sums = sums + k
    if count==0:
        return 0
    o_s = sums/count
    if o_s > 0.2:
        return 1
    elif abs(o_s)<=0.2:
        return 0.5
    else:
        return 0.0

def sopmi_similarity(sdf,ls):#Sentence similarity using SOPMI and content similarity using cosine similarity
    """Rescale a similarity matrix by how well the sentence polarities agree.

    Each sentence's polarity is computed once with sent_polarity, using the
    module-level hits_pwords, hits_nwords, bow, pwords and nwords. For a pair
    (i, j): equal polarities keep ls[i][j]; otherwise, if either polarity is
    0.5, the value is halved; any other pair gets 0.
    """
    cleaned = sdf['Cleaned'].tolist()
    #Preprocessing the sentence polarity for each sentence.
    polarity = [
        sent_polarity(sentence,hits_pwords,hits_nwords,bow,pwords,nwords)
        for sentence in cleaned
    ]
    scaled = []
    for i, first in enumerate(polarity):
        row = []
        for j, second in enumerate(polarity):
            if first == second:
                row.append(1*ls[i][j])
            elif first == 0.5 or second == 0.5:
                row.append(0.5*ls[i][j])
            else:
                row.append(0)
        scaled.append(row)
    return scaled

In [26]:
# Set of vocabulary words and the untagged token list of every sentence.
# NOTE(review): neither name is referenced by the cells that follow in this
# notebook -- confirm whether they are still needed.
vocab_split = set(vocab.keys())
cleaned_sentences = clean_sentences_list(sdf)

In [27]:
def cluster_to_dataframe(cluster,sdataframe):
    """Collect the sentences of one ranked cluster into a small DataFrame.

    `cluster` has the form [medoid, overall score, [(sentence position, score), ...]].
    The result has the columns 'SentenceIndex' and 'Cleaned', one row per
    member, in the member order of the cluster.
    """
    rows = []
    for member in cluster[2]:
        record = sdataframe.iloc[member[0]]
        rows.append((record['SentenceIndex'], record['Cleaned']))
    return pd.DataFrame(rows,columns = ['SentenceIndex','Cleaned'])

def postings_for_single_cluster(cdataframe,sdf):
    """Build a noun posting list for the sentences of one cluster.

    Every word whose tag starts with 'N' is mapped to the distinct sentence
    indices it occurs in. The result is a list of [word, (indices, frequency)]
    entries ordered by frequency, highest first; within an entry the indices
    are ordered by their 'CSS' score in `sdf`, highest first.
    """
    occurrences = dict()
    for _, record in cdataframe.iterrows():
        for (token, tag) in record['Cleaned']:
            if tag.startswith('N'):
                occurrences.setdefault(token, []).append(record['SentenceIndex'])

    postings = []
    for token, hits in occurrences.items():
        distinct = set(hits)
        postings.append([token, (distinct, len(distinct))])
    postings.sort(key=lambda entry: entry[1][1], reverse=True)#most frequent word first

    for entry in postings:
        indices, frequency = entry[1]
        #css is the sentence importance score calculated using Definition #7
        by_score = sorted(indices, key=lambda idx: sdf["CSS"][idx], reverse=True)
        entry[1] = (by_score, frequency)

    return postings

In [28]:
def postings_for_all_clusters(clusters,sdf):
    """Return one noun posting list per cluster, in cluster order.

    Each cluster is converted with cluster_to_dataframe and then indexed with
    postings_for_single_cluster.
    """
    return [
        postings_for_single_cluster(cluster_to_dataframe(cluster,sdf),sdf)
        for cluster in clusters
    ]

def top_k_sentences(imp,sdf):
    """Format one representative sentence per cluster as a numbered list.

    `imp` holds one posting list per cluster. For each non-empty posting list
    the first sentence index of its first word is looked up in sdf["Sentence"].
    Clusters with an empty posting list are skipped (and 'empty' is printed).
    Returns the numbered sentences as one newline-terminated string.
    """
    picked = []
    for postings in imp:
        if postings != []:
            #postings[0][1][0] is the sentence index list of the top word
            best_sentence = postings[0][1][0][0]
            picked.append(str(len(picked) + 1)+'. '+sdf["Sentence"][best_sentence]+'\n')
        else:
            print('empty')
    return "".join(picked)

In [31]:
# Steps to run GUI
# Keep files in the small_dataset folder
# Enter a number in the field.
# Click enter button
# Then select by which similarity measure you want to make clusters.
# After a minute it will show the result


# Main window: a prompt label and an entry field for the number of clusters.
window = Tk()
window.title("Reviews Summarizer")

window.geometry('900x600')
lbl = Label(window, text="How many clusters would you like to make?")
lbl.grid(column=0, row=0)

# Entry whose text is read (and converted to int) by clicked().
inp = Entry(window,width=10)
inp.grid(column=1, row=0)
def clicked():
    """Return the number typed into the `inp` entry field as an int.

    Raises ValueError when the field does not contain an integer.
    """
    return int(inp.get())

# "Enter" button: invokes clicked(). Nothing in this notebook uses the value
# returned through the button; the similarity handlers call clicked()
# themselves when a radio button is chosen.
btn = Button(window, text="Enter",command=clicked)
btn.grid(column=2, row=0)
# Scrollable text area that receives the summaries.
txt = scrolledtext.ScrolledText(window,width=100,height=30,wrap='word')
txt.grid(column=0,row=2,columnspan=3)

def _summarise_into_textbox(banner, similarity):
    """Cluster with the given similarity measure and write the summary into `txt`.

    Shared body of the three radio-button handlers: prints the banner, reads
    the requested number of clusters, runs k-medoids, ranks the clusters and
    inserts one representative sentence per cluster into the text area. If
    the entry field does not hold an integer, a hint is written to the text
    area instead of letting the ValueError escape from the Tk callback.
    """
    txt.insert(INSERT, banner)
    try:
        k = clicked()
    except ValueError:
        txt.insert(INSERT, 'Please enter a whole number of clusters.\n')
        return
    clst = kmedoids(k, sdf, similarity)
    clst = attach_importance(clst, sdf)
    important = postings_for_all_clusters(clst, sdf)
    important_sentences = top_k_sentences(important, sdf)
    txt.insert(INSERT, str(important_sentences))

def cos_tkinter():
    """Radio-button handler: summarise using plain cosine similarity."""
    _summarise_into_textbox('Using COS Similarity\n', cosine_similarity)

def sopmi_tkinter():
    """Radio-button handler: summarise using SO-PMI weighted similarity."""
    _summarise_into_textbox('Using SOPMI Similarity\n', sopmi_similarity)

def senti_tkinter():
    """Radio-button handler: summarise using SentiWordNet weighted similarity."""
    _summarise_into_textbox('Using SENTI Similarity\n', senti_sim)

# One radio button per similarity measure; choosing a button runs its handler
# immediately. The buttons are created without a shared `variable` option.
rad1 = Radiobutton(window,text='cos', value=1,command=cos_tkinter)
rad2 = Radiobutton(window,text='sopmi', value=2,command=sopmi_tkinter)
rad3 = Radiobutton(window,text='wordsent', value=3,command=senti_tkinter)

rad1.grid(column=0, row=1)
rad2.grid(column=1, row=1)
rad3.grid(column=2, row=1)

# Hand control to Tk; this call blocks until the window is closed.
window.mainloop()