#### Python function, that takes in a json request and output the label names along with uniqueid, text, sentiment score in a json

In [21]:
import json
import pandas as pd
import numpy as np
import re
import sys
import nltk
from collections import defaultdict
from nltk.corpus import stopwords, sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from stopwords_list import stop_words_list
from sklearn.decomposition import LatentDirichletAllocation
import collections
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
from textblob import TextBlob, Word


Data preprocessing


In [22]:
f = open('Response.json') 
df1=json.load(f)
keys_to_extract = ["uniqueId", "text"]
new=[]

for i in range (0,len(df1['responseList'])):
    d=[{key: df1['responseList'][i][key] for key in keys_to_extract}]
    a=pd.DataFrame.from_dict(d) 
    new.append(a)
df=pd.concat(new, ignore_index=True)    

Call function 'topic_modelling with **dataframe** and **no.of topic** to be generated'

In [23]:
def topic_modelling(df,n):
    

    # case text as lowercase, remove punctuation, remove extra whitespace in string and on both sides of string
    df['remove_lower_punct'] = df['text'].str.lower().str.replace("'", '').str.replace('[^\w\s]', ' ').str.replace(" \d+", " ").str.replace(' +', ' ').str.strip()
    
    # apply sentiment analysis
    analyser = SentimentIntensityAnalyzer()
    
    sentiment_score_list = []
    sentiment_label_list = []

    for i in df['remove_lower_punct'].values.tolist():
        sentiment_score = analyser.polarity_scores(i)

        if sentiment_score['compound'] >= 0.05:
            sentiment_score_list.append(sentiment_score['compound'])
            sentiment_label_list.append('Positive')
        elif sentiment_score['compound'] > -0.05 and sentiment_score['compound'] < 0.05:
            sentiment_score_list.append(sentiment_score['compound'])
            sentiment_label_list.append('Neutral')
        elif sentiment_score['compound'] <= -0.05:
            sentiment_score_list.append(sentiment_score['compound'])
            sentiment_label_list.append('Negative')

    df['sentiment'] = sentiment_label_list
    df['sentiment score'] = sentiment_score_list
    
    # tokenise string
    df['tokenise'] = df.apply(lambda row: nltk.word_tokenize(row[1]), axis=1)
    # initiate stopwords from nltk
    stop_words = stopwords.words('english')
    # add additional missing terms
    stop_words.extend(stop_words_list) 
    # remove stopwords
    df['remove_stopwords'] = df['tokenise'].apply(lambda x: [item for item in x if item not in stop_words])
    
    # initiate nltk lemmatiser
    wordnet_lemmatizer = WordNetLemmatizer()
    # lemmatise words
    df['lemmatise'] = df['remove_stopwords'].apply(lambda x: [wordnet_lemmatizer.lemmatize(y) for y in x]) 
    
    # initialise the count vectorizer
    vectorizer = TfidfVectorizer(analyzer = 'word', ngram_range = (3, 4))
    # join the processed data to be vectorised
    vectors = []
    for index, row in df.iterrows():
        vectors.append(", ".join(row[6]))
    vectorised = vectorizer.fit_transform(vectors)
    
    # initisalise LDA Model
    lda_model = LatentDirichletAllocation(n_components = n, # number of topics
                                      random_state = 10,          # random state
                                      evaluate_every = -1,      # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,              # Use all available CPUs
                                     )

    lda_output = lda_model.fit_transform(vectorised)
    # column names
    topic_names = [ str(i) for i in range(1, lda_model.n_components + 1)]
    # make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topic_names)
    # get dominant topic for each document
    dominant_topic = (np.argmax(df_document_topic.values, axis=1)+1)
    df_document_topic['Dominant_topic'] = dominant_topic
    # join to original dataframes
    df = pd.merge(df, df_document_topic, left_index = True, right_index = True, how = 'outer')
    
    # index names
    docnames = ['Doc' + str(i) for i in range(len(df))]
    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 5), columns=topic_names, index=docnames)
    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    # Topic-Keyword Matrix
    df_topic_keywords = pd.DataFrame(lda_model.components_)
    # Assign Column and Index
    df_topic_keywords.columns = vectorizer.get_feature_names()
    df_topic_keywords.index = topic_names
    df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())
    df_scores = pd.DataFrame(df_topic_keywords.max())
    tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)
    tmp.columns = ['topic', 'relevance_score']
    
    all_topics = []

    for i in tmp['topic'].unique():    
        tmp_1 = tmp.loc[tmp['topic'] == i].reset_index()
        tmp_1 = tmp_1.sort_values('relevance_score', ascending=False).head(1)

        tmp_1['topic'] = int(tmp_1['topic']) 

        tmp_2 = []
        tmp_2.append(tmp_1['topic'].unique()[0])
        tmp_2.append((tmp_1['index'].unique()))
        all_topics.append(tmp_2)

    all_topics = pd.DataFrame(all_topics, columns=['Dominant_topic', 'topic_name'])
    
    df2=df[['uniqueId','text','sentiment','sentiment score','Dominant_topic']]
    frequent_features=list(''.join(i) for i in all_topics.topic_name)
    
    #absa_list is the dictionary containing all the reviews for each topic
    absa_list = dict()
    for f in frequent_features:
        # For each comment
        absa_list[f] = list()
        for comment in df.text:
            blob = TextBlob(comment)
            # For each sentence of the comment
            for sentence in blob.sentences:
                # Search for frequent feature 'f'
                q = '|'.join(f.split())
                if re.search(r'\w*(' + str(q) + ')\w*', str(sentence)):
                    absa_list[f].append(sentence)
                    
    #final dataframe with all required columns                
    final=pd.merge(df2,all_topics,on=['Dominant_topic'])
    li=[]
    res2={}
    for i,j in zip(final.Dominant_topic, final.topic_name):
    
        d={}
        a=''.join(j)
        d[a]=final[['text','uniqueId','sentiment','sentiment score']][final.Dominant_topic==i].to_dict('records')
        res2.update(d)   
        
    return res2    

In [24]:
final_dict=topic_modelling(df,20)

In [25]:
final_dict

{'location lot amenities spacious': [{'text': 'Wonderful experience',
   'uniqueId': '1816GOIBO',
   'sentiment': 'Positive',
   'sentiment score': 0.5719},
  {'text': 'Nice location,\nlot of amenities,spacious rooms,\nparking problem is thr..',
   'uniqueId': '1804GOIBO',
   'sentiment': 'Neutral',
   'sentiment score': 0.0258},
  {'text': 'very good property..excellent staff n hospitality.enjoyed our stay a lot.kids park n toy room added an extra perk to our little one.inside n outside decor is mesmerising.\nawsm maintainence of garden n lawn area.lots of time u can spend in the property itself..baby friendly cooking n custom made baby food is a great advantage if u r travelling with kids.',
   'uniqueId': '1828GOIBO',
   'sentiment': 'Positive',
   'sentiment score': 0.9626},
  {'text': 'great place to stay',
   'uniqueId': '1835GOIBO',
   'sentiment': 'Positive',
   'sentiment score': 0.6249},
  {'text': 'not so good',
   'uniqueId': '1841GOIBO',
   'sentiment': 'Negative',
   'sen