## **Topic Modeling** 

**@author:** Gonçalo Mateus

## **0. Mount Drive access**

In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

## **1. Importing Data**


In [None]:
#--------------------------------------------#
#                   Imports                  #
#--------------------------------------------#

import pandas as pd
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import re


#--------------------------------------------#
#                 Load Data                  #
#--------------------------------------------#

# Load the preprocessed tweet dataset from Drive into `df`.
# No `parse_dates` option is passed, so no column is parsed as a date here.
# low_memory=False makes pandas process the file in one pass instead of in
# internal chunks (see the pandas read_csv documentation).
df = pd.read_csv(
    r'/content/drive/MyDrive/Scripts/Sentiment Analysis and Topic Modelling Study/Topic Modeling/AllMediaTweets_DF(Filter by keywords) - Sentiment_ALL(with_preprocess).csv',
    low_memory=False,
)



## **2. Initial Process**


In [None]:
from ast import literal_eval

# Each 'Lemmatization' cell is parsed with literal_eval and then iterated
# below, i.e. it is expected to hold the text form of a Python list.
df['Lemmatization'] = df['Lemmatization'].apply(literal_eval)

# Join the elements of every parsed list back into one space-separated string.
df["final_process"] = [
    ' '.join(str(elem) for elem in tokens) for tokens in df['Lemmatization']
]

# Remove both spellings of the bank name, then the standalone word "dos".
# The word-boundary pattern keeps words that merely contain "dos"
# (e.g. "todos", "dados") intact; a plain str.replace('dos', '') mangled them.
df["final_process_novobanco_out"] = [
    re.sub(r'\bdos\b', '', text.replace('novo banco', '').replace('novobanco', ''))
    for text in df['final_process']
]


def remove_rt(x):
    """Replace every 'RT @<user>: ' retweet marker in *x* with a single space."""
    # Raw string: '\w' inside a normal string literal is an invalid escape.
    return re.sub(r'RT @\w+: ', " ", x)


def rt(x):
    """Replace commas, digits, the letter 'A', spaces and tabs in *x* with a space."""
    # NOTE(review): the capital 'A' in this character class looks accidental
    # (possibly a truncated range) - confirm before changing it.
    return re.sub("([,0-9A \t])", " ", x)


df["new_text"] = df.final_process_novobanco_out.map(remove_rt).map(rt)

# Lookup table: each handled accented character -> its plain replacement.
_ACCENT_REPLACEMENTS = {
    accented: plain
    for accented_group, plain in (
        (u"àáâãäå", 'a'),
        (u"èéêë", 'e'),
        (u"ìíîï", 'i'),
        (u"òóôõö", 'o'),
        (u"ùúûü", 'u'),
        (u"ýÿ", 'y'),
        (u"ß", 'ss'),
        (u"ñ", 'n'),
    )
    for accented in accented_group
}

# One character class matching any key of the table above.
_ACCENT_PATTERN = re.compile(u"[" + u"".join(_ACCENT_REPLACEMENTS) + u"]")


def remove_accents(raw_text):
    """Return *raw_text* with common lowercase accented characters replaced.

    Handles the lowercase accented forms of a, e, i, o, u and y, plus
    'ß' (-> 'ss') and 'ñ' (-> 'n'). Uppercase accented letters and any
    other character (for example 'ç') are left unchanged.
    """
    return _ACCENT_PATTERN.sub(
        lambda match: _ACCENT_REPLACEMENTS[match.group(0)], raw_text
    )

# Strip the supported accented characters from every cleaned tweet.
alltext = df["new_text"]
newtext = [remove_accents(text) for text in alltext]
df["new_text"] = newtext

# Collapse every run of two or more spaces into a single space.
# Two chained str.replace('  ', ' ') passes only shrink a run step by step,
# so runs of five or more spaces were left with a double space.
df["new_text"] = [re.sub(r' {2,}', ' ', text) for text in df["new_text"]]

# Count whitespace-separated tokens per tweet and drop tweets left with none.
numberWords = [len(text.split()) for text in df["new_text"]]
df["numberOfWords"] = numberWords
df = df[df["numberOfWords"] > 0]

# Export one cleaned tweet per row, without header or index column.
# NOTE(review): this writes 'Data_to_topic1.txt', while runTopic loads
# 'Data_to_topic.txt' - confirm which file the topic model should read.
df["new_text"].to_csv(r'/content/drive/MyDrive/Scripts/Sentiment Analysis and Topic Modelling Study/Topic Modeling/Data_to_topic1.txt', header=None, index=None,)

## **3. Statistics**


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix over the cleaned tweets.
countVectorizer = CountVectorizer()
countVector = countVectorizer.fit_transform(df["new_text"])
print('{} Number of reviews has {} words'.format(countVector.shape[0], countVector.shape[1]))

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back only on versions that predate it.
if hasattr(countVectorizer, "get_feature_names_out"):
    feature_names = countVectorizer.get_feature_names_out()
else:
    feature_names = countVectorizer.get_feature_names()

# Dense table: one row per tweet, one column per vocabulary word.
count_vect_df = pd.DataFrame(countVector.toarray(), columns=feature_names)
print(count_vect_df.head())

# Total occurrences per word (column 0), keeping the 30 most frequent.
count = pd.DataFrame(count_vect_df.sum())
countdf = count.sort_values(0, ascending=False).head(30)
print(countdf[0:21])
# Move the words from the index into a regular 'index' column for plotting.
countdf = countdf.reset_index()

#%% Top 20 word occurrences in tweets related to novobanco
plt.figure(figsize=(14, 7))

ax = sns.barplot(x="index", y=0, data=countdf[0:20],
                 palette="Blues_r")

plt.title("Top 20 words occurrences in tweets related to novobanco", size=18)
plt.xlabel('Words', size=14)
plt.ylabel('Number of occurrences', size=14)
plt.xticks(rotation=60, size=13)
plt.yticks(size=13)

## **4. Topic modeling using DMM with Gibbs Sampling**

**Note:** This section uses the GSDMM implementation available at https://github.com/CAIR-ZA/GPyM_TM

In [None]:
!pip install GPyM-TM==3.0.0


In [None]:
from GPyM_TM import GSDMM
import pandas as pd


def runTopic(nTopics, alpha, beta, nTopWords, iters, resultsy, topicCoehrence,
             corpus_path="/content/drive/MyDrive/Scripts/Sentiment Analysis and Topic Modelling Study/Topic Modeling/Data_to_topic.txt"):
    """Fit one GSDMM.DMM model on the corpus file and record its results.

    Args:
        nTopics, alpha, beta, nTopWords, iters: Forwarded unchanged to
            ``GSDMM.DMM`` under the same keyword names.
        resultsy: List that receives ``len(finalAssignments)`` for this run
            (printed as the final number of topics found).
        topicCoehrence: List that receives the coherence score of this run.
        corpus_path: File handed to ``GSDMM.load_file``. Defaults to the
            path that was previously hard-coded in this function.

    Returns:
        The tuple ``(selected_psi, selected_theta)`` produced by
        ``worddist()``. Earlier versions evaluated these two names as bare
        statements and returned ``None``; callers that ignore the return
        value are unaffected.
    """
    corpus = GSDMM.load_file(corpus_path)

    data_dmm = GSDMM.DMM(corpus, nTopics=nTopics, alpha=alpha, beta=beta,
                         nTopWords=nTopWords, iters=iters)

    data_dmm.topicAssigmentInitialise()  # Performs the initial document assignments and counts
    data_dmm.inference()

    # Only the "selected" distributions are used; the full psi/theta are discarded.
    _psi, _theta, selected_psi, selected_theta = data_dmm.worddist()

    finalAssignments = data_dmm.writeTopicAssignments()  # Records the final topic assignments for the documents

    coherence_topwords = data_dmm.writeTopTopicalWords(finalAssignments)  # Records the top words

    score = data_dmm.coherence(coherence_topwords, len(finalAssignments))  # Calculates the coherence

    print("Final number of topics found: " + str(len(finalAssignments)))

    # Results are reported through the caller-supplied accumulator lists.
    resultsy.append(len(finalAssignments))
    topicCoehrence.append(score)

    return selected_psi, selected_theta


# Parameter sweep: every entry of paramsToChoose is passed to runTopic as
# `alpha`, while the remaining settings stay fixed.
# Alternative grid (kept commented out):
#paramsToChoose = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 400, 500, 600, 700]
paramsToChoose = [0.7]

# Parallel result lists: swept value, final number of topics, coherence score.
resultsx, resultsy, topicCoehrence = [], [], []

for x in paramsToChoose:
    resultsx.append(x)
    nTopics, alpha, beta, nTopWords, iters = 400, x, 0.3, 10, 100
    runTopic(nTopics, alpha, beta, nTopWords, iters, resultsy, topicCoehrence)

# Save one row per run to '<title>.csv' in the Topic Modeling folder.
# NOTE(review): the 'startingK' column holds the swept values, which are
# passed as alpha here while nTopics is fixed at 400 - confirm that the
# column names and the title still describe this experiment.
title = "Influence of iteractions"
data = pd.DataFrame({
    'startingK': resultsx,
    'endingK': resultsy,
    'coeherence': topicCoehrence,
})

data.to_csv(r'/content/drive/MyDrive/Scripts/Sentiment Analysis and Topic Modelling Study/Topic Modeling/' + title + '.csv')