## Determining the trending policy topics in Galway based on analysis of tweets from Local Media house (Newspaper or Radio station).

### Accessing the Posts on a tweets using twitter library:GetOldTweets3

In [0]:
import GetOldTweets3 as got
import time
import operator
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt

import gensim
import pyLDAvis.gensim
from gensim.utils import simple_preprocess
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [0]:
#Utility to create LDA model from twitter data.
class TwitterLDAUtil:
  
  def __init__(self, pages=None, num_topics=None, epoch=10, start_date="2018-09-01", end_date="2019-03-19", alpha=0.03, preload_tweets=None):
    
    np.random.seed(2018)
    self.stemmer = SnowballStemmer('english')
    self.start_date=start_date
    self.end_date=end_date
    self.alpha=alpha
    self.num_topics=num_topics
    self.epoch=epoch
    
    if preload_tweets is None:
      if pages is None:
        raise ValueError('Please provide a list of twitter handlers (:pages) to load data.')
      else:
        self._load_tweets(pages)
    else:
      self.all_tweets=preload_tweets
     
  #load tweets from handler pages.
  def _load_tweets(self, pages):
    
    self.all_tweets = []
    for page in pages:
      filter_criteria = got.manager.TweetCriteria()\
                           .setUsername(page)\
                           .setSince(self.start_date)\
                           .setUntil(self.end_date)

      tweets = got.manager.TweetManager.getTweets(filter_criteria)
      for tweet in tweets:
          self.all_tweets.append(tweet.text)
  
  #filter tweets based on domain
  def _filter_tweets(self, domain):
    
    self.tweet_data=[]
    for tweet in self.all_tweets:
      words = tweet.split(' ')
      d=0
      for word in words:
          word = word.lower()
          if(word in domain):
              d = 1
     
      if(d==1):
          self.tweet_data.append(tweet)
  
  #lemmatize a single token
  def _lemmatize_stemming(self,token):
    return self.stemmer.stem(WordNetLemmatizer().lemmatize(token, pos='v'))
  
  #removing single quotes
  def _remove_single_quotes(self,token):
    return re.sub("\'", "", token)

  #removes url token
  def _if_token_is_url(self,token):
    if 'http' in token:
      return True
    else:
      return False
          
  def _preprocess_data(self, tweet):
    result = []
    
    #removing stop words, removing punctuations and unnecessary characters and finally lemmatising.
    for token in gensim.utils.simple_preprocess(tweet, deacc=True):
        
        #removes url token
        if self._if_token_is_url(token):
            break
        
        token = self._remove_single_quotes(token)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(self._lemmatize_stemming(token))
            
    return result
  
  def _get_data_set(self):
    
    domain_data = list(set(self.tweet_data))
    df = pd.DataFrame(columns=['Tweet','Index'])
    df['Tweet'] = domain_data
    df['Index'] = df.index
    return df
    
  #create LDA model with given parameters
  def build(self, domain, doc_model="bag_of_words"):
    
    if self.num_topics is None:
      raise ValueError('num_topics cannot be None.')
    else:
      self._filter_tweets(domain)
      df=self._get_data_set()
      self.df_preprocessed = df['Tweet'].map(self._preprocess_data)
      self.dictionary = gensim.corpora.Dictionary(self.df_preprocessed)
      self.dictionary.filter_extremes(no_above=15)
      self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in self.df_preprocessed]
      self.lda_model = gensim.models.LdaMulticore(corpus=self.bow_corpus, num_topics=self.num_topics, chunksize=100, id2word=self.dictionary, 
                                                  passes=self.epoch, workers=4, alpha=self.alpha)
      return copy.deepcopy(self)

  def display_topics(self):
    for idx, topic in self.lda_model.print_topics(-1):
      print('Topic: {} \nWords: {}'.format(idx, topic))
    
  #calculate perplexity and coherence.
  #More the coheremce score, better is the model.
  #Less the perplexity, better is the topic selection.
  def get_perplexity_and_coherence(self):
    
    # Compute Perplexity
    print('\nPerplexity: ', self.lda_model.log_perplexity(self.bow_corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=self.lda_model, texts=self.df_preprocessed, dictionary=self.dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    print()
  
  def visualise_topics(self):
    
    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(self.lda_model, self.bow_corpus, self.dictionary)
    return vis
    
  #Get summary of twitter data, group by topic and keywords. 
  def get_summary(self,domain_name):
    topic_summary_ds = pd.DataFrame()
    for i, row in enumerate(self.lda_model[self.bow_corpus]):
        for j, (topic_num, prop_topic) in enumerate(row):
           wp = self.lda_model.show_topic(topic_num)
           keywords = ", ".join([word for word, prop in wp])
           topic_summary_ds = topic_summary_ds.append(pd.Series([int(topic_num), keywords]), ignore_index=True)
        
    topic_summary_ds = topic_summary_ds.reset_index()
    topic_summary_ds.columns = ['Tweet_No','Topic_no','Keywords']
    topic_summary_ds = topic_summary_ds.groupby(['Topic_no','Keywords'], as_index=True).agg(['count'])
    topic_summary_ds['Domain']=[domain_name] * self.num_topics
    topic_summary_ds.columns = ['No_of_tweets','Domain']
    return topic_summary_ds

### Creating keywords from public open data sources via manual inspection of data samples. And, leveraging existing topic modelling technique - LDA

In [0]:
#Key domain areas
demographic = ["population","male","female","men","women","man","woman","age","birth","years","household","domestic"]

health_education = ["patient","hospital","health","school","college","university","primary","secondary","pupil","special",
                    "community","comprehensive","department","education","skill","level"]

crime_emergency = ["crime","accident","murder","robbery","theft","assault","violence","injured","kill","die","collision","safety","rate","emergency",
                   "harassment", "sexual", "road", "offence", "rescue", "removal", "ambulance", "flooding",
                   "flood", "safety"]

government = ['brexit','#brexit','policy','partnership','project','housing','grants']

arts_culture = ['2020','#stpatricksfest','@stpatricksfest','#stpatricksday','patrick\'s', 'festival', 'celebration','halloween','arts','culture',
                '#galway2020', '#wavemakers']

transport = ['real-time','bus','routes','route','delays','fares','travel','streets','road']

#Twitter handlers to obtain data/posts
pages = ["Galwaybayfmnews", "GalwayCoCo", "GalwayCityCo", "Ctribune", "galwaypage","StPatricksFest", "rtenews","galway2020",
         "galwaydrivers","Buseireann"]

#Create a utility model with fixed no. of topics, epochs and learning rate alpha.
model=TwitterLDAUtil(pages, num_topics=3, epoch=10, alpha=0.03)

#load all domain models
demographic_model=model.build(domain=demographic, doc_model="bag_of_words")
health_education_model=model.build(domain=health_education, doc_model="bag_of_words")
crime_emergency_model=model.build(domain=crime_emergency, doc_model="bag_of_words")
government_model=model.build(domain=government, doc_model="bag_of_words")
arts_culture_model=model.build(domain=arts_culture, doc_model="bag_of_words")
transport_model=model.build(domain=transport, doc_model="bag_of_words")

In [0]:
#display topic kewords with their weightage
transport_model.display_topics()

Topic: 0 
Words: 0.085*"arriv" + 0.079*"regard" + 0.043*"kind" + 0.030*"stop" + 0.029*"rout" + 0.025*"apolog" + 0.024*"number" + 0.018*"road" + 0.016*"thank" + 0.015*"travel"
Topic: 1 
Words: 0.093*"road" + 0.030*"galway" + 0.020*"rout" + 0.019*"citi" + 0.017*"twitter" + 0.013*"work" + 0.013*"cork" + 0.012*"delay" + 0.011*"safeti" + 0.011*"advis"
Topic: 2 
Words: 0.069*"servic" + 0.062*"regard" + 0.051*"rout" + 0.050*"oper" + 0.033*"delay" + 0.030*"kind" + 0.020*"eireann" + 0.019*"weather" + 0.018*"updat" + 0.017*"condit"


In [0]:
#Display Perplexity and Coherence
arts_culture_model.get_perplexity_and_coherence()


Perplexity:  -5.692971435836454

Coherence Score:  0.28828999536412275



In [0]:
#Visualising topics and keywords
arts_culture_model.visualise_topics()

### Analysis of the relevant posts to determine to provide required information (counts of posts per policy area) 
### Storing the summary into a csv file.

In [0]:
demographic_summary = demographic_model.get_summary("demographic")
health_education_summary = health_education_model.get_summary("health_education")
crime_summary = crime_emergency_model.get_summary("crime_emergency")
government_summary = government_model.get_summary("government")
arts_culture_summary = arts_culture_model.get_summary("arts_culture")
transport_summary = transport_model.get_summary("transport")

final_summary = pd.concat([demographic_summary, health_education_summary, crime_summary, government_summary, arts_culture_summary, transport_summary])
final_summary.to_csv("/content/drive/My Drive/summary.csv", sep=',')