In [None]:
# Notebook-level settings; fill these in before running the cells below.
# NOTE: app.py (written by the %%writefile cell) declares its own copies of these
# three names, so a value set here is not seen by the Streamlit script.
CWD = ""  # not read by any notebook cell shown here
TWITTER_YAML = ''  # path of the credentials file; the ngrok cell parses it as JSON and reads ['ngrok']['NGROK_AUTH_TOKEN']
SAVED_TO = ""  # not referenced by any cell shown here

# Install packages

In [None]:
!pip install streamlit==1.14.0
!pip install --no-cache-dir --upgrade tweepy
!pip install gensim
!pip install pyngrok
!pip install joblib
!pip install contractions
!pip install pyspellchecker
!pip install fasttext
!pip install tomotopy
!pip install altair
!pip install -U scikit-learn
!pip install scikeras
!pip install tensorflow_text
!pip install --upgrade scipy
!pip install --upgrade numba 

# Main code

In [None]:
%%writefile app.py

#####################
# Deployment settings: fill these in before this cell is written to app.py.
CWD = ""            # prefix of the "data/..." and "model/..." paths used further down
TWITTER_YAML = ''   # credentials file; despite the name it is parsed as JSON
SAVED_TO = ""       # not referenced anywhere in this script

#####################
import json

# Read the credentials file in one go, drop newlines and tabs, then parse it as JSON.
with open(TWITTER_YAML, 'r') as file:
  yamljson = json.loads(file.read().replace("\n", "").replace("\t", ""))

TWITTER_BEARER_TOKEN = yamljson['twitter']['Bearer_Token']

#####################
import tweepy
import joblib
import html
import re
import contractions
import string
import nltk
import fasttext 
import time
import pandas as pd
import tomotopy as tp
import streamlit as st
import numpy as np
import altair as alt
import joblib
import tensorflow as tf
import gensim.corpora.dictionary as Dictionary
import gensim.models.tfidfmodel as TfidfModel

from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances
from nltk.collocations import *
from nltk.tag import HunposTagger

SEPARATOR = "<new_tweet>"  # marker replace() inserts between rows when it joins a Series into one passage
CHART_WIDTH = 702  # width passed to the Altair charts' .properties()
CHART_HEIGHT = 395  # height passed to the Altair charts' .properties()
PRODUCT_REVIEW = "product_review_3"  # DataFrame column that receives the product-review prediction
SENTIMENT = "sentiment4"  # DataFrame column that receives the sentiment prediction
DEBUG = False  # when True the main section overrides the search query with "iphone"

#####################Wrapper function#####################
def detect_lang_wrap(tweet):
  """Return the first language label predicted for ``tweet``.

  Uses the module-level ``detect_lang`` model. A missing value (NaN/None)
  is not passed to the model and yields ``np.nan``.
  """
  if pd.isna(tweet):
    return np.nan
  labels = detect_lang.predict(tweet)[0]
  return labels[0]

def html_escape_helper(tweet):
  """Decode HTML entities in ``tweet``; a missing value is returned untouched."""
  # html.unescape only accepts strings, so NaN/None must bypass it
  return tweet if pd.isna(tweet) else html.unescape(tweet)

def noun_only(x):
  """Join the tokens tagged 'NN' (as str or bytes) with single spaces.

  ``x`` is an iterable of (token, tag) pairs; pairs with any other tag are skipped.
  """
  # https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2
  noun_tags = (b'NN', 'NN')
  nouns = []
  for tagged in x:
    if tagged[1] in noun_tags:
      nouns.append(tagged[0])
  return ' '.join(nouns)

def bigram_filter(bigram):
    """Decide whether a two-word tuple is kept as a collocation.

    The bigram is rejected when any of these holds:
      * the first word is not tagged JJ/NN and, at the same time, the second
        word is not tagged NN;
      * either word is an English stop word;
      * one of its words is the bare token 'n', 't' or 'PRON'.

    Returns True when the bigram passes every check, False otherwise.
    """
    english_stops = stopwords.words('english')
    tagged = nltk.pos_tag(bigram)
    first_tag, second_tag = tagged[0][1], tagged[1][1]

    wrong_pos = first_tag not in ('JJ', 'NN') and second_tag != 'NN'
    has_stop_word = bigram[0] in english_stops or bigram[1] in english_stops
    has_fragment = 'n' in bigram or 't' in bigram
    has_pron = 'PRON' in bigram
    return not (wrong_pos or has_stop_word or has_fragment or has_pron)

def replace_ngram(x, bigrams):
    """Fuse every listed phrase inside ``x`` into a single underscore-joined token.

    A phrase such as "new york" becomes "__new_york__". Phrases are applied in
    the order given, each one on the result of the previous replacement.
    """
    fused_text = x
    for phrase in bigrams:
        token = '_'.join(('_', *phrase.split(), '_'))
        fused_text = fused_text.replace(phrase, token)
    return fused_text

def return_topic(x, topic):
  """Return ``topic[int(x)]``; a missing ``x`` (NaN/None) gives ``None``."""
  if pd.isna(x):
    return None
  return topic[int(x)]


#####################Replace function#####################
def replace(series, action, parameter = None):
  """Apply a passage-level text transformation to every row of a Series.

  The rows are joined into one passage with SEPARATOR between them, ``action``
  is called once as ``action(passage, parameter)`` and its result is split on
  SEPARATOR to rebuild a Series carrying the original index.

  Missing and empty rows reach ``action`` as the literal token "nan" (they are
  stringified before joining); rows that come back as "nan" are returned as NaN.

  Args:
    series: pandas Series of text.
    action: callable taking (passage, parameter) and returning the new passage.
    parameter: opaque value forwarded to ``action``.

  Returns:
    A Series with the same index as ``series``; an empty input is returned as
    an (untransformed) empty copy.

  Raises:
    ValueError: if ``action`` returns a passage that no longer splits into
      exactly one piece per input row.
  """
  # An empty Series joins to "", which splits back into ONE piece and can
  # therefore never be given the (empty) original index.
  if len(series) == 0:
    return series.copy()

  #1. standardize
  saved_index = series.index
  rows = series.replace("", np.nan).apply(str).str.strip()
  passage = rows.str.cat(sep = " " + SEPARATOR + " ", na_rep = '')
  passage = re.sub(r"\s+", " ", passage)

  #2. action
  passage = action(passage, parameter)

  #3. split back into rows
  pieces = passage.split(SEPARATOR)
  if len(pieces) != len(saved_index):
    raise ValueError(
      "action returned %d row(s) for %d input row(s); it must keep every %s marker"
      % (len(pieces), len(saved_index), SEPARATOR)
    )
  output = pd.Series(pieces).str.strip().replace("nan", np.nan)
  output.index = saved_index
  return output

def fix_cannot(passage, parameter):
  """Rewrite every occurrence of "cannot" in ``passage`` as "not"; ``parameter`` is unused."""
  return passage.replace("cannot", "not")

def fix_contractions(passage, parameter):
  """Return ``passage`` after running it through ``contractions.fix``; ``parameter`` is unused."""
  expanded = contractions.fix(passage)
  return expanded

def get_unique_token_from_string(passage, parameter):
  """List the distinct single-space-delimited tokens of ``passage`` in first-seen order.

  Consecutive spaces produce an empty-string token. ``parameter`` is unused.
  """
  return list(dict.fromkeys(passage.split(" ")))

def stop_word(passage, parameter):
  """Drop English stop words from ``passage`` while keeping the negations.

  The NLTK English stop-word list is used, minus "no", "not" and "nor", which
  stay in the text. ``parameter`` is unused.

  Returns:
    The passage with the remaining tokens joined by single spaces.
  """
  negations = {"no", "not", "nor"}
  # Set difference instead of list.remove(): cannot raise if a negation is
  # absent from the stop list, and gives constant-time membership tests below.
  stop_words = set(stopwords.words('english')) - negations

  # str.split(" ") and str.split() are different: " " keeps empty tokens
  return ' '.join(word for word in passage.split(" ") if word not in stop_words)

def unknown_word(passage, parameter):
  """Drop the tokens of ``passage`` that the spell checker does not know.

  The literal "nan" token and SEPARATOR are always kept, so the row structure
  of the passage survives. ``parameter`` is unused.

  Returns:
    The passage with the remaining tokens joined by single spaces.
  """
  tokens = passage.split(" ") # str.split(" ") and str.split() are different

  #1. Build dictionary
  spell = SpellChecker()
  misspelled = set(spell.unknown(list(dict.fromkeys(tokens))))
  # discard() rather than remove(): a single-row passage holds no SEPARATOR,
  # and remove() would raise KeyError for it.
  misspelled.discard("nan")
  misspelled.discard(SEPARATOR)

  #2. Remove
  return ' '.join(word for word in tokens if word not in misspelled)

def remove_not(passage, parameter):
  """Strip the tokens "no", "not" and "nor" from ``passage``; ``parameter`` is unused.

  The passage is split on single spaces, so empty tokens produced by
  consecutive spaces are preserved.
  """
  negations = ("no", "not", "nor")
  kept = filter(lambda token: token not in negations, passage.split(" "))
  return ' '.join(kept)

def reduce_redundancy(passage, parameter):
  """Collapse near-synonymous words of ``passage`` onto one representative each.

  Every distinct word is embedded with ``word2vec_vector``; words whose vector
  is all zeros are dropped from the passage. The remaining words are grouped by
  complete-linkage agglomerative clustering on cosine distance, cut at
  ``parameter["synonym"]``, and each word is replaced by the most frequent
  word of its cluster. SEPARATOR and empty tokens are never clustered and pass
  through unchanged, so the row structure of the passage is preserved.

  Side effect: the word/cluster table is written to CWD + "data/df_word.csv".

  Args:
    passage: space-delimited text with rows delimited by SEPARATOR.
    parameter: dict whose "synonym" entry is the cosine-distance threshold.

  Returns:
    The rewritten passage.
  """
  #1. Build dictionary
  tokens = passage.split(" ")
  word_counter = Counter(tokens)
  # Structural tokens stay out of the clustering; otherwise a real word sharing
  # a cluster with the (very frequent) SEPARATOR would be rewritten into it.
  passthrough = (SEPARATOR, '')
  word = [w for w in word_counter if w not in passthrough]

  #2. Generating word vector
  vectors = {}
  if word:
    word_vector_main = word2vec_vector(word)
    for i, w in enumerate(word):
      word_vector = word_vector_main[i][0]
      if not np.all(word_vector == 0):
        vectors[w] = word_vector
  df_word = pd.DataFrame.from_dict(vectors, orient = "index")

  #3. Get clustering of each word
  if df_word.shape[0] >= 2:
    clustering = AgglomerativeClustering(distance_threshold = parameter["synonym"], n_clusters = None, metric = "cosine", linkage = "complete").fit(df_word)
    labels = clustering.labels_
  else:
    # The clusterer needs at least two samples; zero or one word is its own cluster.
    labels = np.zeros(df_word.shape[0], dtype = int)
  df_word = df_word.join(pd.Series(labels, index = df_word.index, name = "Clusters"))
  # NOTE: despite its name, "sum_tfidf" holds the raw occurrence count of each word.
  df_word = df_word.join(pd.DataFrame.from_dict(word_counter, orient = "index", columns = ["sum_tfidf"]))

  #4. Label each cluster with its most frequent word
  ranked = df_word.sort_values(by = ["Clusters", "sum_tfidf"], ascending = [True, False])
  leaders = ranked.drop_duplicates(subset = "Clusters", keep = "first")
  cluster_label = dict(zip(leaders["Clusters"], leaders.index))
  df_word['cluster_label'] = df_word['Clusters'].map(cluster_label)
  dictionary = dict(zip(df_word.index, df_word['cluster_label']))

  #5. Replacing word with synonyms
  dictionary[SEPARATOR] = SEPARATOR
  dictionary[''] = ''
  passage = ' '.join(dictionary[token] for token in tokens if token in dictionary)

  df_word = df_word.reset_index()
  df_word[["index", "Clusters", "sum_tfidf", 'cluster_label']].to_csv(CWD + "data/df_word.csv")
  return passage

def out_of_vocabulary(passage, parameter):
  """Map every word of ``passage`` onto its nearest word in a reference vocabulary.

  Words of the passage and of ``parameter["vocab"]`` are embedded with
  ``word2vec_vector``. Each passage word is replaced by the vocabulary word
  with the smallest cosine distance, provided that distance does not exceed
  ``parameter["synonym"]``; a word with no vocabulary word that close becomes
  an empty string. Words whose vector is all zeros are dropped. SEPARATOR and
  empty tokens pass through unchanged.

  Args:
    passage: space-delimited text with rows delimited by SEPARATOR.
    parameter: dict with "vocab" (iterable of reference words) and "synonym"
      (cosine-distance threshold).

  Returns:
    The rewritten passage.
  """
  #1. Built vocabulary
  tokens = passage.split(" ")
  vocab = list(dict.fromkeys(parameter["vocab"]))
  passthrough = (SEPARATOR, '')
  # Filtering instead of list.remove(SEPARATOR): a single-row passage holds no
  # separator and remove() would raise ValueError for it.
  current_vocab = [w for w in dict.fromkeys(tokens) if w not in passthrough]
  word = current_vocab + vocab

  #2. Generating word vector
  vectors = {}
  if word:
    word_vector_main = word2vec_vector(word)
    for i, w in enumerate(word):
      word_vector = word_vector_main[i][0]
      if not np.all(word_vector == 0):
        vectors[w] = word_vector

  # Only words that received a vector exist as labels of the distance matrix.
  known_current = [w for w in current_vocab if w in vectors]
  known_vocab = [w for w in vocab if w in vectors]

  #3. Generating Distance matrix and Get nearest word
  convert = dict.fromkeys(known_current, "")  # default: no close-enough neighbour
  if known_current and known_vocab:
    df_word = pd.DataFrame.from_dict(vectors, orient = "index")
    dist_mat = pd.DataFrame(cosine_distances(df_word), columns = df_word.index, index = df_word.index)
    dist_mat = dist_mat.loc[known_vocab, known_current]
    dist_mat = dist_mat.where(dist_mat <= parameter["synonym"])
    # idxmin is only asked about columns that still hold a candidate.
    reachable = dist_mat.dropna(axis = 1, how = "all")
    convert.update(reachable.idxmin().to_dict())

  #4. Replacing word with synonyms
  convert[SEPARATOR] = SEPARATOR
  convert[''] = ''
  return ' '.join(convert[token] for token in tokens if token in convert)
  
def stemming(passage, parameter):
  """Replace each token of ``passage`` by the shortest token sharing its Porter stem.

  Using an existing token as the representative (instead of the raw stem)
  keeps the output made of real words. '<new_tweet>' and empty tokens map to
  themselves.
  """
  tokens = passage.split(" ")

  #1. Stem every distinct token once
  unique_tokens = get_unique_token_from_string(passage, parameter)
  porter = PorterStemmer()
  term = pd.DataFrame({"index": unique_tokens, "stem": [porter.stem(token) for token in unique_tokens]})
  term["len"] = term["index"].map(len)

  #2. For each stem pick the shortest original token as its readable form
  shortest = term.sort_values(by = ["stem", "len"]).drop_duplicates(subset = "stem", keep = "first")
  stem_to_word = dict(zip(shortest["stem"], shortest["index"]))
  word_to_repr = {token: stem_to_word[stem] for token, stem in zip(term["index"], term["stem"])}

  #3. Rewrite the passage
  word_to_repr['<new_tweet>'] = '<new_tweet>'
  word_to_repr[''] = ''
  return ' '.join(word_to_repr[token] for token in tokens if token in word_to_repr)

#####################Core function#####################
def fetch_tweet(search_query, debug = False):
  """Return a DataFrame of tweets for ``search_query``.

  With ``debug`` set, the rows come from CWD + "data/1.data.csv" instead: the
  "tweet" column is renamed to "text" and the PRODUCT_REVIEW / SENTIMENT label
  columns are converted from their text labels to integer codes.
  Otherwise up to 1000 recent English non-retweets are requested from the
  Twitter API with the module-level bearer token.

  The elapsed time is printed before returning.
  """
  began = time.time()
  if not debug:
    client = tweepy.Client(TWITTER_BEARER_TOKEN, return_type = dict)
    pages = tweepy.Paginator(client.search_recent_tweets, query = search_query + " -is:retweet lang:en", max_results = 100)
    df = pd.DataFrame(list(pages.flatten(limit = 1000)))
  else:
    review_codes = {"Product Review" : 1, "Not Product Review": 0}
    sentiment_codes = {"Positive emotion" : 2, "No emotion toward brand or product": 1, "Negative emotion": 0}
    df = pd.read_csv(CWD + "data/1.data.csv").rename(columns = {"tweet": "text"})
    df[PRODUCT_REVIEW] = df[PRODUCT_REVIEW].replace(review_codes)
    df[SENTIMENT] = df[SENTIMENT].replace(sentiment_codes)
  print("0. Fetch Tweet: ", time.time() - began)
  return df

def text_cleaning(series):
  """Strip markup, Twitter artefacts and URLs from a Series of raw tweets.

  Steps, in order: UTF-8 round trip, HTML entity decoding, newlines and
  non-ASCII runs turned into spaces, removal of retweet / "via" prefixes, the
  standalone token "RT", the "@mention" placeholder, every word containing a
  digit, the "{link}" placeholder and http / bit.ly URLs.

  Missing values stay missing. Whitespace is not collapsed here. The elapsed
  time is printed before returning.

  Args:
    series: pandas Series of raw tweet text.

  Returns:
    A new Series with the same index.
  """
  start = time.time()

  ##Fix encoding, remove formatting and non-ASCII character
  series = series.str.encode("utf-8").str.decode("utf-8")
  # html.unescape does not accept NaN, hence na_action
  series = series.map(html.unescape, na_action = "ignore")
  # A line break separates words, so it becomes a space instead of vanishing
  series = series.str.replace("\n", " ", regex = False)
  series = series.str.replace(r'[^\x00-\x7F]+', " ", regex = True)

  ##Remove tweet character
  series = series.str.replace(r"RT @\w+", "", regex = True)
  series = series.str.replace(r"via @\w+", "", regex = True)
  # Word boundaries keep "RT" inside words such as "SMART" intact
  series = series.str.replace(r"\bRT\b", "", regex = True)
  series = series.str.replace(r"@mention", "", regex = False)
  series = series.str.replace(r'\w*\d\w*', "", regex = True)

  ##Remove URL
  series = series.str.replace(r"{link}", "", regex = False)
  series = series.str.replace(r"http\S+", "", regex = True)
  # The dot is escaped so only the literal host "bit.ly" matches
  series = series.str.replace(r"bit\.ly\S+", "", regex = True)

  print("1. Text Cleaning: ", time.time() - start)

  return series.copy()

def text_preprocessing(series):
  """Normalise cleaned tweets and derive a noun-only version of each.

  The text is lower-cased, "'s" is removed, contractions are expanded and
  punctuation is translated away. Tweets whose detected language label is not
  "__label__en" are blanked to NaN. The noun column is produced by tagging the
  whitespace-normalised tokens with the module-level ``hpt`` tagger and keeping
  the nouns; stop words and unknown words are then removed from the tweet text.

  Returns:
    (noun Series, cleaned tweet Series), both on the index of ``series``.
    The elapsed time is printed before returning.
  """
  # Every ASCII punctuation mark becomes a space, except '#', '-' and '_'
  # (deleted) and '/' (spelled out as " or ").
  table = {mark: ' ' for mark in string.punctuation}
  table.update({'#': '', '-': '', '_': '', '/': ' or '})

  start = time.time()

  #Lower case, drop possessives, expand contractions
  lowered = series.str.lower()
  without_possessive = lowered.str.replace(r"'s", "", regex = False)
  expanded = replace(without_possessive, fix_contractions)

  #Remove punctuation
  stripped = expanded.str.translate(str.maketrans(table))

  #Remove non English Tweet
  temp = pd.concat([stripped, stripped.apply(detect_lang_wrap)], axis = 1, keys = ["cleaned_tweet", "lang"])
  temp["cleaned_tweet"] = temp["cleaned_tweet"].where(temp["lang"] == "__label__en")

  #Get noun
  temp["noun"] = (
    temp["cleaned_tweet"]
    .replace("", np.nan)
    .apply(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex = True)
    .str.split(" ")
    .apply(hpt.tag)
    .apply(noun_only)
  )

  #Remove stopword, then unknown word spelling
  for cleaner in (stop_word, unknown_word):
    temp['cleaned_tweet'] = replace(temp['cleaned_tweet'], cleaner)

  print("2. Text preprocessing: ", time.time() - start)

  return temp["noun"], temp['cleaned_tweet']

def cleanup(series):
  """Final tidy-up: rewrite "cannot" as "not", collapse whitespace runs and trim each row.

  The elapsed time is printed before returning.
  """
  began = time.time()

  tidy = replace(series, fix_cannot).str.replace(r"\s+", " ", regex = True).str.strip()

  print("3. Cleanup: ", time.time() - began)

  return tidy

def assign_pr(df, model, vocab, predict_proba = False):
  """Score every tweet with the product-review classifier.

  The cleaned tweets are first mapped onto the classifier vocabulary (stored in
  df['pr_tweet']), turned into a TF-IDF matrix whose columns are exactly
  ``vocab`` in order, and passed to ``model``. The result is written to
  df[PRODUCT_REVIEW]; ``df`` itself is modified and a copy is returned.

  Args:
    df: DataFrame with a 'cleaned_tweet' column.
    model: fitted classifier exposing predict / decision_function / predict_proba.
    vocab: feature names the model expects, in the expected column order.
    predict_proba: when True store a score instead of the predicted class -
      the decision function for SVC and hinge-loss SGDClassifier, otherwise
      the probability of the second class.

  Returns:
    A copy of ``df`` including the 'pr_tweet' and PRODUCT_REVIEW columns.
  """
  start = time.time()
  df['pr_tweet'] = replace(df['cleaned_tweet'], out_of_vocabulary, {"vocab": vocab, "synonym": 0.38})

  # NOTE(review): the IDF weights are fitted on this batch of tweets, not taken
  # from training - confirm this matches how the model was trained.
  vectorizer = TfidfVectorizer()
  # replace() yields NaN for rows reduced to "nan"; the vectorizer only accepts strings
  X = vectorizer.fit_transform(df["pr_tweet"].fillna("")).toarray()
  X = pd.DataFrame(X, columns = vectorizer.get_feature_names_out())
  # Keep exactly the model's features, in its order; unseen features are all zero
  X = X.reindex(columns = vocab, fill_value = 0.0)

  model_name = type(model).__name__
  if not predict_proba:
    df[PRODUCT_REVIEW] = model.predict(X)
  elif model_name == "SVC" or (model_name == "SGDClassifier" and model.loss == "hinge"):
    # margin-based models: use the signed distance to the decision boundary
    df[PRODUCT_REVIEW] = model.decision_function(X)
  else:
    df[PRODUCT_REVIEW] = model.predict_proba(X)[:,1]
  print("3. Assign Review: ", time.time() - start)
  return df.copy()

def assign_sentiment(df, model, predict_proba = False):
  """Predict the sentiment of every row of df['cleaned_tweet'].

  With ``predict_proba`` the raw output of ``model.predict`` is returned.
  Otherwise the index of the largest score of each row is stored in
  df[SENTIMENT] (``df`` is modified) and a copy of ``df`` is returned.
  The elapsed prediction time is printed in both cases.
  """
  began = time.time()
  scores = model.predict(df['cleaned_tweet'], verbose = 0)
  print("3. Assign Sentiment: ", time.time() - began)
  if predict_proba:
    return scores
  df[SENTIMENT] = list(map(np.argmax, scores))
  return df.copy()

def assign_topic(X, model, test = False):
  """Label every document of ``X`` with the top word of its best-matching topic.

  Steps: frequent, high-PMI bigrams are fused into single tokens; a topic
  model of class ``model`` (10 topics, IDF term weighting, fixed seed) is
  trained on the documents; each topic's word distribution is re-weighted by
  IDF and the best word becomes the topic label; finally every document gets
  the topic with the highest summed weight over the vocabulary words it
  contains.

  Args:
    X: pandas Series of space-delimited noun strings.
    model: tomotopy model class, called as
      model(tw = ..., k = ..., corpus = ..., seed = ...).
    test: when True also print the c_v coherence of the top-10 words per topic.

  Returns:
    A Series on the index of ``X`` holding the topic label, or None for a
    document that contains no word of the model vocabulary. An empty ``X``
    gives an empty Series. The elapsed time is printed before returning.
  """
  start = time.time()
  if len(X) == 0:
    # nothing to train a topic model on
    print("4. Assign Topic: ", time.time() - start)
    return pd.Series(index = X.index, dtype = object)

  #detect bigram
  bigram_measures = nltk.collocations.BigramAssocMeasures()
  finder = BigramCollocationFinder.from_documents([str(i).split() for i in X])
  finder.apply_freq_filter(7) # ignore bigrams seen fewer than 7 times
  scored_bigrams = finder.score_ngrams(bigram_measures.pmi)
  if scored_bigrams:
    # cheap score test first, so the POS-tagging filter runs on fewer candidates
    bigrams = [' '.join(pair) for pair, score in scored_bigrams if score > 2 and bigram_filter(pair)]
    X = X.apply(replace_ngram, args = (bigrams,))

  #model
  text_obj = tp.utils.Corpus()
  for doc in X:
    text_obj.add_doc(str(doc).split())
  mdl = model(tw = tp.TermWeight.IDF, k = 10, corpus = text_obj, seed = 42)
  mdl.burn_in = 100
  mdl.train(1000)

  #convert mdl.used_vocab to dict (term -> column position)
  vocab_terms = list(mdl.used_vocabs)
  used_vocabs = {term: position for position, term in enumerate(vocab_terms)}

  #build topic-term matrix in one go (DataFrame.append no longer exists in pandas 2)
  topic_word_df = pd.DataFrame(
    [mdl.get_topic_word_dist(topic_id = i) for i in range(mdl.k)],
    columns = vocab_terms
  )

  #tf-idf ranking
  vectorizer = TfidfVectorizer(vocabulary = used_vocabs)
  vectorizer.fit(X)
  idf = pd.Series(vectorizer.idf_, vectorizer.get_feature_names_out())
  topic_word_df_out = topic_word_df.mul(idf, axis = 1)

  #Labelling
  topics_for_coherence = []
  topics_for_inference = []
  for k in range(mdl.k):
    weights = topic_word_df_out.iloc[k]
    ranked = weights[~np.isinf(weights)].sort_values(ascending = False)
    topics_for_coherence.append(list(ranked.index[0:10]))
    topics_for_inference.append(ranked.index[0])

  #Inference
  vectorizer = CountVectorizer(vocabulary = used_vocabs)
  noun_count = pd.DataFrame(vectorizer.fit_transform(X).toarray(), columns = vectorizer.get_feature_names_out(), index = X.index)
  noun_count = noun_count.clip(upper = 1) # presence/absence, not frequency
  # DataFrame.dot keeps the document index on the result
  pseudo_prob = noun_count.dot(topic_word_df_out.to_numpy().T)
  pseudo_prob = pseudo_prob.div(pseudo_prob.sum(axis = 1), axis = 0)
  # Documents without any vocabulary word have an all-NaN row; they keep None
  has_topic = pseudo_prob.notna().any(axis = 1).to_numpy()
  best_topic = pseudo_prob[has_topic].idxmax(axis = 1)
  y = pd.Series([None] * len(X), index = X.index, dtype = object)
  y[has_topic] = [topics_for_inference[int(topic_id)] for topic_id in best_topic]

  #coherence
  if test:
    # Imported here because the module-level names Dictionary / TfidfModel are
    # bound to gensim *modules* (not the classes) and CoherenceModel is not
    # imported at all.
    from gensim.corpora import Dictionary
    from gensim.models import CoherenceModel, TfidfModel
    texts = [str(i).split() for i in X]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(i) for i in texts]
    model_tfidf = TfidfModel(corpus)
    corpus = [model_tfidf[i] for i in corpus]
    cm1 = CoherenceModel(topics = topics_for_coherence, texts = texts, corpus = corpus, dictionary = dictionary, coherence='c_v')
    print(cm1.get_coherence())
  print("4. Assign Topic: ", time.time() - start)
  return y


#####################Main#####################
start = time.time()
# Fetch the NLTK resources the script relies on, announcing each step.
for step, resource in enumerate(('stopwords', 'punkt', 'averaged_perceptron_tagger'), start = 1):
  print(str(step) + "/3 Loading Model")
  nltk.download(resource)

@st.cache(allow_output_mutation=True)
def load_model():
  """Load the three models from CWD + "model/" and return them as a tuple.

  Returns (fastText language identifier, joblib-loaded product-review model,
  Keras sentiment model). Wrapped in ``st.cache`` so Streamlit can reuse the
  result instead of calling the function again.
  """
  print("1/3 Loading Model")
  language_model = fasttext.load_model(CWD + "model/lid.176.ftz")

  print("2/3 Loading Model")
  review_model = joblib.load(CWD + "model/model_pr_new.sav")

  print("3/3 Loading Model")
  sentiment_model = tf.keras.models.load_model(CWD + 'model/model_st_new.tf')

  return language_model, review_model, sentiment_model

def word2vec_vector(word):
  """Run ``word`` through layer 0 and then layer 1 of the module-level ``model_st``.

  Returns the layer-1 output converted to a NumPy array.
  """
  first_layer, second_layer = (model_st.get_layer(index = i) for i in (0, 1))
  token_ids = first_layer(word)
  return second_layer(token_ids).numpy()

detect_lang, model_pr, model_st  = load_model()
hpt = HunposTagger(path_to_model = CWD + 'model/hunpos-1.0-linux/english.model', path_to_bin = CWD + 'model/hunpos-1.0-linux/hunpos-tag')
print(time.time() - start)

def _halt(message):
  """Show ``message`` as a warning and end the current Streamlit run."""
  st.warning(message)
  st.stop()

def _with_topics(frame):
  """Return a copy of ``frame`` with a 'topic' column derived from its 'noun' column."""
  frame = frame.copy()
  if frame.empty:
    # no documents to model; keep the column so the chart code still works
    frame["topic"] = pd.Series(dtype = object)
  else:
    frame["topic"] = assign_topic(frame["noun"], tp.PTModel)
  return frame

def _sentiment_by_topic_chart(frame):
  """Bar chart of the share of positive tweets per topic, most positive topic first.

  Only rows with SENTIMENT 0 or 2 are used; 2 counts as positive.
  """
  # The mask comes from the frame being filtered, so the two are always aligned
  polar = frame.loc[frame[SENTIMENT].isin([0, 2]), ['topic', SENTIMENT]].copy()
  polar[SENTIMENT] = (polar[SENTIMENT] == 2).astype(int)
  # Average the sentiment column only: a text column cannot be averaged
  chart_data = polar.groupby('topic', as_index = False)[SENTIMENT].mean()
  chart_data = chart_data.sort_values(by = [SENTIMENT], ascending = False).reset_index(drop = True)
  return alt.Chart(chart_data).mark_bar().encode(
      x = alt.X('topic', sort = None),
      y = SENTIMENT
  ).properties(
    width=CHART_WIDTH,
    height=CHART_HEIGHT
  )

st.title("Product Sentiment Explorer")
search_query = st.text_input(label = "Enter a product")
if search_query != '':
  start = time.time()
  if DEBUG:
    search_query = "iphone" #Debug
  # DEBUG also selects the offline data set inside fetch_tweet
  df = fetch_tweet(search_query, debug = DEBUG)
  if df.empty or "text" not in df.columns:
    _halt("No tweets were found for this search.")

  df["cleaned_tweet"] = text_cleaning(df["text"])
  df["noun"], df["cleaned_tweet"] = text_preprocessing(df["cleaned_tweet"])
  df["cleaned_tweet"] = cleanup(df["cleaned_tweet"])
  df = df[df['cleaned_tweet'].notna()].copy()
  df = df[df['cleaned_tweet'] != ""].copy()
  df = df.drop_duplicates(subset = ['cleaned_tweet'])
  if df.empty:
    _halt("No usable English tweets were left after cleaning.")
  df['cleaned_tweet'] = replace(df["cleaned_tweet"], reduce_redundancy, {"synonym": 0.38})
  df = assign_pr(df, model_pr, vocab = model_pr.feature_names_in_)
  df = assign_sentiment(df, model_st)

  # Noun pipeline: the order of the steps matters
  for noun_step, noun_parameter in (
      (stop_word, None),
      (remove_not, None),
      (unknown_word, None),
      (stemming, None),
      (reduce_redundancy, {"synonym": 0.20}),
  ):
    df['noun'] = replace(df['noun'], noun_step, noun_parameter)
  df = df[df['noun'].notna()].copy()
  df = df[df['noun'] != ""].copy()
  df = df.drop_duplicates(subset = ['noun'])
  if df.empty:
    _halt("No nouns could be extracted from the tweets.")

  df_pr = _with_topics(df[df[PRODUCT_REVIEW] == 1])
  df_not_pr = _with_topics(df[df[PRODUCT_REVIEW] == 0])
  print("Total:", time.time() - start)

  st.header("Proportion of Product Review Tweet with Positive Sentiment vs Topic")
  st.write(_sentiment_by_topic_chart(df_pr))

  st.header("Proportion of Non-Product Review Tweet with Positive Sentiment vs Topic")
  st.write(_sentiment_by_topic_chart(df_not_pr))



# Run Streamlit

In [None]:
from pyngrok import ngrok
import json

with open(TWITTER_YAML, 'r') as file:
  yamljson = file.read()

yamljson = yamljson.replace("\n", "")
yamljson = yamljson.replace("\t", "")
yamljson = json.loads(yamljson)

NGROK_AUTH_TOKEN = yamljson['ngrok']['NGROK_AUTH_TOKEN']
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
public_url = ngrok.connect(port=8501)
print(public_url)
!streamlit run --server.port 80 app.py

# Shut down ngrok

In [None]:
from pyngrok import ngrok

# Close every open tunnel first, then stop the ngrok process.
active_tunnels = ngrok.get_tunnels()
print(active_tunnels)
for tunnel in active_tunnels:
  public_url = tunnel.public_url
  ngrok.disconnect(public_url)

# kill() comes last and nothing is queried afterwards: asking pyngrok for the
# tunnel list again would start a fresh ngrok process.
ngrok.kill()