In [10]:
#!pip install gradio
# Basic Libraries
## make sure python version 3.8 and above
#!pip install snscrape
#!pip install -q -U "tensorflow-text==2.8.*"
#!pip install tensorflow-text

In [11]:
import snscrape.modules.twitter as sntwitter
import numpy as np
import pandas as pd
import os
import pickle
import gradio as gr
import regex as re
import string

# preprocessing 
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from langdetect import detect
from langdetect import DetectorFactory

# bert
import tensorflow as tf
import tensorflow_text as text

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [12]:
# verify that model is the same as what we trained

# Scraping
- Take search keywords and number of entries
- Return list of strings

In [13]:
# Scraper for the top games
def scraping_game(start, end, game, amount):
    """Scrape English tweets matching ``game`` and return their text.

    Parameters
    ----------
    start, end : str
        Values inserted into the query's ``since:`` and ``until:`` operators.
    game : str
        Search keywords; also stored in the ``Game`` column.
    amount : int or float
        Maximum number of tweets to collect.

    Returns
    -------
    pandas.Series
        The ``Text`` column of the collected tweets. It is empty when nothing
        was collected, including when the scraper fails before yielding a tweet.
    """
    columns = ['Game', 'Datetime', 'TweetId', 'Text', 'RetweetCount', 'LikeCount', 'Username']
    query = '%s lang:en since:%s until:%s' % (game, start, end)

    # Rows of tweet data, one list per tweet
    tweets_list = []

    try:
        # Using TwitterSearchScraper to scrape data and append tweets to list
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i > (amount - 1):
                break

            tweets_list.append([game, tweet.date, tweet.id, tweet.content,
                                tweet.retweetCount, tweet.likeCount, tweet.user.username])

    except Exception as exc:
        # Best effort: keep whatever was collected so far, but report the
        # actual error instead of the bare Exception class.
        print("Scraping failed for %s: %r" % (game, exc))

    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tweets_list, columns=columns)
    print("Finish Scraping %s for %s" % (len(tweets_df), game))

    return tweets_df['Text']

# Model
- Reloading Model
- Preprocessing Inputs: stop after stopwords removal (no stemming/lemmatizing)
    - We observed during model training that stemmed/lemmatized text does not give the SVM models significantly better results, so here we just use the cleaned text.
- Give predictions

In [14]:
# Locations of the saved models on disk
SVM_SUB_FILE = 'svm_subjectivity_model.sav'
SVM_POLAR_FILE = 'svm_polarity_model.sav'
BERT_SUB_PATH = './bert_subjectivity_model'
BERT_POLAR_PATH = './bert_polarity_model'

# Module-level handles intended to hold the loaded models; all start as None
BERT_SUB = BERT_POLAR = SVM_SUB = SVM_POLAR = None

def reload_model():
    """Load the saved BERT and SVM models into the module-level handles.

    Rebinds the globals ``BERT_SUB``, ``BERT_POLAR``, ``SVM_SUB`` and
    ``SVM_POLAR``. Without the ``global`` declaration the assignments would
    only create locals and the module-level names would stay ``None``.

    Raises whatever ``tf.saved_model.load``, ``open`` or ``pickle.load`` raise
    when a model path is missing or unreadable.
    """
    global BERT_SUB, BERT_POLAR, SVM_SUB, SVM_POLAR

    BERT_SUB = tf.saved_model.load(BERT_SUB_PATH)
    BERT_POLAR = tf.saved_model.load(BERT_POLAR_PATH)

    # NOTE(security): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    # The context managers close the files even if unpickling fails.
    with open(SVM_SUB_FILE, 'rb') as model_file:
        SVM_SUB = pickle.load(model_file)
    with open(SVM_POLAR_FILE, 'rb') as model_file:
        SVM_POLAR = pickle.load(model_file)

In [16]:
# Pin the langdetect seed (set before any detect() call)
DetectorFactory.seed = 0

# Fetch the NLTK stopword list and the punkt tokenizer data
nltk.download('stopwords')
nltk.download('punkt')

# Extra tokens to discard in addition to NLTK's English stopwords
additional_stopwords = ["'s","...","'ve","``","''","'m",'--',"'ll","'d", 'u', 'b', 'c', 'd', 'x', 'xf', 'f', 'p', 'xb']
stop = set(stopwords.words('english') + additional_stopwords)

def language_detection(x:str):
    """Guess the language code of ``x`` using ``detect``.

    At most the first 50 space-separated words are used for the first
    attempt. If that raises, detection is retried on the set of distinct
    words; if the retry raises as well, ``"unknown"`` is returned.
    """
    words = x.split(" ")

    try:
        # Slicing to 50 covers both the long and the short input case
        return detect(" ".join(words[:50]))
    except Exception:
        pass

    try:
        return detect(" ".join(set(words)))
    except Exception:
        return "unknown"


def clean_text(text):
    """Normalise a raw tweet to lowercase letters and spaces.

    Steps, in order:
    1. Coerce the input to ``str``.
    2. Replace URLs (``http...`` up to the next whitespace) with a space.
    3. Drop a leading ``RT`` retweet marker.
    4. Replace every run of characters other than ASCII letters and spaces
       with a single space (this also covers ``#``, digits and newlines).
    5. Lowercase the result.

    Returns the cleaned string; whitespace is not collapsed here.
    """
    text = str(text)
    # URLs must be removed before punctuation is stripped: once ':', '/' and
    # '.' are gone the URL pattern can no longer match and fragments such as
    # "t co abc" would be left in the text.
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)
    text = text.lower()

    return text

def decontracted(text):
    """Expand common English contractions and drop curly quotes / ellipses.

    The substitutions are applied in the listed order, which matters:
    the specific forms (``won't``, ``don't``, ``can't``, ``it's``) must be
    handled before the generic suffix rules.
    """
    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r"won\'t", "will not"),
        (r"don't", "do not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"it\'s", "it is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        (r"\“", ""),
        (r"\”", ""),
        (r"\…", ""),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text


def remove_punc(tweet):
    """Delete ASCII punctuation, collapse whitespace and lowercase ``tweet``."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = tweet.translate(punctuation_table)

    # split()/join squeezes runs of whitespace and trims both ends
    return ' '.join(without_punctuation.split()).lower()


def remove_stopwords(x):
    """Tokenize ``x`` and return the tokens not in ``stop``.

    Each kept token is followed by a single space, so a non-empty result
    ends with a trailing space.
    """
    kept_tokens = [token for token in word_tokenize(x) if token not in stop]
    return ''.join(token + ' ' for token in kept_tokens)


def pre_process(tweet):
    """Run the cleaning pipeline on one tweet.

    Returns ``None`` when the tweet is not detected as English (no prediction
    can be given for it); otherwise returns the tweet after cleaning,
    contraction expansion, punctuation removal and stopword removal.
    """
    if language_detection(tweet) == 'en':
        cleaned = clean_text(tweet)
        # NOTE(review): clean_text replaces every non-letter, apostrophes
        # included, so the contraction patterns in decontracted look like
        # they can never match at this point. Confirm the intended order
        # against the preprocessing used for training.
        expanded = decontracted(cleaned)
        unpunctuated = remove_punc(expanded)
        return remove_stopwords(unpunctuated)

    # suggesting not english language and cannot give predictions
    return None

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ytchen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ytchen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
def bert_predict(cleaned_text:list):
    """Run both BERT models on ``cleaned_text``.

    Parameters
    ----------
    cleaned_text : list
        Pre-processed tweet strings.

    Returns
    -------
    tuple
        ``(subjectivity_result, polarity_result)``.

    Raises
    ------
    RuntimeError
        If the BERT models have not been loaded yet.
    """
    def _run(model):
        # One model, one batch: prefer .predict when the object offers it.
        if model is None:
            raise RuntimeError("BERT models are not loaded; run init() first")
        if hasattr(model, 'predict'):
            return model.predict(cleaned_text)
        # Objects returned by tf.saved_model.load are not Keras models and
        # expose no .predict; they are invoked directly on a tensor instead.
        # NOTE(review): if the saved models emit raw logits, an activation
        # (e.g. sigmoid) may still be needed. Confirm against training code.
        return np.asarray(model(tf.constant(cleaned_text)))

    result_bert_sub = _run(BERT_SUB)
    result_bert_polar = _run(BERT_POLAR)
    return result_bert_sub, result_bert_polar

def svm_predict(cleaned_text:list):
    """Return ``(subjectivity, polarity)`` predictions from the two SVM models."""
    subjectivity = SVM_SUB.predict(cleaned_text)
    polarity = SVM_POLAR.predict(cleaned_text)
    return subjectivity, polarity

In [None]:
#function for analyzing overall sentiment
def analyze_sentiment(sentiment):
    """Map a numeric sentiment score to a four-level label.

    Scores below 0.25 are "very negative", below 0.5 "negative",
    below 0.75 "positive", and everything else "very positive".
    """
    if sentiment < 0.25:
        return "very negative"
    if sentiment < 0.5:
        return "negative"
    if sentiment < 0.75:
        return "positive"
    return "very positive"

# Main Function

In [20]:
def init():
    """One-time setup entry point; currently it only calls ``reload_model``."""
    reload_model()

In [None]:
# Main NLP program
def nlp(game_title, scrap_no, activate_scrape, algorithm_choice):
    """Scrape tweets about a game and summarise the selected models' output.

    Parameters
    ----------
    game_title : str
        Search keywords passed to ``scraping_game``.
    scrap_no : int or float
        Maximum number of tweets to scrape.
    activate_scrape : bool
        When true, the scraped tweets are included in the returned update.
    algorithm_choice : list of str or None
        Selected model names; ``'Bert'`` and ``'SVM'`` are recognised.

    Returns
    -------
    dict
        Maps the module-level output components (``bert_sentiment``,
        ``bert_polarity``, ``svm_sentiment``, ``svm_polarity`` and, only when
        ``activate_scrape`` is set, ``scraped_tweets``) to display strings.
    """
    scrapped_tweets_raw = scraping_game("2022-01-01", "2022-11-04", game_title, scrap_no)

    # pre_process returns None for tweets not detected as English. Drop those,
    # and keep only texts that still have more than one token after cleaning.
    preprocessed = (pre_process(tweet) for tweet in scrapped_tweets_raw)
    cleaned_text = [tweet for tweet in preprocessed
                    if tweet is not None and len(tweet.split()) > 1]

    # Treat "nothing selected" as an empty selection
    algorithm_choice = algorithm_choice or []
    no_tweets_message = "no usable English tweets to analyse"

    if 'Bert' not in algorithm_choice:
        bert_sentiment_result = bert_polarity_result = "Bert model not being run"
    elif not cleaned_text:
        # Avoid predicting on, and averaging over, an empty batch
        bert_sentiment_result = bert_polarity_result = "BERT: " + no_tweets_message
    else:
        result_bert_sub, result_bert_polar = bert_predict(cleaned_text)  # individual results
        bert_sentiment_result = f"BERT returned {len(result_bert_sub)} results, average score = {np.mean(result_bert_sub)}"
        bert_polarity_result = f"BERT model returned {len(result_bert_polar)} results, average score = {np.mean(result_bert_polar)}"

    if 'SVM' not in algorithm_choice:
        SVM_sentiment_result = SVM_polarity_result = "SVM model not being run"
    elif not cleaned_text:
        SVM_sentiment_result = SVM_polarity_result = "SVM: " + no_tweets_message
    else:
        result_svm_sub, result_svm_polar = svm_predict(cleaned_text)
        SVM_sentiment_result = f"SVM returned {len(result_svm_sub)} results, average score = {np.mean(result_svm_sub)}"
        SVM_polarity_result = f"SVM model returned {len(result_svm_polar)} results, average score = {np.mean(result_svm_polar)}"

    outputs = {bert_sentiment : game_title + " " + bert_sentiment_result,
               bert_polarity : bert_polarity_result,
               svm_sentiment : SVM_sentiment_result,
               svm_polarity : SVM_polarity_result}

    # Only touch the scraped-data box when the checkbox is ticked
    if activate_scrape:
        # One tweet per line rather than the abbreviated pandas repr
        outputs[scraped_tweets] = "\n".join(str(tweet) for tweet in scrapped_tweets_raw)

    return outputs

##EDIT TO INCLUDE EXCEPTION

# Gradio Frontend code

In [None]:
# Two-column layout: inputs and scraped data on the left, model results on the right.
# NOTE(review): nothing in this cell calls init(); the models must be loaded
# before Submit is clicked.
with gr.Blocks() as demo:
    with gr.Row():
        #First Column
        with gr.Column(scale=1):
            game_title = gr.Textbox(label = "Game Title")
            
            #amount of tweets to scrape
            scrap_no = gr.Slider(0,1000, label = "Amount of tweets to scrape")
            
            #Choose to display scraped text
            activate_scrape = gr.Checkbox(label = "Show scraped data?")
            
            #Choice of algorithm
            # No trailing comma here: it would turn the component into a 1-tuple
            algorithm_choice = gr.CheckboxGroup(choices = ["Bert", "SVM"])
            
            submit_button = gr.Button("Submit")
            
            #Displays scrapped tweets if option is selected
            scraped_tweets = gr.Textbox(label = "Scraped Data")
            
        #Second Column displays all model results
        with gr.Column(scale=4):
            bert_sentiment = gr.Textbox(label = "Bert Sentiment")
            bert_polarity = gr.Textbox(label = "Bert Polarity")
            svm_sentiment = gr.Textbox(label = "SVM Sentiment")
            svm_polarity = gr.Textbox(label = "SVM Polarity")
    
    #Button to run nlp function
    submit_button.click(nlp, 
                        inputs=[game_title,scrap_no,activate_scrape,algorithm_choice], 
                        outputs=[bert_sentiment,
                                bert_polarity,
                                svm_sentiment,
                                svm_polarity, 
                                scraped_tweets]
                       )
    

demo.launch()