In [1]:
#!pip install gradio
# Basic Libraries
## make sure python version 3.8 and above
#!pip install snscrape
#!pip install -q -U "tensorflow-text==2.8.*"
#!pip install tensorflow-text

In [2]:
import snscrape.modules.twitter as sntwitter
import numpy as np
import pandas as pd
import os
import pickle
import gradio as gr
import regex as re
import string
from time import time

# preprocessing 
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from langdetect import detect
from langdetect import DetectorFactory

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# bert
import tensorflow as tf
import tensorflow_text as text

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Scraping
- Take search keywords and number of entries
- Return list of strings

In [3]:
# Scraper for the top games
def scraping_game(start, end, game, amount):
    """Collect the text of up to ``amount`` English tweets that match ``game``.

    Parameters
    ----------
    start, end : str
        Values interpolated into the ``since:`` and ``until:`` operators of
        the search query.
    game : str
        Search keyword(s); also stored in the ``Game`` column.
    amount : int or float
        Maximum number of tweets to keep.

    Returns
    -------
    pandas.Series
        The ``Text`` column of the collected tweets (empty if nothing was
        collected).

    Notes
    -----
    Scraping is best effort: if the scraper raises, the error is printed and
    whatever was collected up to that point is still returned.
    """
    query = '%s lang:en since:%s until:%s' % (game, start, end)

    # One row per tweet, in the column order used for the DataFrame below
    tweets_list = []

    try:
        # Using TwitterSearchScraper to scrape data and append tweets to list
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i > (amount - 1):
                break

            tweets_list.append([game, tweet.date, tweet.id, tweet.content,
                                tweet.retweetCount, tweet.likeCount, tweet.user.username])

    except Exception as exc:
        # Report the actual error (the previous code printed the Exception
        # class itself, which hid the cause of every failure).
        print("Scraping failed for %s: %r" % (game, exc))

    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tweets_list, columns=['Game', 'Datetime', 'TweetId', 'Text',
                                                   'RetweetCount', 'LikeCount', 'Username'])
    print("Finish Scraping %s for %s" % (len(tweets_df), game))

    return tweets_df['Text']

# Helpers
- Timer
- Sentiment Analyzer

In [4]:
def timer_func(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's arguments and return value are passed through
    unchanged. After every call that returns normally, one line of the form
    ``'<name>' executed in <seconds>s`` is printed.
    """
    import functools  # local import: keeps this helper cell self-contained

    # functools.wraps keeps the original __name__/__doc__ on the wrapper so
    # decorated functions still identify themselves correctly.
    @functools.wraps(func)
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'{func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [5]:
# Turn an average score into a coarse verbal label
def analyze_sentiment(sentiment, neg_hint:str, pos_hint:str):
    """Describe ``sentiment`` using one of four bands.

    Scores below 0.25 give ``"very <neg_hint>"``, below 0.5 give
    ``"<neg_hint>"``, below 0.75 give ``"<pos_hint>"`` and everything else
    gives ``"very <pos_hint>"``.
    """
    # (exclusive upper bound, label) pairs, checked from lowest to highest
    bands = (
        (0.25, f"very {neg_hint}"),
        (0.5, f"{neg_hint}"),
        (0.75, f"{pos_hint}"),
    )
    for upper_bound, label in bands:
        if sentiment < upper_bound:
            return label
    return f"very {pos_hint}"

# Model
- Reloading Model
- Preprocessing Inputs: stop after stopwords removal (no stemming/lemmatizing)
    - We observed during model training that stemmed/lemmatized text does not give the SVM models significantly better results, so here we use only the cleaned text.
- Give predictions

In [6]:
# reload model
# The two BERT models are TensorFlow SavedModel directories.
BERT_SUB = tf.saved_model.load('./bert_subjectivity_model')
BERT_POLAR = tf.saved_model.load('./bert_polarity_model')

# The two SVM models are pickled objects.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so only load .sav files that come from a trusted source.
# The files are opened with `with` so the handles are closed right after loading.
with open('svm_subjectivity_model.sav', 'rb') as model_file:
    SVM_SUB = pickle.load(model_file)
with open('svm_polarity_model.sav', 'rb') as model_file:
    SVM_POLAR = pickle.load(model_file)

Note on the security and maintainability limitations of pickled scikit-learn models: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
@timer_func
def bert_predict(cleaned_text:list):
    """Score ``cleaned_text`` with both BERT models.

    Each model's output is passed through a sigmoid, converted to a NumPy
    array, and reduced to its first column.

    Returns
    -------
    tuple
        ``(subjectivity_scores, polarity_scores)``.
        NOTE(review): assumes each model returns a 2-D (batch, 1) tensor of
        raw scores - TODO confirm against the saved models.
    """
    sub_raw = BERT_SUB(cleaned_text)
    polar_raw = BERT_POLAR(cleaned_text)

    sub_scores = np.array(tf.sigmoid(sub_raw))
    polar_scores = np.array(tf.sigmoid(polar_raw))

    # Keep only the first column of each score matrix
    return sub_scores[:, 0], polar_scores[:, 0]

@timer_func
def svm_predict(cleaned_text:list):
    """Score ``cleaned_text`` with both SVM models.

    The texts are turned into uni-/bi-gram counts (at most 500 features),
    re-weighted with TF-IDF, and passed to each model's ``predict``.

    Returns
    -------
    tuple
        ``(subjectivity_predictions, polarity_predictions)``.
    """
    # NOTE(review): the vectorizer and TF-IDF transformer are fitted on the
    # texts being scored, not loaded from training, so the feature columns
    # here presumably do not line up with the vocabulary the SVMs were
    # trained on - confirm, and persist/reload the fitted training
    # vectorizer if so.
    counter = CountVectorizer(ngram_range=(1,2), max_features=500)
    weigher = TfidfTransformer(use_idf=True)

    term_counts = counter.fit_transform(cleaned_text)
    features = weigher.fit_transform(term_counts)

    return SVM_SUB.predict(features), SVM_POLAR.predict(features)

In [8]:
# Fix langdetect's seed so repeated runs give the same language guesses
DetectorFactory.seed = 0

# NLTK resources: the stop-word corpus and the 'punkt' tokenizer data
nltk.download('stopwords')
nltk.download('punkt')

# Extra tokens (contraction fragments, punctuation runs, stray single letters)
# filtered out together with NLTK's English stop words
additional_stopwords = ["'s","...","'ve","``","''","'m",'--',"'ll","'d", 'u', 'b', 'c', 'd', 'x', 'xf', 'f', 'p', 'xb']
stop = set(stopwords.words('english')).union(additional_stopwords)

def language_detection(x:str):
    """Guess the language of ``x`` with langdetect.

    Detection first runs on the first 50 space-separated words. If that
    raises, it is retried once on every distinct word of the text.

    Returns
    -------
    str
        Whatever ``detect`` returns, or ``"unknown"`` when both attempts fail.
    """
    words = x.split(" ")

    try:
        # Slicing also covers texts shorter than 50 words
        return detect(" ".join(words[:50]))
    except Exception:
        # Retry on the distinct words of the whole text. dict.fromkeys
        # de-duplicates while keeping first-seen order, so the retry input is
        # the same on every run (a set's iteration order is not).
        try:
            return detect(" ".join(dict.fromkeys(words)))
        except Exception:
            return "unknown"


def clean_text(text):
    """Reduce a raw tweet to lower-case letters and spaces.

    Steps, in order:
    1. replace every token starting with ``http`` (i.e. URLs) with a space;
    2. drop a leading ``RT`` retweet marker;
    3. replace every run of characters other than ASCII letters and spaces
       (digits, punctuation, ``#``, ``@``, emoji, newlines, ...) with a space;
    4. lower-case the result.

    URLs must be removed before step 3: once ``:``, ``/`` and ``.`` have been
    replaced by spaces a URL is no longer one token, and its pieces would be
    left behind in the text.
    """
    text = str(text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)

    return text.lower()

# (pattern, replacement) pairs applied in this order; the specific forms must
# come before the generic "n't" / "'t" rules.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"don't", "do not"),
    (r"can't", "can not"),
    (r"n't", " not"),
    (r"'re", " are"),
    (r"it's", "it is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def decontracted(text):
    """Expand common English contractions and drop typographic quotes/ellipses.

    Matching ignores case, so ``"Won't"`` becomes ``"will not"`` rather than
    ``"Wo not"``; the replacement text is always lower case. Typographic
    apostrophes (’) are treated like ASCII apostrophes. The characters
    “ ” and … are removed.
    """
    # Treat the typographic apostrophe like the ASCII one so the rules match
    text = text.replace("\u2019", "'")

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Remove curly double quotes and the single-character ellipsis
    text = re.sub(r"[\u201c\u201d\u2026]", "", text)

    return text


def remove_punc(tweet):
    """Delete ASCII punctuation, collapse whitespace and lower-case ``tweet``.

    Every character in ``string.punctuation`` is removed; the remaining words
    are re-joined with single spaces.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    words = tweet.translate(strip_table).split()
    return ' '.join(words).lower()


def remove_stopwords(x):
    """Tokenize ``x`` and drop every token found in the ``stop`` set.

    Returns the kept tokens as one string in which each token is followed by
    a single space, so a non-empty result ends with a trailing space.
    """
    kept_tokens = [token for token in word_tokenize(x) if token not in stop]
    return ''.join(token + ' ' for token in kept_tokens)


def pre_process(tweet):
    """Run the full cleaning pipeline on one tweet.

    Returns ``None`` when the tweet is not detected as English (no prediction
    can be given for it); otherwise returns the text after cleaning,
    contraction expansion, punctuation removal and stop-word removal.
    """
    detected_as_english = language_detection(tweet) == 'en'
    if not detected_as_english:
        return None         # suggesting not english language and cannot give predictions

    cleaned = clean_text(tweet)
    # NOTE(review): clean_text has already replaced every non-letter with a
    # space, so decontracted has no apostrophes or typographic quotes left to
    # match at this point - confirm whether this order is intended.
    expanded = decontracted(cleaned)
    without_punctuation = remove_punc(expanded)
    return remove_stopwords(without_punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Main Function

In [30]:
# Main NLP program
def nlp(game_title, scrap_no, activate_scrape, algorithm_choice):
    """Scrape tweets about a game, score them and build the Gradio outputs.

    Parameters
    ----------
    game_title : str
        Search keyword passed to ``scraping_game``.
    scrap_no : int or float
        Maximum number of tweets to scrape.
    activate_scrape : bool
        When true, the per-tweet results table is included in the outputs.
    algorithm_choice : list of str or None
        Models to run; ``'Bert'`` and/or ``'SVM'``.

    Returns
    -------
    dict
        Maps the module-level Gradio components ``bert_sentiment``,
        ``bert_polarity``, ``svm_sentiment``, ``svm_polarity`` (and
        ``scraped_tweets`` when ``activate_scrape`` is true) to their new
        values.
    """
    table_columns = ['Cleaned Text', 'Bert Subjectivity', 'Bert Polarity',
                     'SVM Subjectivity', 'SVM Polarity']
    # Nothing ticked may arrive as None or as an empty list
    selected = algorithm_choice or []

    # Scrape twitter for the game title
    scrapped_tweets_raw = scraping_game("2022-01-01", "2022-11-04", game_title, scrap_no)

    # Preprocess, then drop non-English tweets (None) and texts with fewer than two words
    print("Preprocessing texts now...")
    processed = [pre_process(tweet) for tweet in scrapped_tweets_raw]
    cleaned_text = [tweet for tweet in processed if tweet and len(tweet.split()) > 1]

    if not cleaned_text:
        # Nothing to score: report that instead of calling the models on an empty list
        message = f"No usable English tweets found for {game_title}"
        outputs = {bert_sentiment: message,
                   bert_polarity: message,
                   svm_sentiment: message,
                   svm_polarity: message}
        if activate_scrape:
            outputs[scraped_tweets] = pd.DataFrame(columns=table_columns)
        return outputs

    # Placeholder column for a model that was not run, so the results table
    # can always be built with all five columns.
    not_run = [None] * len(cleaned_text)

    if 'Bert' in selected:
        print("Running Bert now...")
        result_bert_sub, result_bert_polar = bert_predict(cleaned_text) # return individual results
        bert_sub_avg = np.mean(result_bert_sub)
        bert_polar_avg = np.mean(result_bert_polar)
        bert_sentiment_result = f"average score = {bert_sub_avg}, {analyze_sentiment(bert_sub_avg, 'neutral', 'subjective')}"
        bert_polarity_result = f"average score = {bert_polar_avg}, {analyze_sentiment(bert_polar_avg, 'negative', 'positive')}"
    else:
        result_bert_sub = result_bert_polar = not_run
        # Chained assignment: unpacking the string itself into two names raises ValueError
        bert_sentiment_result = bert_polarity_result = "Bert model not being run"

    # Repeat for SVM model
    if 'SVM' in selected:
        print("Running SVM now...")
        try:
            result_svm_sub, result_svm_polar = svm_predict(cleaned_text)
        except ValueError as exc:
            # NOTE(review): presumably raised when the scraped texts yield a
            # different number of features than the SVMs expect - confirm.
            result_svm_sub = result_svm_polar = not_run
            SVM_sentiment_result = SVM_polarity_result = f"SVM model failed: {exc}"
        else:
            SVM_sub_avg = np.mean(result_svm_sub)
            SVM_polar_avg = np.mean(result_svm_polar)
            SVM_sentiment_result = f"average score = {SVM_sub_avg}, {analyze_sentiment(SVM_sub_avg, 'neutral', 'subjective')}"
            SVM_polarity_result = f"average score = {SVM_polar_avg}, {analyze_sentiment(SVM_polar_avg, 'negative', 'positive')}"
    else:
        result_svm_sub = result_svm_polar = not_run
        SVM_sentiment_result = SVM_polarity_result = "SVM model not being run"

    print("Returning message")
    outputs = {bert_sentiment: game_title + " " + bert_sentiment_result,
               bert_polarity: bert_polarity_result,
               svm_sentiment: SVM_sentiment_result,
               svm_polarity: SVM_polarity_result}

    # Only send the per-tweet table when the "show scraped data" box is ticked
    if activate_scrape:
        outputs[scraped_tweets] = pd.DataFrame({'Cleaned Text': cleaned_text,
                                                'Bert Subjectivity': result_bert_sub,
                                                'Bert Polarity': result_bert_polar,
                                                'SVM Subjectivity': result_svm_sub,
                                                'SVM Polarity': result_svm_polar})
    return outputs

##EDIT TO INCLUDE EXCEPTION

# Gradio Frontend code

In [31]:
with gr.Blocks() as demo:
    with gr.Row():
        # First column: user inputs
        with gr.Column(scale=1):
            game_title = gr.Textbox(label = "Game Title")

            # Amount of tweets to scrape
            scrap_no = gr.Slider(200,1000, label = "Amount of tweets to scrape") #SVM requires more than 500 features

            # Choose to display scraped text
            activate_scrape = gr.Checkbox(label = "Show scraped data?")

            # Choice of algorithm. No trailing comma here: a trailing comma
            # would turn this name into a one-element tuple, not the component.
            algorithm_choice = gr.CheckboxGroup(choices = ["Bert", "SVM"])

            submit_button = gr.Button("Submit")

        # Second column displays all model results
        with gr.Column(scale=4):
            bert_sentiment = gr.Textbox(label = "Bert Sentiment")
            bert_polarity = gr.Textbox(label = "Bert Polarity")
            svm_sentiment = gr.Textbox(label = "SVM Sentiment")
            svm_polarity = gr.Textbox(label = "SVM Polarity")

    with gr.Row():
        # Displays scraped tweets if option is selected
        scraped_tweets = gr.DataFrame(label = "Scraped Data", headers=['Cleaned Text',
                                               'Bert Subjectivity',
                                               'Bert Polarity',
                                               'SVM Subjectivity',
                                               'SVM Polarity'])
    # Button to run nlp function; the inputs are passed to nlp in this order
    submit_button.click(nlp,
                        inputs=[game_title, scrap_no, activate_scrape, algorithm_choice],
                        outputs=[bert_sentiment,
                                 bert_polarity,
                                 svm_sentiment,
                                 svm_polarity,
                                 scraped_tweets]
                       )

demo.launch()


Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB
Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x22416d5e820>, 'http://127.0.0.1:7872/', None)

Finish Scraping 200 for pubg
Preprocessing texts now...
Running Bert now...
'bert_predict' executed in 15.4881s
Running SVM now...
'svm_predict' executed in 0.3030s
Returning message
Finish Scraping 200 for pubg
Preprocessing texts now...
Running Bert now...
'bert_predict' executed in 13.5410s
Running SVM now...
'svm_predict' executed in 0.2000s
Returning message
