In [None]:
!pip install bertopic
!pip install bertopic[visualization]
from bertopic import BERTopic
# Import Model
import joblib
model = joblib.load("model_bert_new.pkl")

In [None]:
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import AutoConfig
from bertopic import BERTopic
import joblib
from tqdm import tqdm
# Hook tqdm into pandas: get_sentiment and get_topics call
# Series.progress_apply, which is only available after this call has run.
tqdm.pandas()

# Pipelines that have already been built, keyed by fine-tuned model name, so
# the tokenizer and weights are loaded once instead of on every prediction.
_SENTIMENT_CLASSIFIERS = {}


def _get_sentiment_classifier(finetuned_model="potatobunny/results-yelp"):
    '''
    Build the sentiment pipeline on first use and reuse it afterwards

    Parameters
    ----------
    finetuned_model: name of the fine-tuned model in the huggingface repo

    Results
    -------
    classifier: the "sentiment-analysis" pipeline for that model
    '''
    classifier = _SENTIMENT_CLASSIFIERS.get(finetuned_model)
    if classifier is None:
        # NOTE(review): padding/truncation are passed at tokenizer load time,
        # exactly as before; confirm they actually take effect for long
        # reviews at prediction time.
        tokenizer = AutoTokenizer.from_pretrained(
            'bert-base-uncased',
            config=AutoConfig.from_pretrained(finetuned_model),
            padding=True,
            truncation=True,
        )
        model = AutoModelForSequenceClassification.from_pretrained(finetuned_model)
        classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
        _SENTIMENT_CLASSIFIERS[finetuned_model] = classifier
    return classifier


def sentiment_analysis(sentence):
    '''
    Predict sentiment for each sentence

    The model is fine tuned on the yelp dataset. It is loaded only on the
    first call; later calls reuse the same pipeline, so calling this function
    once per review no longer reloads the model each time.

    Parameters
    ----------
    sentence: sentence or list of sentences to predict, passed unchanged to
              the pipeline

    Results
    -------
    results: list of dictionaries, result from the sentiment analysis model
             (callers in this notebook read the 'label' and 'score' keys)
    '''
    classifier = _get_sentiment_classifier()
    return classifier(sentence)

def get_sentiment(df):
    '''
    Build a DataFrame of unique reviews with their predicted sentiment

    Parameters
    ----------
    df: DataFrame result from run_scrape; only its 'cleaned' column is read

    Results
    -------
    final_df: DataFrame with 3 columns,
              Review: the distinct, non-null values of df['cleaned'],
              Sentiment: list of 'Positive'/'Negative' labels for that review,
              Sentiment Score: list of the matching model scores
    '''
    label_names = {'LABEL_0': 'Negative', 'LABEL_1': 'Positive'}

    def _scores(predictions):
        # one score per prediction dictionary
        return [prediction['score'] for prediction in predictions]

    def _labels(predictions):
        # translate the raw model label into 'Positive' / 'Negative'
        return [label_names[prediction['label']] for prediction in predictions]

    unique_reviews = df['cleaned'].drop_duplicates().dropna().to_list()

    # one sentiment_analysis call per review, with a progress bar
    raw_predictions = [sentiment_analysis(review) for review in tqdm(unique_reviews)]

    final_df = pd.DataFrame(pd.Series(unique_reviews), columns=['Review'])
    final_df['Sentiment'] = pd.Series(raw_predictions)

    # scores must be extracted before 'Sentiment' is overwritten with labels
    final_df['Sentiment Score'] = final_df['Sentiment'].progress_apply(_scores)
    final_df['Sentiment'] = final_df['Sentiment'].progress_apply(_labels)
    return final_df


def predict_topic(sentences, model):
    '''
    Name the best-matching topic for each sentence

    Parameters
    ----------
    sentences: iterable of sentences
    model: pre-trained model exposing find_topics(text, top_n=...)

    Results
    -------
    lst_topics: list of topic names, one per sentence, in the same order;
                an empty string when the topic id has no name below
    '''
    # NOTE(review): the topic evaluation cell names ids 4 and 5
    # "Investment Value" instead of "Financial Value" - confirm which is intended.
    topic_names = {
        0: "Location",
        1: "Environment",
        2: "Service and Management",
        3: "Facilities",
        6: "Facilities",
        4: "Financial Value",
        5: "Financial Value",
    }
    # element [0][0] of the find_topics result is used as the topic id
    return [
        topic_names.get(model.find_topics(text, top_n=1)[0][0], '')
        for text in sentences
    ]

def get_topics(df, model):
    '''
    Add a topics column to the sentiment DataFrame

    Parameters
    ----------
    df: DataFrame result from get_sentiment; its 'Review' column is read and
        the frame itself is modified
    model: pre-trained topic model handed on to predict_topic

    Results
    -------
    df: the same DataFrame with 1 added column, 'Topics', holding the result
        of predict_topic for each review
    '''
    def _topics_for(review):
        return predict_topic(review, model)

    df['Topics'] = df['Review'].progress_apply(_topics_for)
    return df

def run_analysis(df):
    '''
    Run sentiment analysis and then topic prediction on scraped reviews

    The topic model is not a parameter: the module-level `model` is used.

    Parameters
    ----------
    df: DataFrame from run_scrape

    Results
    -------
    final_result: DataFrame from get_sentiment (Review, Sentiment,
                  Sentiment Score) with the Topics column added by get_topics
    '''
    sentiment_df = get_sentiment(df)
    print('\nSentiment Analysis - done')

    topics_df = get_topics(sentiment_df, model)
    print('\nTopic modelling - done')

    return topics_df

In [None]:
import pandas as pd

# Load the manually labelled reviews and keep only rows whose Sentiment is
# non-zero: the evaluation below can only score labels that are < 0 or > 0.
manual = pd.read_csv(
    "/content/Manual_Review_IS4242 - short_manual_review.csv"
).loc[lambda reviews: reviews["Sentiment"] != 0]

In [None]:
# Evaluate Sentiment
# Compare the model's label with the sign of the manual "Sentiment" score:
# a negative score should be predicted LABEL_0, a positive score LABEL_1.
# At most the first 500 rows are scored; fewer if the filtered sheet is
# shorter, so iloc never runs past the end of the frame.
sentimentRows = min(500, len(manual))
sentimentTrue = 0  # rows where prediction and manual label agree
sentimentErr = 0   # rows where the classifier raised
for i in range(sentimentRows):
  sent = manual.iloc[i]["Reviews"]
  score = manual.iloc[i]["Sentiment"]
  try:
    result = sentiment_analysis(sent)
  except Exception as err:
    # Best effort: count the failure and move on, but say what went wrong.
    # Catching Exception (not a bare except) keeps Ctrl-C working.
    sentimentErr += 1
    print("Row", i, "sentiment prediction failed:", repr(err))
    continue
  if (score < 0 and (result[0]['label'] == 'LABEL_0')):
    sentimentTrue += 1
  elif (score > 0 and (result[0]['label'] == 'LABEL_1')):
    sentimentTrue += 1
print(sentimentTrue)
print(sentimentErr)
# Accuracy over the rows that were actually predicted
sentimentScored = sentimentRows - sentimentErr
if sentimentScored > 0:
  print(sentimentTrue / sentimentScored)
else:
  print("No rows could be scored")

In [None]:
# Evaluate Topic
# Compare the model's top topic with the manual "Topics" label. Rows labelled
# "Other" are skipped and excluded from the accuracy denominator.
# NOTE(review): predict_topic names ids 4 and 5 "Financial Value" while this
# cell uses "Investment Value" - confirm which one matches the manual labels.
topicNames = {
  0: "Location",
  1: "Environment",
  2: "Service and Management",
  3: "Facilities",
  6: "Facilities",
  4: "Investment Value",
  5: "Investment Value",
}
# At most the first 500 rows are scored; fewer if the filtered sheet is
# shorter, so iloc never runs past the end of the frame.
topicRows = min(500, len(manual))
topicTrue = 0  # rows where predicted and manual topic agree
neutral = 0    # rows labelled "Other"
errCount = 0   # rows where the topic model raised
for i in range(topicRows):
  row = manual.iloc[i]
  sentence = row["Reviews"]
  topic = row["Topics"]
  if (topic == "Other"):
    neutral += 1
    continue
  try:
    idx = model.find_topics(sentence, top_n = 1)[0][0]
  except Exception as err:
    # Best effort: count the failure and move on, but say what went wrong.
    # Catching Exception (not a bare except) keeps Ctrl-C working.
    errCount += 1
    print("Row", i, "topic prediction failed:", repr(err))
    continue
  # ids without a name map to "" and therefore never match a manual label
  predTopic = topicNames.get(idx, "")
  if (predTopic == topic):
    topicTrue += 1
print(topicTrue)
# Accuracy over rows that had a real topic label and were actually predicted
topicScored = topicRows - errCount - neutral
if topicScored > 0:
  print("Topic", topicTrue/topicScored)
else:
  print("Topic", "no rows could be scored")