In [1]:
# !pip install bertopic
# !pip install bertopic[visualization]
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm
2023-04-12 00:08:39.091204: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the saved topic model from disk. Later cells call
# `model.find_topics(...)` and `model.visualize_barchart()` on this object.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code;
# only load a .pkl that comes from a trusted source.
import joblib
model = joblib.load("model_bert_new.pkl")

In [3]:
# Visualize the loaded topic model as a bar chart. This is the last expression
# in the cell, so its return value is what the notebook renders.
model.visualize_barchart()

In [4]:
# !pip install requests_html
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
from bs4 import BeautifulSoup

import nltk
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('omw-1.4')

from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

from string import punctuation
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

# English stop-word list. NOTE(review): process_text builds its own stop-word
# set and filter_text uses a local list of the same name, so this module-level
# value does not appear to be read in the visible cells.
stop_words = stopwords.words('english')
# Shared lemmatizer used by process_text.
lemmatizer = WordNetLemmatizer()

from nltk.sentiment import SentimentIntensityAnalyzer
# NOTE(review): `sia` is not referenced in the visible cells; sentiment is
# predicted by sentiment_analysis() instead. Confirm before removing.
sia = SentimentIntensityAnalyzer()

def get_source(url):
    '''
    Fetch a page through a fresh HTMLSession

    Parameters
    ----------
    url: address of the page to request

    Results
    -------
    response: the response returned by the session, or None when a
              requests.exceptions.RequestException was raised (the error is
              printed instead of propagated)
    '''
    try:
        return HTMLSession().get(url)
    except requests.exceptions.RequestException as err:
        print(err)
        return None

def scrape(q):
    '''
    Search Google for a query and return the result links worth scraping

    Parameters
    ----------
    q: search query string

    Results
    -------
    links: list of absolute URLs found on the results page, without any URL
           that contains one of the excluded domain fragments; empty list when
           the results page could not be fetched
    '''
    # Imported locally so the function does not depend on some other package
    # having loaded the `urllib.parse` submodule after a bare `import urllib`.
    from urllib.parse import quote_plus

    result = get_source("https://www.google.com/search?q=" + quote_plus(q))
    if result is None:
        # get_source prints the request error and returns None on failure.
        return []

    exclude_domains = ('google', 'https://google.', 'https://webcache.googleusercontent.', 'http://webcache.googleusercontent.', 'https://policies.google.',
                       'https://support.google.','https://maps.google.','https://www.instagram.','https://www.youtube.', 'facebook', 'tripadvisor')
    # Keep a link only if none of the excluded fragments occurs in it.
    return [
        url for url in result.html.absolute_links
        if not any(domain in url for domain in exclude_domains)
    ]

def filter_text(txt):
    '''
    Decide whether a scraped paragraph is worth keeping

    Parameters
    ----------
    txt: paragraph text (may be None or empty)

    Results
    -------
    bool: False when txt is empty, shorter than 30 characters, or contains any
          of the boilerplate markers (case-insensitive substring match);
          True otherwise
    '''
    markers = ["inbox","©",":","=","@", "copyright", "cookies","..","\xa0","min","redirecting…","seconds…", "#", '()', "captcha",'redirect','anti-virus','malware','JavaScript','developer','technology','subscribe','learn more…','support us', 'articles', 'article', 'content', 'blog', '.com']
    if not txt or len(txt) < 30:
        return False
    lowered = txt.lower()
    # Lower-case the marker too: a mixed-case marker such as 'JavaScript' can
    # never be found inside already lower-cased text.
    return not any(marker.lower() in lowered for marker in markers)

def get_text(links, timeout=10):
    '''
    Download each page and collect its usable paragraph text

    Parameters
    ----------
    links: iterable of page URLs
    timeout: seconds to wait for each HTTP response before giving up on that
             page (without it a stalled server blocks the whole run)

    Results
    -------
    df: DataFrame with one column, 'Content', holding one kept paragraph per row
    '''
    result = []
    for url in links:
        try:
            page = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(page.content, "html.parser")
            # A set drops paragraphs repeated within the same page.
            paragraphs = {tag.get_text() for tag in soup.find_all('p')}
            kept = [text for text in paragraphs if filter_text(text)]
            result.extend(kept)
        except Exception as err:
            # Best-effort scraping: report the page and move on. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print(f"Skipping {url}: {err}")
            continue
    df = pd.DataFrame(result, columns = ['Content'])
    return df

def process_text(text):
    '''
    Normalise raw text into a space-separated string of lemmatized words

    Parameters
    ----------
    text: raw text (a sentence or a whole review)

    Results
    -------
    result: lower-cased, punctuation-free, alphabetic-only words with English
            stop words removed (except 'not'), lemmatized and joined by spaces
    '''
    # flatten line breaks and lower-case before tokenizing
    normalised = text.replace('\n', ' ').lower()
    tokens = word_tokenize(normalised)
    strip_punctuation = str.maketrans('', '', punctuation)
    # 'not' is kept even though it is an English stop word
    ignored = set(stopwords.words('english'))
    ignored.remove('not')

    lemmas = []
    for raw_token in tokens:
        candidate = raw_token.translate(strip_punctuation)
        # drop anything that is not purely alphabetic, then the stop words
        if candidate.isalpha() and candidate not in ignored:
            lemmas.append(lemmatizer.lemmatize(candidate))
    return " ".join(lemmas)

def clean_text(df):
    '''
    Add sentence-level and full-text cleaned columns to the scraped frame

    Parameters
    ----------
    df: DataFrame with a 'Content' column of raw text

    Results
    -------
    df: the same DataFrame (modified in place) with 'review_splitted'
        (sentences of each text), 'cleaned' (each sentence passed through
        process_text) and 'cleaned_text' (the whole text passed through
        process_text)
    '''
    def _clean_sentences(sentences):
        return [process_text(sentence) for sentence in sentences]

    raw_content = df['Content']
    df['review_splitted'] = raw_content.apply(sent_tokenize)
    df['cleaned'] = df['review_splitted'].apply(_clean_sentences)
    # same cleaning applied to the text as a whole
    df['cleaned_text'] = raw_content.apply(process_text)
    return df

def run(name):
    '''
    Scrape and clean review text for one condominium

    Parameters
    ----------
    name: condominium name; " condominium review" is appended to form the query

    Results
    -------
    df: DataFrame returned by clean_text for the pages found by scrape
    '''
    query = name + " condominium review"
    return clean_text(get_text(scrape(query)))

In [5]:
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import AutoConfig
from bertopic import BERTopic
import joblib
from tqdm import tqdm
tqdm.pandas()

# Classifier built on first use and then shared by every sentiment_analysis call.
_SENTIMENT_CLASSIFIER = None

def _get_sentiment_classifier():
    '''
    Build the sentiment pipeline once and reuse it on later calls
    '''
    global _SENTIMENT_CLASSIFIER
    if _SENTIMENT_CLASSIFIER is None:
        # fine tuned model from yelp dataset
        finetuned_model = "potatobunny/results-yelp"  # from huggingface repo
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', config=AutoConfig.from_pretrained(finetuned_model))
        sentiment_model = AutoModelForSequenceClassification.from_pretrained(finetuned_model)
        _SENTIMENT_CLASSIFIER = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=tokenizer)
    return _SENTIMENT_CLASSIFIER

def sentiment_analysis(sentence):
    '''
    Predict sentiment for each sentence

    Parameters
    ----------
    sentence: a sentence, or a list of sentences, to predict

    Results
    -------
    results: list of dictionaries, result from the sentiment analysis model
    '''
    # The tokenizer and model are loaded only on the first call; reloading them
    # for every review made each call pay the full model start-up cost.
    classifier = _get_sentiment_classifier()
    # Truncation has to be requested at call time to take effect; it keeps
    # over-long inputs within the model's maximum sequence length.
    results = classifier(sentence, truncation=True)
    return results

def get_sentiment(df):
    '''
    Build a frame of cleaned reviews with their predicted sentiment

    Parameters
    ----------
    df: DataFrame with a 'cleaned' column (one list of cleaned sentences per row)

    Results
    -------
    final_df: DataFrame with columns 'Review' (list of cleaned sentences),
              'Sentiment' (list of 'Positive'/'Negative', one per sentence) and
              'Sentiment Score' (list of model scores, one per sentence)
    '''
    label_names = {'LABEL_1': 'Positive', 'LABEL_0': 'Negative'}

    def _scores_of(predictions):
        return [prediction['score'] for prediction in predictions]

    def _labels_of(predictions):
        return [label_names[prediction['label']] for prediction in predictions]

    cleaned_reviews = df['cleaned'].drop_duplicates().dropna().to_list()
    # one model call per review; each review is a list of sentences
    raw_predictions = [sentiment_analysis(sentences) for sentences in tqdm(cleaned_reviews)]

    final_df = pd.DataFrame(pd.Series(cleaned_reviews), columns = ['Review'])
    final_df['Sentiment'] = pd.Series(raw_predictions)

    # scores are extracted first because 'Sentiment' is overwritten with labels next
    final_df['Sentiment Score'] = final_df['Sentiment'].progress_apply(_scores_of)
    final_df['Sentiment'] = final_df['Sentiment'].progress_apply(_labels_of)
    return final_df


def predict_topic(sentences, model):
    '''
    Name the best-matching topic of every sentence

    Parameters
    ----------
    sentences: list of sentences
    model: pre-trained model exposing find_topics(text, top_n=...)

    Results
    -------
    lst_topics: list of topic names, one per sentence; '' when the best topic
                id has no name assigned below
    '''
    # topic id returned by the model -> topic name
    topic_names = {
        0: "Location",
        1: "Environment",
        2: "Service and Management",
        3: "Facilities",
        6: "Facilities",
        4: "Financial Value",
        5: "Financial Value",
    }
    return [
        topic_names.get(model.find_topics(sentence, top_n = 1)[0][0], '')
        for sentence in sentences
    ]

def get_topics(df, model):
    '''
    Add the predicted topics to the sentiment frame

    Parameters
    ----------
    df: DataFrame with a 'Review' column (one list of sentences per row)
    model: pre-trained model passed through to predict_topic

    Results
    -------
    df: the same DataFrame with a 'Topics' column, the list of topics
        predict_topic returns for each row's sentences
    '''
    def _topics_for(sentences):
        return predict_topic(sentences, model)

    df['Topics'] = df['Review'].progress_apply(_topics_for)
    return df

def run_analysis(df, topic_model=None):
    '''
    Get dataframe with sentiment and topics

    Parameters
    ----------
    df: DataFrame from run (needs the 'cleaned' column)
    topic_model: model passed to get_topics; when omitted, the notebook-level
                 `model` is used, as before

    Results
    -------
    final_result: DataFrame with 4 columns: 'Review', 'Sentiment',
                  'Sentiment Score' and 'Topics'
    '''
    if topic_model is None:
        # Backward-compatible default: the global loaded in the model-import cell.
        topic_model = model
    sa_result = get_sentiment(df)
    print('\nSentiment Analysis - done')
    final_result = get_topics(sa_result, topic_model)
    print('\nTopic modelling - done')
    return final_result

In [6]:
# List of condos to scrape: the unique values of the "Condominium Name" column.
# The scraping loop and the results cell both index into `lst`.
train_df = pd.read_csv("Data/Reviews/condo_missing_values.csv") # file to upload
lst = train_df["Condominium Name"].unique()
# Show the first name as a quick check of what was loaded.
lst[0]

'Hana'

In [7]:
def get_final_score(result):
    '''
    Average the signed sentiment score of every topic

    Parameters
    ----------
    result: DataFrame whose 'Topics', 'Sentiment' and 'Sentiment Score' columns
            hold one list per row (parallel lists, one entry per sentence)

    Results
    -------
    final_score: list of 5 numbers ordered Location, Facilities,
                 Financial Value, Service and Management, Environment. Each is
                 the mean of the topic's scores, counted as positive for
                 'Positive' sentences and negative otherwise; 0 when the topic
                 has no scored sentence
    '''
    index = {'Location': 0, 'Facilities': 1, 'Financial Value': 2, 'Service and Management': 3, 'Environment': 4}
    scores_by_topic = [[] for _ in index]
    for _, row in result.iterrows():
        # zip stops at the shortest list, so an entry without a matching
        # sentiment or score is skipped
        for topic, sentiment, value in zip(row['Topics'], row['Sentiment'], row['Sentiment Score']):
            slot = index.get(topic)
            if slot is None:
                # '' (no topic assigned) or a topic name that is not scored
                continue
            signed = value if sentiment == 'Positive' else -1 * value
            scores_by_topic[slot].append(signed)
    return [sum(values) / len(values) if values else 0 for values in scores_by_topic]

In [8]:
# Accumulator for the per-condo scores: the scraping loop appends one
# get_final_score() result per condominium. Re-running this cell empties it.
score = []

In [None]:
# df = run("Central Green")
# result = run_analysis(df)
# result

100%|██████████| 66/66 [02:31<00:00,  2.30s/it]
100%|██████████| 66/66 [00:00<00:00, 70564.38it/s]
100%|██████████| 66/66 [00:00<00:00, 69887.42it/s]



Sentiment Analysis - done


100%|██████████| 66/66 [00:01<00:00, 33.85it/s]



Topic modelling - done


Unnamed: 0,Review,Sentiment,Sentiment Score,Topics
0,[agiou titou heraklio greece excellent locatio...,[Positive],[0.9996849298477173],[]
1,[child age welcome],[Positive],[0.9996901750564575],[Environment]
2,[crib subject availability],[Positive],[0.998102605342865],[]
3,[spacious spotless tastefully decorated apartm...,"[Positive, Positive, Positive, Positive, Posit...","[0.9996845722198486, 0.9996802806854248, 0.999...","[Location, Environment, Facilities, Facilities..."
4,[cancellation prepayment policy vary according...,"[Negative, Positive]","[0.9969967603683472, 0.9996849298477173]","[Financial Value, Financial Value]"
...,...,...,...,...
61,[room rental chinatown mrt],[Positive],[0.912261426448822],[Location]
62,[last room big clean cozy nice condo master ro...,[Positive],[0.999685525894165],[Location]
63,[sign property account search private transact...,[Positive],[0.9996176958084106],[Financial Value]
64,[irwell hill residence walk orchard road great...,"[Positive, Positive]","[0.9996856451034546, 0.9996798038482666]","[Location, Location]"


In [9]:
for condo_name in lst[333:]: # Change here to get other condos
    try:
        df = run(condo_name)
        result = run_analysis(df)
        score.append(get_final_score(result))
    except Exception as err:
        # One failing condo must not abort the whole batch. Append the [0]
        # placeholder so `score` keeps exactly one entry per condo; the results
        # cell expands [0] into five zeros.
        print(f"Skipping {condo_name!r}: {err}")
        score.append([0])

  3%|▎         | 1/36 [00:04<02:31,  4.34s/it]

In [None]:
# Position in `lst` of the first condo that was scored. It must match the slice
# start used by the scraping loop (`lst[333:]`); otherwise every score row is
# labelled with the wrong condominium name.
START_INDEX = 333

lstResult = []
for offset, condo_score in enumerate(score):
    # A [0] entry stands for "no scores" and is expanded to one zero per topic.
    values = [0, 0, 0, 0, 0] if condo_score == [0] else list(condo_score)
    lstResult.append([lst[START_INDEX + offset], *values])
lstResult

[['Simei Green Condominium',
  0.988912308216095,
  0,
  0.5546636382738749,
  -0.30900539954503375,
  0],
 ['Valley Park',
  0.6692292142887505,
  0.47386491999906655,
  0.4563080195118399,
  0.37702588737010956,
  0.9964307347933451],
 ['J Gateway',
  0.7125887402466365,
  0.28542467951774597,
  0.21435705216034598,
  0.04862618843714396,
  0.15385537488119944],
 ['SunGlade',
  0.5962262948354086,
  0,
  0.46286771032545304,
  -0.1859258770942688,
  0.9996846914291382],
 ['Vacanza @ East',
  0.9987764835357666,
  0,
  0.3338143825531006,
  -0.3259715636571248,
  0]]

In [None]:
# Passing the column names at construction also works when lstResult is empty;
# assigning `.columns` afterwards raises a length mismatch on an empty frame.
review = pd.DataFrame(
    lstResult,
    columns=['Condominium Name', 'Location', 'Facilities', 'Financial Value', 'Service and Management', 'Environment'],
)
review.to_csv("review_condo_333.csv") # get csv to download

# Check Accuracy (Can Ignore)

In [None]:
# Load the manually labelled reviews used to check the models' predictions.
# NOTE(review): absolute "/content/..." path; presumably a Colab upload
# location, so confirm or adjust it when running elsewhere.
manual = pd.read_csv("/content/Manual_Review_IS4242 - short_manual_review.csv")
# Preview the first five rows.
manual.head(5)

Unnamed: 0,Reviews,Topics,Sentiment
0,kid studying bukit timah area property far eno...,Location,0.0
1,terrible condo unimpressive entrance,Facilities,-1.0
2,lot noise road,Environment,-1.0
3,not expect much peace quiet,Environment,-1.0
4,traffic abysmal well,Environment,-1.0


In [None]:
# Compare the topic and sentiment predictions with the manual labels.
# Rows labelled "Other" and rows on which either model raises are left out of
# both accuracy figures.
MAX_ROWS = 900  # upper bound on the number of manual rows checked
n_rows = min(MAX_ROWS, len(manual))  # avoids an IndexError on a shorter file

topicTrue = 0
neutral = 0
errCount = 0
sentimentTrue = 0
for i in range(n_rows):
    row = manual.iloc[i]
    sentence = row["Reviews"]
    topic = row["Topics"]
    # Deliberately not named `score`: that name is the notebook-level list
    # filled by the scraping loop and must not be overwritten here.
    trueScore = row["Sentiment"]
    if topic == "Other":
        neutral += 1
        continue
    try:
        idx = model.find_topics(sentence, top_n = 1)[0][0]
        # first (only) prediction for the single sentence
        prediction = sentiment_analysis(sentence)[0]
    except Exception:
        errCount += 1
        continue
    predTopic = ""
    if idx == 1:
        predTopic = "Environment"
    elif idx == 0:
        predTopic = "Location"
    elif idx == 2:
        predTopic = "Service and Management"
    elif (idx == 3) or (idx == 6):
        predTopic = "Facilities"
    elif (idx == 4) or (idx == 5):
        # NOTE(review): predict_topic names these topics "Financial Value";
        # confirm which label the manual file uses.
        predTopic = "Investment Value"
    if predTopic == topic:
        topicTrue += 1
    # Signed score: LABEL_1 is the positive class (see map_label in
    # get_sentiment); anything else counts as negative.
    if prediction['label'] == 'LABEL_1':
        predScore = prediction['score']
    else:
        predScore = -prediction['score']
    adjustedScore = 0
    if predScore < 0:
        adjustedScore = -1
    elif predScore > 0:
        adjustedScore = 1
    # Compare against the manual label, not against the prediction itself.
    if adjustedScore == trueScore:
        sentimentTrue += 1

# Only rows that reached the comparisons count towards the accuracy.
evaluated = n_rows - errCount - neutral
print(topicTrue)
print("Topic", topicTrue*100/evaluated if evaluated else 0)
print(sentimentTrue)
print("Sent", sentimentTrue*100/evaluated if evaluated else 0)