In [None]:
!pip install bertopic
!pip install bertopic[visualization]
from bertopic import BERTopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 KB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m45.4 MB/s[0m 

In [None]:
# Import Model
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code; only load "model_bert_new.pkl" from a trusted source.
# The resulting `model` is a notebook-level global that run_analysis reads.
import joblib
model = joblib.load("model_bert_new.pkl")

In [None]:
# Visualize Model
# Calls visualize_barchart() on the loaded model; being the last expression in
# the cell, its return value is what the notebook displays.
model.visualize_barchart()

In [None]:
!pip install requests_html
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
from bs4 import BeautifulSoup

import nltk
# Download the NLTK resources this notebook relies on.
# NOTE(review): 'averaged_perceptron_tagger' and 'brown' do not appear to be used
# by the code visible in this notebook - confirm before removing.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('brown')
nltk.download('punkt')
nltk.download('omw-1.4')

from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

from string import punctuation
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

# Notebook-level helpers. process_text uses `lemmatizer`; filter_text and
# process_text each build their own local `stop_words`, so this global list is
# shadowed inside them.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# NOTE(review): `sia` is created here but not referenced by the functions visible
# in this notebook - presumably left over from an earlier approach; verify.
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def get_source(url, timeout=10):
    '''
    Fetch a page through a requests_html session

    Parameters
    ----------
    url: address of the page to fetch
    timeout: seconds to wait for the server before giving up; without a
        timeout a stalled host would block the notebook indefinitely

    Results
    -------
    response: the response returned by the session, or None when the request
        failed (the error is printed, not raised)
    '''
    try:
        session = HTMLSession()
        return session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and let the caller decide what to do
        print(e)
        return None

def scrape(q):
    '''
    Run a Google search and collect the result links worth scraping

    Parameters
    ----------
    q: search query (plain text, it is URL-encoded here)

    Results
    -------
    links: list of absolute links found on the result page, without any link
        containing one of the excluded domain fragments; empty list when the
        search page could not be fetched
    '''
    # Imported locally so the submodule is guaranteed to be loaded: a bare
    # `import urllib` does not by itself make `urllib.parse` available.
    from urllib.parse import quote_plus

    exclude_domains = ('google', 'https://google.', 'https://webcache.googleusercontent.', 'http://webcache.googleusercontent.', 'https://policies.google.',
                       'https://support.google.','https://maps.google.','https://www.instagram.','https://www.youtube.', 'facebook', 'tripadvisor')

    result = get_source("https://www.google.com/search?q=" + quote_plus(q))
    if result is None:
        # get_source already reported the error; nothing to scrape
        return []

    # Keep a link only if none of the excluded fragments occurs in it
    return [
        link for link in result.html.absolute_links
        if not any(domain in link for domain in exclude_domains)
    ]

def filter_text(txt):
    '''
    Decide whether a scraped paragraph looks like real review text

    Parameters
    ----------
    txt: paragraph text (may be None or empty)

    Results
    -------
    bool: False when the text is missing, shorter than 30 characters, or
        contains any of the boilerplate markers (case-insensitive); True otherwise
    '''
    blocked_markers = ["inbox","©",":","=","@", "copyright", "cookies","..","\xa0","min","redirecting…","seconds…", "#", '()', "captcha",'redirect','anti-virus','malware','JavaScript','developer','technology','subscribe','learn more…','support us', 'articles', 'article', 'content', 'blog', '.com']
    if not txt or len(txt)<30:
        return False
    lowered = txt.lower()
    # Lower-case the marker too: mixed-case entries such as 'JavaScript' could
    # otherwise never match the lower-cased text.
    return not any(marker.lower() in lowered for marker in blocked_markers)

def get_text(links, timeout=10):
    '''
    Download each link and collect the paragraphs that pass filter_text

    Parameters
    ----------
    links: iterable of URLs to download
    timeout: seconds to wait for each server before skipping the link

    Results
    -------
    df: DataFrame with one column 'Content', one row per kept paragraph, in
        page order (duplicates within a page are dropped)
    '''
    result = []
    for url in links:
        try:
            page = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # One unreachable site should not discard the text already collected
            print("Skipping", url, "-", e)
            continue
        soup = BeautifulSoup(page.content, "html.parser")
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # resulting rows are reproducible between runs
        paragraphs = dict.fromkeys(tag.get_text() for tag in soup.find_all('p'))
        result.extend(text for text in paragraphs if filter_text(text))
    return pd.DataFrame(result, columns = ['Content'])

def process_text(text):
    '''
    Normalise a piece of text into a space-separated string of lemmas

    Parameters
    ----------
    text: raw text (a sentence or a whole paragraph)

    Results
    -------
    result: lower-cased, punctuation-free, alphabetic-only tokens with English
        stop words removed (except 'not') and lemmatized, joined by spaces
    '''
    # Flatten line breaks and lower-case before tokenising
    tokens = word_tokenize(text.replace('\n', ' ').lower())

    # Strip punctuation characters from every token, then keep only the tokens
    # that are purely alphabetic afterwards
    punctuation_table = str.maketrans('', '', punctuation)
    alphabetic = []
    for tok in tokens:
        bare = tok.translate(punctuation_table)
        if bare.isalpha():
            alphabetic.append(bare)

    # English stop words are dropped; 'not' is taken out of the stop list so it
    # stays in the output
    ignored = set(stopwords.words('english'))
    ignored.remove('not')

    return " ".join(lemmatizer.lemmatize(w) for w in alphabetic if w not in ignored)

def clean_text(df):
    '''
    Add tokenised and cleaned versions of the 'Content' column

    Parameters
    ----------
    df: DataFrame with a 'Content' column of raw text

    Results
    -------
    df: the same DataFrame (modified in place) with three added columns:
        'review_splitted' (list of sentences), 'cleaned' (list of cleaned
        sentences) and 'cleaned_text' (the whole text cleaned as one string)
    '''
    def _clean_each(sentences):
        return [process_text(one_sentence) for one_sentence in sentences]

    raw_content = df['Content']
    df['review_splitted'] = raw_content.apply(sent_tokenize)
    df['cleaned'] = df['review_splitted'].apply(_clean_each)
    df['cleaned_text'] = raw_content.apply(process_text)
    return df

def run(name):
    '''
    Scrape and clean review text for one condominium

    Parameters
    ----------
    name: condominium name; " condominium review" is appended to form the search query

    Results
    -------
    df: DataFrame returned by clean_text for the scraped paragraphs
    '''
    query = name + " condominium review"
    return clean_text(get_text(scrape(query)))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting requests_html
  Downloading requests_html-0.10.0-py3-none-any.whl (13 kB)
Collecting parse
  Downloading parse-1.19.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fake-useragent
  Downloading fake_useragent-1.1.3-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting w3lib
  Downloading w3lib-2.1.1-py3-none-any.whl (21 kB)
Collecting pyquery
  Downloading pyquery-2.0.0-py3-none-any.whl (22 kB)
Collecting pyppeteer>=0.0.14
  Downloading pyppeteer-1.0.2-py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.4/83.4 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting websockets<11.0,>=10.0
  Down

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import AutoConfig
from bertopic import BERTopic
import joblib
from tqdm import tqdm
tqdm.pandas()

# Ready-to-use pipelines keyed by model id, so tokenizer and weights are loaded
# once per kernel instead of once per call.
_SENTIMENT_CLASSIFIERS = {}


def _get_sentiment_classifier(finetuned_model):
    '''Build (on first use) and return the sentiment pipeline for finetuned_model.'''
    if finetuned_model not in _SENTIMENT_CLASSIFIERS:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', config=AutoConfig.from_pretrained(finetuned_model), padding=True, truncation=True)
        classifier_model = AutoModelForSequenceClassification.from_pretrained(finetuned_model)
        _SENTIMENT_CLASSIFIERS[finetuned_model] = pipeline("sentiment-analysis", model=classifier_model, tokenizer=tokenizer)
    return _SENTIMENT_CLASSIFIERS[finetuned_model]


def sentiment_analysis(sentence, finetuned_model="potatobunny/results-yelp"):
    '''
    Predict sentiment for each sentence

    Parameters
    ----------
    sentence: list of sentences to predict
    finetuned_model: Hugging Face repo id of the sequence-classification model;
        defaults to the model fine-tuned on the yelp dataset

    Results
    -------
    results: list of dictionaries, result from the sentiment analysis model
    '''
    # The pipeline is cached: this function is called once per review, and
    # reloading the model on every call dominated the run time.
    classifier = _get_sentiment_classifier(finetuned_model)
    return classifier(sentence)

def get_sentiment(df):
    '''
    Build a DataFrame of reviews with their per-sentence sentiment

    Parameters
    ----------
    df: DataFrame with a 'cleaned' column (list of cleaned sentences per review)

    Results
    -------
    final_df: DataFrame with 3 columns, Review: list of cleaned sentences,
        Sentiment: 'Positive'/'Negative' label per sentence,
        Sentiment Score: model score per sentence
    '''
    label_names = {'LABEL_1': 'Positive', 'LABEL_0': 'Negative'}

    def _scores(predictions):
        return [prediction['score'] for prediction in predictions]

    def _labels(predictions):
        return [label_names[prediction['label']] for prediction in predictions]

    reviews = df['cleaned'].drop_duplicates().dropna().to_list()

    # One model call per review; each call scores all sentences of that review
    raw_predictions = [sentiment_analysis(sentences) for sentences in tqdm(reviews)]

    final_df = pd.DataFrame(pd.Series(reviews), columns = ['Review'])
    final_df['Sentiment'] = pd.Series(raw_predictions)

    # Scores must be extracted first: the next line overwrites the raw
    # predictions in 'Sentiment' with the readable labels
    final_df['Sentiment Score'] = final_df['Sentiment'].progress_apply(_scores)
    final_df['Sentiment'] = final_df['Sentiment'].progress_apply(_labels)
    return final_df


def predict_topic(sentences, model):
    '''
    Get topic for each sentence

    Parameters
    ----------
    sentences: list of sentences
    model: pre-trained model exposing find_topics(text, top_n=...)

    Results
    -------
    lst_topics: list of topic names corresponding to the sentences; a sentence
        whose best topic index has no name assigned gets an empty string
    '''
    # Topic index reported by the model -> human-readable category
    topic_names = {
        0: "Location",
        1: "Environment",
        2: "Service and Management",
        3: "Facilities",
        6: "Facilities",
        4: "Financial Value",
        5: "Financial Value",
    }
    # find_topics returns a pair whose first element is the list of topic
    # indices; with top_n = 1 only the best match is taken
    best_indices = (model.find_topics(text, top_n = 1)[0][0] for text in sentences)
    return [topic_names.get(topic_idx, '') for topic_idx in best_indices]

def get_topics(df, model):
    '''
    Add the predicted topics to a sentiment DataFrame

    Parameters
    ----------
    df: DataFrame with a 'Review' column (list of sentences per row)
    model: pre-trained topic model passed on to predict_topic

    Results
    -------
    df: the same DataFrame with a 'Topics' column added, list of topics from predict_topic
    '''
    def _topics_for(review_sentences):
        return predict_topic(review_sentences, model)

    df['Topics'] = df['Review'].progress_apply(_topics_for)
    return df

def run_analysis(df, topic_model=None):
    '''
    Get dataframe with sentiment and topics

    Parameters
    ----------
    df: DataFrame with the cleaned sentences, as expected by get_sentiment
    topic_model: topic model handed to get_topics; when omitted, the
        notebook-level `model` loaded earlier is used

    Results
    -------
    final_result: DataFrame from get_sentiment (cleaned sentences, sentiment,
        sentiment score) with the list of topics added
    '''
    if topic_model is None:
        # Backward-compatible default: fall back to the global loaded with joblib
        topic_model = model
    sa_result = get_sentiment(df)
    print('\nSentiment Analysis - done')
    final_result = get_topics(sa_result, topic_model)
    print('\nTopic modelling - done')
    return final_result

In [None]:
# List of Condo to Scrape
# `lst` holds the distinct values of the "Property Name" column; later cells
# index into it by position, so its order matters.
train_df = pd.read_csv("df_srx_Condo.csv") # file to upload
lst = train_df["Property Name"].unique()

In [None]:
def get_final_score(result):
    '''
    Average the signed sentiment scores per topic

    Parameters
    ----------
    result: DataFrame with columns 'Topics', 'Sentiment' and 'Sentiment Score',
        each holding one list per row (aligned by position)

    Results
    -------
    final_score: list of 5 averages in the order Location, Facilities,
        Financial Value, Service and Management, Environment. Positive
        sentences count as +score, all others as -score. A topic with no
        sentences scores 0.0 instead of raising ZeroDivisionError.
    '''
    categories = ('Location', 'Facilities', 'Financial Value', 'Service and Management', 'Environment')
    scores_by_topic = {topic: [] for topic in categories}

    rows = zip(result['Topics'], result['Sentiment'], result['Sentiment Score'])
    for topics, sentiments, confidences in rows:
        for topic, sentiment, confidence in zip(topics, sentiments, confidences):
            if topic not in scores_by_topic:
                # Sentences without a recognised topic (e.g. '') are ignored
                continue
            signed = confidence if sentiment == 'Positive' else -1 * confidence
            scores_by_topic[topic].append(signed)

    final_score = []
    for topic in categories:
        collected = scores_by_topic[topic]
        final_score.append(sum(collected) / len(collected) if collected else 0.0)
    return final_score

In [None]:
# Variable to store
# One entry is appended per processed condo: the list returned by
# get_final_score, or the placeholder [0] when processing that condo failed.
# Re-running this cell discards everything collected so far.
score = []

In [None]:
# Process the next batch of condos. The batch always starts at len(score), so
# score[k] belongs to lst[k] - which is how names and scores are paired when the
# result table is built. Re-run this cell to continue with the following batch.
BATCH_SIZE = 5 # Change here to process more or fewer condos per run
batch_start = len(score)
for i in lst[batch_start:batch_start + BATCH_SIZE]:
  try:
    df = run(i)
    result = run_analysis(df)
    score.append(get_final_score(result))
  except Exception as err:
    # Best effort: keep going with the [0] placeholder, but say what failed.
    # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
    print("Failed to process", i, "-", err)
    score.append([0])

  0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 13/13 [00:40<00:00,  3.15s/it]
100%|██████████| 13/13 [00:00<00:00, 7313.03it/s]
100%|██████████| 13/13 [00:00<00:00, 10469.65it/s]



Sentiment Analysis - done


100%|██████████| 13/13 [00:00<00:00, 19.84it/s]



Topic modelling - done


 84%|████████▍ | 119/141 [04:21<00:48,  2.20s/it]


In [None]:
# Build one row per collected score: the condo name found at the same position
# in lst, followed by its scores. The [0] placeholder is expanded to five zeros
# so every row has the same number of fields.
lstResult = [
  [lst[pos]] + ([0, 0, 0, 0, 0] if entry == [0] else list(entry))
  for pos, entry in enumerate(score)
]
lstResult

In [None]:
# Label the columns (condo name followed by one score per topic) and export
REVIEW_COLUMNS = ['Condominium Name', 'Location', 'Facilities', 'Financial Value', 'Service and Management', 'Environment']
review = pd.DataFrame(lstResult)
review.columns = REVIEW_COLUMNS
review.to_csv("review_condo.csv") # get csv to download