# Set up Environment

In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q -U "tensorflow-text==2.9.*"
!pip install -q tf-models-official
!pip install opencv-python
!pip install transformers
!pip install tokenizer

[K     |████████████████████████████████| 4.6 MB 29.6 MB/s 
[K     |████████████████████████████████| 511.7 MB 6.5 kB/s 
[K     |████████████████████████████████| 438 kB 63.9 MB/s 
[K     |████████████████████████████████| 1.6 MB 59.7 MB/s 
[K     |████████████████████████████████| 5.8 MB 57.8 MB/s 
[K     |████████████████████████████████| 2.1 MB 31.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 50.0 MB/s 
[K     |████████████████████████████████| 352 kB 75.9 MB/s 
[K     |████████████████████████████████| 43 kB 2.5 MB/s 
[K     |████████████████████████████████| 99 kB 11.5 MB/s 
[K     |████████████████████████████████| 237 kB 63.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 67.0 MB/s 
[K     |████████████████████████████████| 48.3 MB 1.2 MB/s 
[K     |████████████████████████████████| 636 kB 56.1 MB/s 
[K     |████████████████████████████████| 92 kB 13.9 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel f

In [3]:
## for data
import pandas as pd
import numpy as np
from sklearn import metrics, manifold
from tqdm import tqdm

## for processing
import re
import nltk
from nltk import wordnet 
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for w2v
import gensim
import gensim.downloader as gensim_api

## for BERT
import transformers
import os, sys, collections
import tensorflow as tf

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [4]:
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

# Process dataset

In [5]:
import pandas as pd

data = pd.read_csv(r'/content/drive/MyDrive/SMU_MITB_NLP/project/airbnb_labelled.csv')

In [6]:
data

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,Review,Category,Sentiment
0,11391287,2.081364e+08,31/10/17,68485599,Ali,Fantastic location and fantastic hosts for a v...,location,positive
1,50319142,5.744510e+17,3/3/22,81633638,Christophe,Super happy with this stylish and comfy apartm...,communicate,positive
2,272282,5.281465e+07,2/11/15,8528459,Rebecca,Mar made us feel very welcomed with great dire...,communicate,positive
3,4683548,4.758410e+17,18/10/21,190705793,Ron,Andres is an amazing host! <br/>Not only the a...,accurate,positive
4,7838608,1.923088e+08,10/9/17,12034187,Giuseppe,A lovely room which was spacious and clean. Al...,cleanliness,positive
...,...,...,...,...,...,...,...,...
295,19154124,6.235263e+08,1/5/20,261715322,Ivan,"Super host, super flat",value,positive
296,5645973,3.043967e+08,9/8/18,164747779,Arielle,We had a wonderful time in Barcelona! <br/>The...,value,positive
297,23233260,4.518442e+08,12/5/19,242067326,Sophie,"Beautifully modern and stylish apartment, nice...",location,positive
298,1916224,1.953741e+08,19/9/17,93750103,Scott,This spot is great for 3-4 people or less. Ex...,location,positive


# Generate out embedding

word tokenize

In [7]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    '''
    Preprocess a string.
    :parameter
        :param text: string - name of column containing text
        :param lst_stopwords: list - list of stopwords to remove
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmitisation is to be applied
    :return
        cleaned text
    '''

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    print(text)
    text = re.sub(r'[^A-z\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    print(lst_text)
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [8]:
data_word2vec = pd.read_csv('/content/drive/MyDrive/SMU_MITB_NLP/project/cleaned_reviews_prelim_v2.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
data_word2vec.head()

Unnamed: 0.1,Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_stopwordsrm,comments_stopwordsrm_lemma
0,0,18674,4808211,2013-05-27,4841196.0,Caron,"Great location. Clean, spacious flat. Would re...",great location clean spacious flat would recom...,great location clean spacious flat would recom...
1,2,18674,41087522,2015-08-04,35231385.0,Shlomi,"Big apartment, well equipped.\r<br/>Very good ...",big apartment well equipped very good service ...,big apartment well equip very good service exc...
2,3,18674,81000756,2016-06-20,23223644.0,Joost,The Check in was fast and flexible. The price ...,the check fast flexible the price fair flat is...,the check fast flexible the price fair flat is...
3,4,18674,278588962,2018-06-18,4756672.0,Marius,Great location and enough space in the apartme...,great location enough space apartment people a...,great location enough space apartment people a...
4,5,18674,408388148,2019-02-04,118847863.0,Sonora,Our experience here was mixed. <br/>The locati...,our experience mixed the location flat super c...,-PRON- experience mix the location flat super ...


In [9]:
lst_stopwords = nltk.corpus.stopwords.words("english")

In [10]:
data['review_clean'] = data['Review'].apply(lambda x: preprocess_text(x))
data.head()

Fantastic location and fantastic hosts for a very reasonable price, would definitely use again!
['fantastic', 'location', 'and', 'fantastic', 'hosts', 'for', 'a', 'very', 'reasonable', 'price', 'would', 'definitely', 'use', 'again']
Super happy with this stylish and comfy apartment for our family of 5. Easy check-in, hospitable service, great amenities and situated in a beautiful building. Loved our stay and we'll be back!
['super', 'happy', 'with', 'this', 'stylish', 'and', 'comfy', 'apartment', 'for', 'our', 'family', 'of', 'easy', 'checkin', 'hospitable', 'service', 'great', 'amenities', 'and', 'situated', 'in', 'a', 'beautiful', 'building', 'loved', 'our', 'stay', 'and', 'well', 'be', 'back']
Mar made us feel very welcomed with great directions, guestbook and assistance before our arrival.  We loved the residential feel to the building and the neighbourhood, with a train located just around the corner but we walked every where and great to get away from the crowds. 
<br/>
<br/>The 

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,Review,Category,Sentiment,review_clean
0,11391287,208136400.0,31/10/17,68485599,Ali,Fantastic location and fantastic hosts for a v...,location,positive,fantastic location and fantastic host for a ve...
1,50319142,5.74451e+17,3/3/22,81633638,Christophe,Super happy with this stylish and comfy apartm...,communicate,positive,super happy with this stylish and comfy apartm...
2,272282,52814650.0,2/11/15,8528459,Rebecca,Mar made us feel very welcomed with great dire...,communicate,positive,mar made u feel very welcomed with great direc...
3,4683548,4.75841e+17,18/10/21,190705793,Ron,Andres is an amazing host! <br/>Not only the a...,accurate,positive,andres is an amazing host brnot only the apart...
4,7838608,192308800.0,10/9/17,12034187,Giuseppe,A lovely room which was spacious and clean. Al...,cleanliness,positive,a lovely room which wa spacious and clean also...


In [16]:
glove = gensim_api.load("glove-wiki-gigaword-300")



In [17]:
## create annotated labels first
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
data['review_clean']

0      fantastic location and fantastic host for a ve...
1      super happy with this stylish and comfy apartm...
2      mar made u feel very welcomed with great direc...
3      andres is an amazing host brnot only the apart...
4      a lovely room which wa spacious and clean also...
                             ...                        
295                                super host super flat
296    we had a wonderful time in barcelona brthe apa...
297    beautifully modern and stylish apartment nice ...
298    this spot is great for people or le extremely ...
299    amazing stay thanks so much everything wa perfect
Name: review_clean, Length: 300, dtype: object

In [None]:
import re
#remove numbers from sentence
sent_input = []
for sent in data_word2vec['comments_stopwordsrm_lemma']:
    #tokens = sent.split()
    text_only = re.sub(r'[^a-z|\s]', "", str(sent))
    sent_input.append(text_only.split())

In [None]:
len(sent_input)

395340

In [None]:
#Train a custom model 
word2vec_model = Word2Vec(sentences=sent_input, window=5, min_count=1, workers=4)

In [None]:
seed_word_list100={}

for cat in ['neutral', 'positive', 'negative']:
    seed_word_list100[cat] = word2vec_model.wv.most_similar(cat, topn=20)

In [None]:
neutral_words = []

for key in seed_word_list100['neutral']:
  x, y = key
  neutral_words.append(x)

print(neutral_words)

['bland', 'amenitiesworth', 'midcentury', 'renewal', 'scant', 'negligence', 'towelscushion', 'apron', 'celing', 'grit', 'eyecatche', 'finishing', 'styling', 'hermoso', 'eccentric', 'wellmanaged', 'fixin', 'decorationsthe', 'glowinthedark', 'akin']


In [None]:
positive_words = []

for key in seed_word_list100['positive']:
  x, y = key
  positive_words.append(x)

print(positive_words)

['negative', 'deserve', 'emotion', 'bad', 'fivestar', 'firstly', 'honest', 'contrary', 'superlative', 'dislike', 'extraordinary', 'horrible', 'constructive', 'unpleasant', 'disappointing', 'comment', 'remark', 'spoil', 'unpleasent', 'superhost']


In [None]:
negative_words = []

for key in seed_word_list100['negative']:
  x, y = key
  negative_words.append(x)

print(negative_words)

['minus', 'constructive', 'flaw', 'criticism', 'downside', 'remark', 'positive', 'drawback', 'downfall', 'gripe', 'bad', 'critique', 'disadvantage', 'improve', 'caveat', 'dislike', 'complaint', 'annoyance', 'improvement', 'note']


In [14]:
def get_similar_words(lst_words, top, nlp):
    lst_out = lst_words
    for tupla in nlp.most_similar(lst_words, topn=top):
        lst_out.append(tupla[0])
    return list(set(lst_out))

In [18]:
sentiment_clusters = {}

sentiment_clusters["neutral"] = get_similar_words(['bland','midcentury', 'renewal', 'scant', 'negligence', 'apron', 'grit', 'finishing', 'styling', 'hermoso'], top=30, nlp=glove)
sentiment_clusters["positive"] = get_similar_words(['negative', 'deserve', 'emotion', 'bad', 'firstly', 'honest', 'contrary', 'superlative', 'dislike', 'extraordinary'], top=30, nlp=glove)
sentiment_clusters["negative"] = get_similar_words(['minus', 'constructive', 'flaw', 'criticism', 'downside', 'remark', 'positive', 'drawback', 'downfall', 'gripe'], top=30, nlp=glove)

In [19]:
dic_clusters = {}

dic_clusters["Accuracy"] = get_similar_words(['consistent','helpfulness','corresponded','flawless','glowing','service','responsiveness','professionalism','punctuality','clarity'], top=30, nlp=glove)
dic_clusters["cleanliness"] = get_similar_words(['spotless','beautiful','spacious','neat','comfortable','convenient','bright','welcoming','basic','tidy'], top=30, nlp=glove)
dic_clusters["Communication"] = get_similar_words(['talk','contact','informative','approach','access','coordinate','dealing','manage','polite','soft'], top=30, nlp=glove)
dic_clusters["location"] = get_similar_words(['position','spot','neighbourhood','city','stay','beach','space','street','buildings','views'], top=30, nlp=glove)
dic_clusters["Value"] = get_similar_words(['reasonable','quality','deal','cheap','size','budget','small','choice','location','price'], top=30, nlp=glove)

In [20]:
tot_words = [word for v in dic_clusters.values() for word in v]
X = glove[tot_words]
        
## pca
pca = manifold.TSNE(perplexity=40, n_components=2, init='pca')
X = pca.fit_transform(X)

## create dtf
dtf_GloVe = pd.DataFrame()
for k,v in dic_clusters.items():
    size = len(dtf_GloVe) + len(v)
    dtf_group = pd.DataFrame(X[len(dtf_GloVe):size], columns=["x","y"], index=v)
    dtf_group["cluster"] = k
    dtf_GloVe = dtf_GloVe.append(dtf_group)
        
## plot
%matplotlib notebook
fig, ax = plt.subplots(figsize=(15,10))
sns.scatterplot(data=dtf_GloVe, x="x", y="y", hue="cluster", ax=ax)
#ax.legend().texts[0].set_text(None)
ax.legend()
ax.set(xlabel=None, ylabel=None, xticks=[], xticklabels=[], yticks=[], yticklabels=[])
for i in range(len(dtf_GloVe)):
    ax.annotate(dtf_GloVe.index[i], xy=(dtf_GloVe["x"].iloc[i], dtf_GloVe["y"].iloc[i]), xytext=(5,2), textcoords='offset points', ha='right', va='bottom')



<IPython.core.display.Javascript object>

In [None]:
data_word2vec['comments_stopwordsrm_lemma']

0         great location clean spacious flat would recom...
1         big apartment well equip very good service exc...
2         the check fast flexible the price fair flat is...
3         great location enough space apartment people a...
4         -PRON- experience mix the location flat super ...
                                ...                        
395335    -PRON- stay barcelona great thank eli able hel...
395336    really nice apartment stay -PRON- really well ...
395337                                               normal
395338    great apartment super clean new a lot space th...
395339                                 great stay recommend
Name: comments_stopwordsrm_lemma, Length: 395340, dtype: object

In [21]:
# category
## for BERT
import transformers
import tensorflow as tf
## bert tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
## bert model
nlp = transformers.TFBertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [22]:
## function to apply
def utils_bert_embedding(txt, tokenizer, nlp):
    '''
    Word embedding with Bert (equivalent to nlp["word"]).
    :parameter
        :param txt: string 
        :param tokenizer: transformers tokenizer
        :param nlp: transformers bert
    :return
        tensor sentences x words x vector (1x3x768) 
    '''
    # tokenize sentence to tokens (integers)
    idx = tokenizer.encode(txt)
#     print(type(idx))
    # convert to array of shape (1, num_words+2) - EOS and CLS added
    idx = np.array(idx)[None,:]

    # generate embeddings for each token - output is a tuple
    embedding = nlp(idx)
    
#     # select first member of the tuple, remove first dimension which is 1 to get (num_words,embedding size 712)
#     # exclude CLS and EOS tokens
    X = np.array(embedding[0][0][1:-1])
    return X

In [25]:
training_words = data_word2vec['comments_stopwordsrm_lemma'].dropna().sample(5000)

In [26]:
lst_mean_vecs = [utils_bert_embedding(txt, tokenizer, nlp).mean(0) for txt in tqdm(training_words)]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0%|          | 2/5000 [00:01<52:03,  1.60it/s]  [A
  0%|          | 3/5000 [00:01<42:04,  1.98it/s][A
  0%|          | 4/5000 [00:01<35:40,  2.33it/s][A
  0%|          | 5/5000 [00:02<32:06,  2.59it/s][A
  0%|          | 6/5000 [00:02<30:24,  2.74it/s][A
  0%|          | 7/5000 [00:02<28:28,  2.92it/s][A
  0%|          | 8/5000 [00:03<27:57,  2.98it/s][A
  0%|          | 9/5000 [00:03<28:19,  2.94it/s][A
  0%|          | 10/5000 [00:03<28:13,  2.95it/s][A
  0%|          | 11/5000 [00:04<27:50,  2.99it/s][A
  0%|          | 12/5000 [00:04<26:41,  3.11it/s][A
  0%|          | 13/5000 [00:04<25:36,  3.25it/s][A
  0%|          | 14/5000 [00:05<24:44,  3.36it/s][A
  0%|          | 15/5000 [00:05<25:48,  3.22it/s][A
  0%|          | 16/5000 [00:05<25:42,  3.23it/s][A
  0%|          | 17/5000 [00:06<27:09,  3.06it/s][A
  0%|          | 18/5000 [00:06<26:16,  3.16it/s][A
  0%|          | 19/5000 [00:06<25:54,  

In [27]:
# Create y as {label:mean_vector}
dic_y = {k:utils_bert_embedding(v, tokenizer, nlp).mean(0) for k,v in tqdm(dic_clusters.items())}


  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:00<00:00,  5.64it/s][A
 33%|███▎      | 2/6 [00:00<00:00,  5.73it/s][A
 50%|█████     | 3/6 [00:00<00:00,  5.84it/s][A
 67%|██████▋   | 4/6 [00:00<00:00,  5.92it/s][A
 83%|████████▎ | 5/6 [00:00<00:00,  5.87it/s][A
100%|██████████| 6/6 [00:01<00:00,  5.82it/s]


In [37]:
cat_dic = {"cleanliness": dic_clusters['cleanliness'],
            "value": dic_clusters['Value'],
            "accuracy": dic_clusters["Accuracy"],
            "communicate": dic_clusters["Communication"],
            "location": dic_clusters["location"]}
print(cat_dic)

{'cleanliness': ['lovely', 'look', 'welcoming', 'perfect', 'handsome', 'rooms', 'quite', 'comfortable', 'tidy', 'surroundings', 'easy', 'stylish', 'convenient', 'spacious', 'shiny', 'perfectly', 'sturdy', 'charming', 'wonderful', 'basic', 'luxurious', 'simple', 'decent', 'elegant', 'spotless', 'good', 'nice', 'gorgeous', 'neat', 'sleek', 'room', 'surprisingly', 'bright', 'pretty', 'beautiful', 'very', 'cheerful', 'pleasant', 'attractive', 'looks'], 'value': ['what', 'quality', 'if', 'yet', 'rather', 'better', 'reasonable', 'choice', 'well', 'even', 'need', 'but', 'enough', 'size', 'way', 'it', 'location', '.', 'deal', 'budget', 'amount', 'less', 'so', 'much', 'this', 'example', 'small', 'only', 'because', 'good', 'cheap', 'cost', 'fact', 'price', 'given', 'actually', 'make', 'same', 'that', 'though'], 'accuracy': ['tenacity', 'fairness', 'quality', 'helpfulness', 'cleanliness', 'mastery', 'flawless', 'reliability', 'corresponded', 'musicianship', 'aggressiveness', 'qualities', 'origina

In [38]:
cat_dic.keys()

dict_keys(['cleanliness', 'value', 'accuracy', 'communicate', 'location'])

In [39]:
data['category_pred'] = 1

In [40]:
for i in range(len(data['review_clean'])):
    counter = collections.defaultdict(list)
    word = nltk.RegexpTokenizer(r'\w+').tokenize(data['review_clean'][i])
    if len(word) != 0:
        for j in word:
            for cat in cat_dic:
                if j in cat_dic[cat]:
                    counter[cat].append(j)
          
    elif word == 'NA':
        counter["location"] = 1
    if counter == {}:
        counter["location"] = 1
    print(counter)
    data['category_pred'][i] = max(counter, key = counter.get)

defaultdict(<class 'list'>, {'value': ['location', 'reasonable', 'price'], 'cleanliness': ['very'], 'communicate': ['very']})
defaultdict(<class 'list'>, {'value': ['this', 'well'], 'cleanliness': ['stylish', 'easy', 'beautiful'], 'communicate': ['easy', 'well'], 'accuracy': ['service'], 'location': ['in', 'building', 'stay', 'well']})
defaultdict(<class 'list'>, {'cleanliness': ['very', 'lovely', 'lovely'], 'communicate': ['very', 'get', 'not', 'any', 'well'], 'location': ['the', 'the', 'building', 'the', 'neighbourhood', 'just', 'around', 'the', 'but', 'where', 'the', 'the', 'well', 'the', 'the', 'space'], 'value': ['but', 'well', 'this']})
defaultdict(<class 'list'>, {'value': ['only', 'what', 'it', 'so', 'much'], 'location': ['only', 'the', 'in', 'the', 'it', 'so', 'in', 'the'], 'communicate': ['you', 'so', 'more', 'very'], 'cleanliness': ['very', 'tidy']})
defaultdict(<class 'list'>, {'cleanliness': ['lovely', 'room', 'spacious', 'good'], 'value': ['good', 'location'], 'location':

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
data

Unnamed: 0,id,processed_comments,category,topic,score,sentiments
0,4.340133e+08,super fast easy checkin friendly service price...,checkin,"[stay, feel]",13.0,
1,2.081364e+08,fantastic location fantastic host reasonable p...,value,"[location, reasonable, price]",,
2,5.744510e+17,super happy stylish comfy apartment family eas...,value,[well],,
3,5.281465e+07,mar made u feel welcomed great direction guest...,value,[well],,
4,4.758410e+17,andres amazing host brnot apartment see pictur...,cleanliness,[tidy],,
...,...,...,...,...,...,...
176,4.258322e+08,amazing place location host br star,location,[place],,
177,7.395915e+07,great hostbrexcellent locationbrclean comforta...,value,[much],,
178,1.822565e+08,nice clean apartment located safe neighborhood...,cleanliness,"[nice, good]",,
179,9.683854e+07,mels place great easy get public transport eve...,checkin,[stay],,


In [44]:
# sentiments
# Create y as {label:mean_vector}
dic_y_sentiment = {k:utils_bert_embedding(v, tokenizer, nlp).mean(0) for k,v in tqdm(sentiment_clusters.items())}


  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  5.39it/s][A
 67%|██████▋   | 2/3 [00:00<00:00,  5.46it/s][A
100%|██████████| 3/3 [00:00<00:00,  5.53it/s]


In [45]:
sent_dic = {"positive": sentiment_clusters['positive'],
            "neutral": sentiment_clusters['neutral'],
            "negative": sentiment_clusters["negative"]}
print(sent_dic)

{'positive': ['dislike', 'what', 'understand', 'think', 'truly', 'extraordinary', 'unfortunately', 'quite', 'honest', 'nothing', 'impression', 'indeed', 'contrary', 'respect', 'things', 'regard', 'whatever', 'always', 'sense', 'firstly', 'superlative', 'belief', 'bad', 'thing', 'really', 'simply', 'negative', 'good', 'deserve', 'anything', 'sort', 'reason', 'fact', 'obviously', 'feelings', 'certainly', 'necessarily', 'kind', 'something', 'emotion'], 'neutral': ['tasteful', 'shabby', 'flamboyance', 'garish', 'craftsmanship', 'uninspired', 'devoid', 'refinement', 'bland', 'apron', 'aesthetic', 'esthetic', 'styling', 'stylish', 'finishing', 'minimalist', 'ornamentation', 'originality', 'renewal', 'boxy', 'simplicity', 'midcentury', 'grit', 'embellishment', 'facade', 'workmanship', 'panache', 'elegance', 'practicality', 'lacking', 'sleek', 'negligence', 'sartorial', 'aesthetics', 'exterior', 'facades', 'decor', 'scant', 'hermoso', 'inventiveness'], 'negative': ['flaw', 'weakness', 'questio

In [46]:
data['sentiment_pred']=1

In [48]:
for i in range(len(data['review_clean'])):
    counter_sent = collections.defaultdict(list)
    word = nltk.RegexpTokenizer(r'\w+').tokenize(data['review_clean'][i])
    if len(word) != 0:
        for j in word:
            for cat in sent_dic:
                if j in sent_dic[cat]:
                    counter_sent[cat].append(j)
                      
    elif word == 'NA':
        counter_sent["positive"] = 1
    if counter_sent == {}:
        counter_sent["positive"] = 1
    print(counter_sent)
    data['sentiment_pred'][i] = max(counter_sent, key = counter_sent.get)


defaultdict(<class 'list'>, {'positive': 1})
defaultdict(<class 'list'>, {'neutral': ['stylish']})
defaultdict(<class 'list'>, {'neutral': ['decor']})
defaultdict(<class 'list'>, {'positive': ['what', 'really']})
defaultdict(<class 'list'>, {'positive': ['good']})
defaultdict(<class 'list'>, {'positive': 1})
defaultdict(<class 'list'>, {'positive': ['really']})
defaultdict(<class 'list'>, {'positive': ['kind']})
defaultdict(<class 'list'>, {'positive': 1})
defaultdict(<class 'list'>, {'positive': 1})
defaultdict(<class 'list'>, {'positive': ['good']})
defaultdict(<class 'list'>, {'negative': ['question'], 'positive': ['good']})
defaultdict(<class 'list'>, {'positive': 1})
defaultdict(<class 'list'>, {'positive': ['really']})
defaultdict(<class 'list'>, {'positive': ['really', 'fact'], 'negative': ['fact']})
defaultdict(<class 'list'>, {'positive': ['good'], 'negative': ['doubt']})
defaultdict(<class 'list'>, {'positive': ['good']})
defaultdict(<class 'list'>, {'positive': ['bad']})
def

In [52]:
data.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,Review,Category,Sentiment,review_clean,category_pred,sentiment_pred
0,11391287,208136400.0,31/10/17,68485599,Ali,Fantastic location and fantastic hosts for a v...,location,positive,fantastic location and fantastic host for a ve...,cleanliness,positive
1,50319142,5.74451e+17,3/3/22,81633638,Christophe,Super happy with this stylish and comfy apartm...,communicate,positive,super happy with this stylish and comfy apartm...,value,neutral
2,272282,52814650.0,2/11/15,8528459,Rebecca,Mar made us feel very welcomed with great dire...,communicate,positive,mar made u feel very welcomed with great direc...,cleanliness,neutral
3,4683548,4.75841e+17,18/10/21,190705793,Ron,Andres is an amazing host! <br/>Not only the a...,accurate,positive,andres is an amazing host brnot only the apart...,communicate,positive
4,7838608,192308800.0,10/9/17,12034187,Giuseppe,A lovely room which was spacious and clean. Al...,cleanliness,positive,a lovely room which wa spacious and clean also...,location,positive


In [53]:
with open('unsupervised_bert_cat_preds.txt', 'w') as f:
    for item in data['category_pred']:
        f.write("%s\n" % item)

In [54]:
with open('unsupervised_bert_sent_preds.txt', 'w') as f:
    for item in data['sentiment_pred']:
        f.write("%s\n" % item)

In [58]:
import pickle

f = open('/content/drive/MyDrive/SMU_MITB_NLP/project/unsupervised_bert_cat.model', 'wb')
pickle.dump(nlp, f)
f.close()

  3%|▎         | 11403/384457 [1:22:59<45:15:18,  2.29it/s]


INFO:tensorflow:Assets written to: ram://fee7e030-8cd1-45a2-9739-68fcf50eb1c4/assets


INFO:tensorflow:Assets written to: ram://fee7e030-8cd1-45a2-9739-68fcf50eb1c4/assets


In [59]:
f = open('/content/drive/MyDrive/SMU_MITB_NLP/project/unsupervised_bert_cat.model', 'rb')
model = pickle.load(f)



