In [1]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

FOLDERNAME = 'NLP_project/dataset'
assert FOLDERNAME is not None, "[!] Enter the foldername."

#Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd /content/drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
[Errno 2] No such file or directory: '/content/drive/My Drive/NLP_project/dataset/'
/content


In [2]:
import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla P100-PCIE-16GB, n_gpu: 1


In [3]:
!pip install transformers -q
!pip install pandarallel -q
!pip install sentencepiece -q
!pip install neptune-client -q


[K     |████████████████████████████████| 3.4 MB 9.2 MB/s 
[K     |████████████████████████████████| 895 kB 37.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 36.7 MB/s 
[K     |████████████████████████████████| 596 kB 53.7 MB/s 
[K     |████████████████████████████████| 61 kB 531 kB/s 
[?25h  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2 MB 6.9 MB/s 
[K     |████████████████████████████████| 287 kB 8.3 MB/s 
[K     |████████████████████████████████| 829 kB 37.5 MB/s 
[K     |████████████████████████████████| 53 kB 1.9 MB/s 
[K     |████████████████████████████████| 180 kB 58.1 MB/s 
[K     |████████████████████████████████| 131 kB 56.7 MB/s 
[K     |████████████████████████████████| 79 kB 7.6 MB/s 
[K     |████████████████████████████████| 8.4 MB 42.9 MB/s 
[K     |████████████████████████████████| 138 kB 44.7 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████

In [16]:
import numpy as np
import pandas as pd
import regex as re
import random
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,average_precision_score, precision_score,precision_recall_curve
from tqdm.notebook import tqdm
from tqdm import trange
import warnings
warnings.filterwarnings('ignore')
import pickle
import nltk
import math
import os
import json
import random
import re
import torch
import torch.nn as nn
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel



In [None]:
reviews_path = '/content/drive/MyDrive/restaurant_reviews/res_reviews_final.json'
import json
f = open(reviews_path)
data = json.load(f)

In [11]:
len(data.keys())

50762

In [12]:
keys_ = list(data.keys())
res_reviews = {}
for i in range(80):
  key_ = keys_[i]
  for a in data[key_]['review_list']:
    if(a['rating']!=None):
      res_reviews[a['text']] = a['rating']

In [13]:
res_reviews

{"Elephant's contacted me the same day I posted my original review to follow up on my experience.  Wow!  They left a very nice message verifying that the PDX location does participate in ALL of their punch cards, including coffee and cookies.  Also, they said that they're reviewing this with all of their staff, so something like this doesn't happen in the future.  Thanks Elephant's!": 4.0,
 "I'm not usually a fan of airport food. I usually like to arrive and get to where I need to go. However, I had no choice this past weekend on our first trip to Portland but to stay at the airport for four hours and wait for my boyfriend's flight to arrive. I was hungry and this place seemed popular compared to the other surrounding food areas. I got their weekly special french dip sandwich, not really expecting much except to fuel my stomach but was surprised how good it was! It had the meat with au ju, mayo, chipotle, and blue cheese. Wish I had time to get a sandwich before we headed home.": 5.0,


In [None]:
reviews_df = pd.read_csv('/content/drive/MyDrive/NLP_project/dataset/yelp_preprocessed_1M.csv')

In [None]:
print(reviews_df.columns)

Index(['Unnamed: 0', 'date', 'useful', 'text', 'cool', 'funny', 'review_id',
       'user_id', 'stars', 'business_id', 'sentiment'],
      dtype='object')


In [None]:
reviews_df = reviews_df[0:100]

In [None]:
model_path = '/content/drive/MyDrive/NLP_project/arg_miningrobertalarge_SEED_4_dense_layer_epoc_3_lr_1e-05_b_s_16_accumulation_steps_2_input_type_kp_arg_topic_preee.pt'

In [None]:
class NonPoolerTransformer(torch.nn.Module):

    def __init__(self):
        super(NonPoolerTransformer, self).__init__()
        
        #Instantiating Pre trained model object 
        self.model_layer = AutoModel.from_pretrained(model_path)

        #Layers
        # the first dense layer will have 768 if base model is used and 
        # 1024 if large model is used

        self.dense_layer_1 = nn.Linear(1024, 256)
        self.dropout = nn.Dropout(0.4)
        self.dense_layer_2 = nn.Linear(256, 128)
        self.dropout_2 = nn.Dropout(0.2)
        self.cls_layer = nn.Linear(128, 1, bias = True)
        self.sigmoid = nn.Sigmoid()

    def forward(self,input_ids, attention_masks):

        hidden_state = self.model_layer(input_ids=input_ids, attention_mask=attention_masks)[0]
        pooled_output = hidden_state[:, 0]

        x = self.dense_layer_1(pooled_output)
        x = self.dropout(x)
        x_1 = self.dense_layer_2(x)
        x_2 = self.dropout_2(x_1)

        logits = self.cls_layer(x_2)
        output = self.sigmoid(logits)

        return output

In [None]:
model = torch.load(model_path)
tokenizer = AutoTokenizer.from_pretrained('roberta-large')


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [25]:
#make reviews and key_points dataset
key_point_df =df = pd.read_csv('/content/drive/MyDrive/NLP_project/dataset/res_reviews.csv')


positive_kp_df = key_point_df['Positive']
# print(positive_kp_df.head())
negative_kp_df = key_point_df['Negative']
# print(negative_kp_df.head())

reviews_text_df = reviews_df['text']

reviews_list = reviews_text_df.values.tolist()
kp_list = positive_kp_df.values.tolist()
kp_list.extend(negative_kp_df.values.tolist())

print(reviews_list)
print(kp_list)

["Elephant's contacted me the same day I posted my original review to follow up on my experience.  Wow!  They left a very nice message verifying that the PDX location does participate in ALL of their punch cards, including coffee and cookies.  Also, they said that they're reviewing this with all of their staff, so something like this doesn't happen in the future.  Thanks Elephant's!", "I'm not usually a fan of airport food. I usually like to arrive and get to where I need to go. However, I had no choice this past weekend on our first trip to Portland but to stay at the airport for four hours and wait for my boyfriend's flight to arrive. I was hungry and this place seemed popular compared to the other surrounding food areas. I got their weekly special french dip sandwich, not really expecting much except to fuel my stomach but was surprised how good it was! It had the meat with au ju, mayo, chipotle, and blue cheese. Wish I had time to get a sandwich before we headed home.", 'If one mus

In [28]:
final_list = []

for review in reviews_list:
  for kp in kp_list:
    final_list.append([review,kp])

dataset = pd.DataFrame(final_list,columns = ['review','key_point'])

In [29]:
final_list = []

for review in reviews_list:
  for kp in kp_list:
    final_list.append([review,kp])

dataset = pd.DataFrame(final_list,columns = ['review','key_point'])

In [30]:
print(dataset.head())

                                              review                                      key_point
0  Elephant's contacted me the same day I posted ...  It is preferable to spend the money elsewhere
1  Elephant's contacted me the same day I posted ...  It is preferable to spend the money elsewhere
2  Elephant's contacted me the same day I posted ...                       The food is good/healthy
3  Elephant's contacted me the same day I posted ...                          The food is delicious
4  Elephant's contacted me the same day I posted ...       Absolutely Pathetic! Never again Budget!


In [31]:
from torch.utils.data import (DataLoader, RandomSampler, WeightedRandomSampler, SequentialSampler, TensorDataset)

In [32]:
def matching_score(dataframe):
  all_attention_masks=[]
  all_input_ids=[]
  match_scores = []

  #print(dataframe.shape[0])
  for i in range(dataframe.shape[0]):
    encoded_input = tokenizer(dataframe.iloc[i]['all_text'], padding='max_length')
    #print(encoded_input['attention_mask'])
    # print(len(encoded_input['attention_mask']))
    all_attention_masks.append(encoded_input['attention_mask'][0:512])
    all_input_ids.append(encoded_input['input_ids'][0:512])

  all_attention_masks = torch.tensor(all_attention_masks).squeeze()
  all_input_ids = torch.tensor(all_input_ids).squeeze()


  dataset = TensorDataset(all_input_ids, all_attention_masks)


  test_dataloader = DataLoader(dataset, batch_size=1)
      
  with torch.no_grad():
      acc_epoch = []

      epoch_iterator = tqdm(test_dataloader, desc="Iteration")
      for step, batch in enumerate(epoch_iterator):
          model.eval()
              

          b_input_ids, b_input_mask = batch[0].to(device), batch[1].to(device)
          ypred = model(b_input_ids, b_input_mask)
          # print(ypred)
          match_scores.append(ypred[0][0].item())
  return match_scores


In [33]:
sentence_df = pd.DataFrame([], columns =['Review', 'Sentence', 'Key_point'])

In [35]:
for index, row in dataset.iterrows():
  if row.review!= None:
    splitted = row.review.split('.')
    for sentence in splitted:
      print(sentence)
      new_row = [row.review, sentence, row.key_point]
      sentence_df.loc[len(sentence_df.index)] = new_row
  else:
      continue

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  I don't seek out vegan food, but will try it when available and I'm always surprised at how good it can be
  Today I had "Shelly's Wrap" which is an amazing combination of broccoli, cabbage, kale, snow peas, apples, grapes, and hazelnuts swaddled in a spinach wrap
  Absolutely incredible! But the best part is it's only $6
75 and I was completely satisfied:  wowed my taste buds and filled my tummy
  What more could I ask for?
I'm so glad to see Elephants at PDX! I was looking for something healthy and flavorful -- practically impossible in an airport which is why I usually pack my own food
  I don't seek out vegan food, but will try it when available and I'm always surprised at how good it can be
  Today I had "Shelly's Wrap" which is an amazing combination of broccoli, cabbage, kale, snow peas, apples, grapes, and hazelnuts swaddled in a spinach wrap
  Absolutely incredible! But the best part is it's only $6
75 and I wa

In [36]:
sentence_df.head()

Unnamed: 0,Review,Sentence,Key_point
0,Elephant's contacted me the same day I posted ...,Elephant's contacted me the same day I posted ...,It is preferable to spend the money elsewhere
1,Elephant's contacted me the same day I posted ...,Wow! They left a very nice message verifyin...,It is preferable to spend the money elsewhere
2,Elephant's contacted me the same day I posted ...,"Also, they said that they're reviewing this ...",It is preferable to spend the money elsewhere
3,Elephant's contacted me the same day I posted ...,Thanks Elephant's!,It is preferable to spend the money elsewhere
4,Elephant's contacted me the same day I posted ...,Elephant's contacted me the same day I posted ...,It is preferable to spend the money elsewhere


In [37]:
sentence_df['all_text'] = sentence_df['Sentence']+sentence_df['Key_point']

In [38]:
sentence_df.head()

Unnamed: 0,Review,Sentence,Key_point,all_text
0,Elephant's contacted me the same day I posted ...,Elephant's contacted me the same day I posted ...,It is preferable to spend the money elsewhere,Elephant's contacted me the same day I posted ...
1,Elephant's contacted me the same day I posted ...,Wow! They left a very nice message verifyin...,It is preferable to spend the money elsewhere,Wow! They left a very nice message verifyin...
2,Elephant's contacted me the same day I posted ...,"Also, they said that they're reviewing this ...",It is preferable to spend the money elsewhere,"Also, they said that they're reviewing this ..."
3,Elephant's contacted me the same day I posted ...,Thanks Elephant's!,It is preferable to spend the money elsewhere,Thanks Elephant's!It is preferable to spend ...
4,Elephant's contacted me the same day I posted ...,Elephant's contacted me the same day I posted ...,It is preferable to spend the money elsewhere,Elephant's contacted me the same day I posted ...


In [41]:
sentence_df.to_csv('sentence_splitted_review_kp.csv')

In [42]:
print(sentence_df.shape)

(28614, 4)


In [43]:
sentence_df['score'] = matching_score(sentence_df) 

In [43]:
sentence_df.to_csv('sentence_df_scores.csv')

In [69]:
sentence_df.head()

Unnamed: 0,Review,Sentence,Key_point,all_text,score
0,Elephant's contacted me the same day I posted ...,Elephant's contacted me the same day I posted ...,It is preferable to spend the money elsewhere,Elephant's contacted me the same day I posted ...,0.096677
1,Elephant's contacted me the same day I posted ...,Wow! They left a very nice message verifyin...,It is preferable to spend the money elsewhere,Wow! They left a very nice message verifyin...,0.04518
2,Elephant's contacted me the same day I posted ...,"Also, they said that they're reviewing this ...",It is preferable to spend the money elsewhere,"Also, they said that they're reviewing this ...",0.04723
3,Elephant's contacted me the same day I posted ...,Thanks Elephant's!,It is preferable to spend the money elsewhere,Thanks Elephant's!It is preferable to spend ...,0.093105
4,Elephant's contacted me the same day I posted ...,Elephant's contacted me the same day I posted ...,It is preferable to spend the money elsewhere,Elephant's contacted me the same day I posted ...,0.047501


In [69]:
matched_df = pd.DataFrame([], columns =['Review', 'Key_point', 'Score'])
new_df = sentence_df

In [69]:
pos_matched_df = pd.DataFrame([], columns =['Review', 'Key_point', 'Score'])
neg_matched_df = pd.DataFrame([], columns =['Review', 'Key_point', 'Score'])
no_matched_df = pd.DataFrame([], columns =['Review', 'Key_point', 'Score'])

i=0
while i<new_df.shape[0]:
  flag = False
  for j in range(60):
    rowSeries = new_df.iloc[i+j]
    if rowSeries['Score']>0.95:
      flag = True
      new_row = [rowSeries.Review, rowSeries.Key_point, rowSeries.Score]
      pos_matched_df.loc[len(pos_matched_df.index)] = new_row
  for j in range(60,120):
    rowSeries = new_df.iloc[i+j]
    if rowSeries['Score']>0.95:
      flag = True
      new_row = [rowSeries.Review, rowSeries.Key_point, rowSeries.Score]
      neg_matched_df.loc[len(neg_matched_df.index)] = new_row
  if flag == False:
    row = new_df.iloc[i]
    new_row = [row.Review, row.Key_point, 0]
    no_matched_df.loc[len(no_matched_df.index)] = new_row
  i = i+120

In [70]:
# pos_matched_df.head()

In [71]:
# neg_matched_df.head()

In [72]:
# no_matched_df.head()
# no_matched_df['Review'][0]

In [86]:
pos_kp_coverage = ((pos_matched_df.groupby('Key_point').size()/reviews_df.shape[0])*100)
pos_kp_coverage_top10 = pos_kp_coverage.nlargest(10)
print(pos_kp_coverage_top10)

Key_point
    We were quite impressed. Food was served hot and delicious  18.606353
                       This is one of my favourite restaurants  16.307174
                 It has a fun atmosphere, great for big groups  14.508899
                           A good place to stop for quick food  13.749453
                      Lots of seating! Fast, friendly service!  12.856858
                                                 Check it out!  10.764425
 Has amazing food, and is affordable, will definitely go again  10.437538
                        The dishes have deep and spicy flavors  10.123426
         I've never had a lobster roll quite like this before.   9.407286
                             Staff is attentive. Nice service.   8.508594
dtype: float64


In [87]:
neg_kp_coverage = ((neg_matched_df.groupby('Key_point').size()/reviews_df.shape[0])*100)
neg_kp_coverage_top10 = neg_kp_coverage.nlargest(10)
print(neg_kp_coverage_top10)

Key_point
                                                                     The service was slow as a snail  16.505491
                                           This is one of the worst restaurants I have ever been to.  16.359400
                                                               This was the worst meal I've ever had  14.747859
                                                     Smelly environment, very bad ventilation system  12.709188
                                                              The atmosphere is so loud and anxious!  11.675271
 I've eaten at this location and others at the past and this is the worst experience I've had so far  10.509443
                                                                             the food was just bland   9.735558
                                                       Food is nothing special but definitely pricey   8.508459
                                                               The menu though is quite limite

In [None]:
neg_kp_coverage.to_csv("neg_kp_coverage.csv")

In [None]:
pos_kp_coverage.to_csv("pos_kp_coverage.csv")