In [1]:
import nltk
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from  sklearn.decomposition import NMF
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patheffects as PathEffects
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
with open('data/filtered/tripadvisor_filtered_reviews_lemm.pkl' , 'rb') as picklefile: 
    df = pickle.load(picklefile)

In [3]:
df.head(3)

Unnamed: 0,page_url,url,eng_rev_num,snowforecast_url,resort_name,region,country,state_region,top,bottom,vertical_drop,acres_of ski,pistes,halfpipes,terrain_parks,rev_full,rev_id
0,https://www.tripadvisor.com/Attraction_Review-...,https://www.tripadvisor.com/Attraction_Review-...,58,https://www.snow-forecast.com/resorts/Bressanone,Plose Brixen,Europe,Italy,,2499,1067,1432,,35.0,0,0,"Hiking, Food, Fun Love the hiking trails to th...",0
1,https://www.tripadvisor.com/Attraction_Review-...,https://www.tripadvisor.com/Attraction_Review-...,58,https://www.snow-forecast.com/resorts/Bressanone,Plose Brixen,Europe,Italy,,2499,1067,1432,,35.0,0,0,Fantastic A perfect place for enjoying the fan...,1
2,https://www.tripadvisor.com/Attraction_Review-...,https://www.tripadvisor.com/Attraction_Review-...,58,https://www.snow-forecast.com/resorts/Bressanone,Plose Brixen,Europe,Italy,,2499,1067,1432,,35.0,0,0,Very nice view of the Dolomites! Great place t...,2


In [4]:
# check for duplicates
df['dups']=df.duplicated(subset=['resort_name','rev_full'],keep='first')

df.groupby('dups')['dups'].count()

dups
False    83541
True      6247
Name: dups, dtype: int64

In [5]:
# drop duplicates
df = df[df.dups==False].copy()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83541 entries, 0 to 89787
Data columns (total 18 columns):
page_url            83541 non-null object
url                 83541 non-null object
eng_rev_num         83541 non-null int64
snowforecast_url    83541 non-null object
resort_name         83541 non-null object
region              83541 non-null object
country             83541 non-null object
state_region        59941 non-null object
top                 83541 non-null int64
bottom              83541 non-null int64
vertical_drop       83541 non-null int64
acres_of ski        69388 non-null object
pistes              78806 non-null float64
halfpipes           83541 non-null int64
terrain_parks       83541 non-null int64
rev_full            83539 non-null object
rev_id              83541 non-null int64
dups                83541 non-null bool
dtypes: bool(1), float64(1), int64(7), object(9)
memory usage: 11.6+ MB


# Get sentences

In [7]:
# break reviews into lists of sentences
df['sentence'] = [sent_tokenize(str(rev)) for rev in df['rev_full']]

In [8]:
# assign an ID to each reaview that can be used as a Key in the future
df['rev_id'] = [i for i in range(len(df))]

In [9]:
# create a daataframe with a sentence per row


rev_list = df['sentence'].tolist()
rev_id = df['rev_id'].tolist()

# create a list of all sentences and all review IDs of the same length
sentence_list = []
review_id = []
for i, rec in enumerate(rev_list):
    for sent in rec:
        sentence_list.append(sent)
        review_id.append(rev_id[i])

In [10]:
sent_dict = {}
sent_dict['rev_id'] = review_id
sent_dict['sentence'] = sentence_list

df_sent = pd.DataFrame.from_dict(sent_dict)

In [11]:
df_sent.head(10)

Unnamed: 0,rev_id,sentence
0,0,"Hiking, Food, Fun Love the hiking trails to th..."
1,0,Amazing panoramic views are perfect for photos...
2,0,I fell in love with the cows!!
3,0,Particularly one that followed us up the hill.
4,0,One of my favorite moments was taking selfies ...
5,0,"The food is tasty , service was a little slow,..."
6,0,It was a bit chilly esp as you climb.visited i...
7,0,Bring a layer to break the wind.
8,0,And the Gondola ride up was quick and easy!!!
9,0,By the way this is a great area for family's w...


In [12]:
# create a lemmatizer function that uses part-of-speech tag from NLTK output

def lemmatizer(text):
    tokens = word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lem_words = []
    for tagged_word in tagged:
        word = tagged_word[0].lower()
        if tagged_word[1].startswith('VB'):
            word_tag = 'v'
        elif tagged_word[1].startswith('JJ'):
            word_tag = 'a'
        elif tagged_word[1].startswith('RB'):
            word_tag = 'r'
        else:
            word_tag = 'n'
                 
        lem_words.append(lemmatizer.lemmatize(word,pos=word_tag))
    return(lem_words)

In [13]:
# lemmatize words in sentences
df_sent['lem_sent_token'] = [lemmatizer(str(rev)) for rev in df_sent['sentence']]

In [14]:
# turn lemmatized tokens back into sting form
df_sent['lem_sent_text'] = [" ".join(lems) for lems in df_sent['lem_sent_token']]

In [15]:
df_sent.head()

Unnamed: 0,rev_id,sentence,lem_sent_token,lem_sent_text
0,0,"Hiking, Food, Fun Love the hiking trails to th...","[hiking, ,, food, ,, fun, love, the, hiking, t...","hiking , food , fun love the hiking trail to t..."
1,0,Amazing panoramic views are perfect for photos...,"[amaze, panoramic, view, be, perfect, for, pho...",amaze panoramic view be perfect for photo to c...
2,0,I fell in love with the cows!!,"[i, fell, in, love, with, the, cow, !, !]",i fell in love with the cow ! !
3,0,Particularly one that followed us up the hill.,"[particularly, one, that, follow, u, up, the, ...",particularly one that follow u up the hill .
4,0,One of my favorite moments was taking selfies ...,"[one, of, my, favorite, moment, be, take, self...",one of my favorite moment be take selfies with...


In [16]:
# pickle lemmatized sentence file
# with open('data/filtered/tripadvisor_filtered_sent_lemm.pkl', 'wb') as picklefile:
#     pickle.dump(df_sent, picklefile)