In [25]:
import ast
from collections import Counter

import pandas as pd

In [26]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.tag import pos_tag
from nltk.corpus import stopwords, words
# Fetch every NLTK resource this notebook uses; packages that are already
# present are reported as up to date.
# 'stopwords' was missing from the original list: stopwords.words('english')
# is called further down and raises LookupError when the corpus is absent.
for resource in ('punkt', 'averaged_perceptron_tagger', 'tagsets', 'words', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/chinwen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chinwen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /Users/chinwen/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package words to /Users/chinwen/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [27]:
# Listings table; the 'id', 'summary' and 'review_scores_rating' columns are used below.
listingsAll = pd.read_csv("listingsAll.csv")
# Reviews table; the 'comments' and 'listing_id' columns are used below.
reviewsAll = pd.read_csv("reviewsAll.csv")
# Pre-cleaned listings table; receives the rating category and supplies 'amenities'.
cleaned = pd.read_csv("cleaned_data.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Preprocessing

In [24]:
def score_cat(x):
    """Bucket a numeric review score into a letter category.

    Returns 'A' for scores of 90 and above, 'B' for scores from 75 up to
    (but not including) 90, and 'C' for everything else.

    Note: a NaN score fails both comparisons and therefore also lands in 'C'.
    """
    if x >= 90:
        return 'A'
    # Reaching this point already implies the score is not >= 90.
    if x >= 75:
        return 'B'
    return 'C'

### 1. add rating category to the cleaned file

In [43]:
# Slim copy of the listings with just the columns needed for the text analysis.
summary_categ = listingsAll.loc[:, ['id', 'summary', 'review_scores_rating']].copy()
# Letter category per listing; score_cat puts listings with a missing rating in 'C'.
summary_categ['rating_categ'] = summary_categ['review_scores_rating'].map(score_cat)

In [44]:
# New frame = cleaned data + the rating category column.
# NOTE(review): the column is matched to rows by index label, not by listing id --
# confirm that `cleaned` and `listingsAll` share the same row index.
cleanedPlus = cleaned.assign(rating_categ=summary_categ['rating_categ'])

In [252]:
# Persist the cleaned listings together with their rating category.
# The row index is written too (to_csv default), as an unnamed first column.
cleanedPlus.to_csv("CleanedPlus.csv")

In [45]:
# Lookup table: listing id -> rating category ('A' / 'B' / 'C').
# Built directly from the two columns instead of a row-wise DataFrame.apply whose
# only purpose was the side effect of updating the dict. If an id occurs more
# than once, the last row wins (same as before).
rating_categ = dict(zip(summary_categ['id'], summary_categ['rating_categ']))

### 2. adjective words in summary for visualization

In [113]:
%%time
# rating score >= 90 (i.e. A)
# Missing summaries are dropped first: str(NaN) is the text 'nan', which would
# otherwise be tokenized and tagged like a real word.
high_summaries = summary_categ.loc[summary_categ['rating_categ'] == 'A', 'summary'].dropna()
high_words_list = [word_tokenize(str(e)) for e in high_summaries]
# Flatten to one token stream and tag it in a single call.
high_words = [w for e in high_words_list for w in e]
high_pos = nltk.pos_tag(high_words)

CPU times: user 1min 39s, sys: 519 ms, total: 1min 39s
Wall time: 1min 39s


In [114]:
# Lower-cased tokens whose POS tag starts with 'JJ' (the adjective tags).
high_adj = [token.lower() for token, tag in high_pos if tag.startswith('JJ')]

# (word, count) pairs, most frequent first; ties keep first-seen order.
high = Counter(high_adj).most_common()

In [115]:
%%time
# 90 > rating score >= 75 (i.e. B)
# Missing summaries are dropped first: str(NaN) is the text 'nan', which would
# otherwise be tokenized and tagged like a real word.
mid_summaries = summary_categ.loc[summary_categ['rating_categ'] == 'B', 'summary'].dropna()
mid_words_list = [word_tokenize(str(e)) for e in mid_summaries]
# Flatten to one token stream and tag it in a single call.
mid_words = [w for e in mid_words_list for w in e]
mid_pos = nltk.pos_tag(mid_words)

CPU times: user 18.1 s, sys: 68.3 ms, total: 18.2 s
Wall time: 18.2 s


In [116]:
# Lower-cased tokens whose POS tag starts with 'JJ' (the adjective tags).
mid_adj = [token.lower() for token, tag in mid_pos if tag.startswith('JJ')]

# (word, count) pairs, most frequent first; ties keep first-seen order.
mid = Counter(mid_adj).most_common()

In [117]:
%%time
# rating score < 75 (i.e. C)
# Missing summaries are dropped first: str(NaN) is the text 'nan', which would
# otherwise be tokenized and tagged like a real word.
low_summaries = summary_categ.loc[summary_categ['rating_categ'] == 'C', 'summary'].dropna()
low_words_list = [word_tokenize(str(e)) for e in low_summaries]
# Flatten to one token stream and tag it in a single call.
low_words = [w for e in low_words_list for w in e]
low_pos = nltk.pos_tag(low_words)

CPU times: user 37.1 s, sys: 73.8 ms, total: 37.2 s
Wall time: 37.3 s


In [118]:
# Lower-cased tokens whose POS tag starts with 'JJ' (the adjective tags).
low_adj = [token.lower() for token, tag in low_pos if tag.startswith('JJ')]

# (word, count) pairs, most frequent first; ties keep first-seen order.
low = Counter(low_adj).most_common()

In [119]:
# Export the summary adjectives as a '#'-delimited table, one row per
# (word, frequency, category).
# The context manager closes the file even if a write fails, and UTF-8 is set
# explicitly so non-ASCII tokens do not depend on the platform default encoding.
with open('adj_summary.csv', 'w', encoding='utf-8') as f:
    f.write("word#freq#category\n")
    for category, ranked in (('A', high), ('B', mid), ('C', low)):
        for word, freq in ranked:
            f.write("#".join((str(word), str(freq), category)) + "\n")

### 3. adjective words in review for visualization

In [68]:
# Work on a copy so the columns added below do not modify reviewsAll.
r = reviewsAll.copy()

In [89]:
# Replace missing comments with '' before the cast: astype(str) alone turns NaN
# into the literal text 'nan', which is later tokenized and counted as a word.
r['comments'] = r['comments'].fillna('').astype(str)

In [69]:
%%time
# Tokenize every comment; the string cast keeps non-string cells from breaking the tokenizer.
r['words'] = r['comments'].astype(str).map(word_tokenize)

CPU times: user 9min 3s, sys: 2.97 s, total: 9min 6s
Wall time: 9min 7s


In [70]:
# NLTK's English stop-word list, held in a set for fast membership tests in removeStopwords.
stopWords = set(stopwords.words('english'))

In [71]:
def removeStopwords(row):
    """Return the tokens of ``row`` that are not stop words.

    Parameters
    ----------
    row : iterable of str
        Tokens of one review.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.

    The comparison is case-insensitive. NLTK's English stop-word list is
    lower-case, so a case-sensitive test let capitalised stop words such as
    "The" or "I" slip through.
    """
    return [w for w in row if w.lower() not in stopWords]

In [72]:
# Strip stop words from every tokenized review (the column is overwritten).
r['words'] = r['words'].map(removeStopwords)

In [120]:
%%time
# Tag all reviews through one pos_tag_sents call so the tagger is set up once,
# rather than calling pos_tag separately for every row. Each review is still
# tagged as its own sentence, so the per-review result has the same shape.
r['pos_tagger'] = pd.Series(nltk.pos_tag_sents(r['words'].tolist()), index=r.index)

CPU times: user 31min 21s, sys: 10.1 s, total: 31min 31s
Wall time: 31min 31s


In [121]:
def find_adj(ps):
    """Return the lower-cased tokens of ``ps`` whose tag starts with 'JJ'.

    ``ps`` is a sequence of (token, tag) pairs; input order is preserved.
    """
    return [token.lower() for token, tag in ps if tag.startswith('JJ')]

In [124]:
def find_noun(ps):
    """Return the lower-cased tokens of ``ps`` whose tag starts with 'NN'.

    ``ps`` is a sequence of (token, tag) pairs; input order is preserved.
    """
    return [token.lower() for token, tag in ps if tag.startswith('NN')]

In [122]:
# For each word class: the extracted word list, then its length.
for words_col, count_col, extractor in (
    ('adj', 'number_adj', find_adj),
    ('noun', 'number_noun', find_noun),
):
    r[words_col] = r['pos_tagger'].map(extractor)
    r[count_col] = r[words_col].map(len)
# Token count per review, taken from the stop-word-filtered 'words' column.
r['number_words'] = r['words'].map(len)

CPU times: user 12.5 s, sys: 114 ms, total: 12.6 s
Wall time: 12.6 s


In [250]:
# Cache the enriched reviews on disk. The row index is written as well and comes
# back as the 'Unnamed: 0' column on reload; list-valued cells are stored as text.
r.to_csv('reviewsAllPlus.csv')

In [28]:
# Reload the cached reviews. CSV stores list-valued cells as text, so 'adj' is
# parsed back into a real list here; otherwise iterating over it later walks the
# string one character at a time. The other list-like columns are not consumed
# as lists in the cells shown here, so they are left as text.
r = pd.read_csv("reviewsAllPlus.csv", converters={'adj': ast.literal_eval})

In [29]:
# Compact version without the bulky text columns and the saved index column.
r_short = r.drop(columns=['words', 'pos_tagger', 'adj', 'noun', 'Unnamed: 0'])

In [32]:
# Write the compact reviews table without the index; missing values are written as 'NA'.
r_short.to_csv('reviewsPlus.csv', index=False, na_rep = 'NA')

In [160]:
# Nested counter filled by add_to_adj_dict:
# rating category -> {adjective: number of occurrences in reviews of that category}
adj_r_d = {}

In [161]:
def add_to_adj_dict(adj, listing_id):
    """Add the adjectives of one review to the per-category counts in ``adj_r_d``.

    The listing's category is looked up in ``rating_categ`` (KeyError if the
    listing id is unknown). The category's bucket is created on first use, even
    when ``adj`` is empty. Words are lower-cased before counting.
    """
    category_counts = adj_r_d.setdefault(rating_categ[listing_id], {})
    for raw_word in adj:
        key = raw_word.lower()
        category_counts[key] = category_counts.get(key, 0) + 1

In [162]:
# Start from empty counts so that re-running this cell does not add every review twice.
adj_r_d.clear()
# Plain loop instead of DataFrame.apply: the function is called only for its side
# effect, and apply does not guarantee exactly one call per row in every pandas
# version (older releases evaluate the first row twice, inflating its counts).
for adj_words, listing_id in zip(r['adj'], r['listing_id']):
    add_to_adj_dict(adj_words, listing_id)

In [224]:
%%time
#remove non-English words
# Build the vocabulary set once; the original rebuilt it for every word it checked.
english_vocab = set(words.words())
for k in adj_r_d:
    # Only the first 300 keys of each category (in insertion order) are checked.
    # NOTE(review): this is not the 300 most frequent words -- confirm the intent.
    for w in list(adj_r_d[k].keys())[:300]:
        if w not in english_vocab:
            adj_r_d[k].pop(w, None)

CPU times: user 1min 54s, sys: 12.3 s, total: 2min 6s
Wall time: 2min 6s


In [230]:
# Hand-picked tokens to drop as well (they look like non-English function words
# plus a stray apostrophe that the vocabulary filter did not catch).
words_to_remove = ['un', 'nous', 'es', 'está', 'u', 'las', 'se', 'lo', 'ne', 'el', 'tal', 'den',
                  'sehr', 'en', 'la', 'een', "'", 'que']
for category_counts in adj_r_d.values():
    # Snapshot of the first 300 keys, so deleting while looping is safe.
    for candidate in list(category_counts)[:300]:
        if candidate in words_to_remove:
            del category_counts[candidate]

In [237]:
# Category A review adjectives as (word, count) pairs, most frequent first.
hr = Counter(adj_r_d['A']).most_common()

In [238]:
# Category B review adjectives as (word, count) pairs, most frequent first.
mr = Counter(adj_r_d['B']).most_common()

In [239]:
# Category C review adjectives as (word, count) pairs, most frequent first.
lr = Counter(adj_r_d['C']).most_common()

In [240]:
# Export the review adjectives as a '#'-delimited table, one row per
# (word, frequency, category).
# The context manager closes the file even if a write fails, and UTF-8 is set
# explicitly so non-ASCII tokens do not depend on the platform default encoding.
with open('adj_review.csv', 'w', encoding='utf-8') as f:
    f.write("word#freq#category\n")
    for category, ranked in (('A', hr), ('B', mr), ('C', lr)):
        for word, freq in ranked:
            f.write("#".join((str(word), str(freq), category)) + "\n")

### 4. amenities word frequency

In [322]:
# Single-pass character clean-up: braces, double quotes and the curly apostrophe
# are removed, '/' becomes a space.
amenity_char_map = str.maketrans({'{': '', '}': '', '\"': '', '/': ' ', '’': ''})

df = cleaned[['amenities']].copy()
# One lower-cased list of amenity names per listing, split on commas.
df['cleaned_amentities'] = df['amenities'].apply(
    lambda raw: raw.translate(amenity_char_map).lower().split(','))
# Flatten across listings, then rank amenity names by how often they occur.
w = [amenity for listing_amenities in df['cleaned_amentities'] for amenity in listing_amenities]
c = Counter(w).most_common()

In [329]:
# Export the amenity frequencies as a '#'-delimited table, one row per (word, frequency).
# The context manager closes the file even if a write fails, and UTF-8 is set
# explicitly so the output does not depend on the platform default encoding.
with open('amenities.csv', 'w', encoding='utf-8') as f:
    f.write("word#freq\n")
    for amenity, freq in c:
        f.write(str(amenity) + "#" + str(freq) + "\n")