In [2]:
import pandas as pd
import datetime
import re
from re import sub
import warnings

from sklearn import preprocessing
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression

import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

warnings.filterwarnings("ignore")

import seaborn as sns
# import matplotlib as plt
import matplotlib.pyplot as plt

from yellowbrick.features import rank2d
from yellowbrick.classifier import confusion_matrix
from yellowbrick.classifier import classification_report
from yellowbrick.regressor import prediction_error, ResidualsPlot

# Raise pandas' display limits so long / wide frames are shown in full
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielacollaguazo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importing Data

In [3]:
# Listings table (the price column is taken from here)
listings_csv = '../data/new-york-city-airbnb-open-data/listings.csv'
df_listings = pd.read_csv(listings_csv)

# Review texts together with their polarity and language columns
reviews_csv = '../variable_exploration/dc/output/reviews_with_sentiment_and_lang.csv'
df_reviews_w_polarity = pd.read_csv(reviews_csv)

## Selecting reviews only in English

In [3]:
# Keep only the reviews whose language column is 'en'
is_english = df_reviews_w_polarity['review_lang'] == 'en'
df_rev_eng = df_reviews_w_polarity[is_english]

# slicing DF
# df_rev_pol = df_rev_eng.loc[:,['id','listing_id','polarity']]

In [4]:
df_rev_eng.review_lang.value_counts(dropna=False)

en    1009583
Name: review_lang, dtype: int64

In [5]:
df_rev_eng.shape

(1009583, 8)

## Formatting Price

In [6]:
# Give the listings key the same name as in the reviews table so the two
# frames can be merged on it
df_listings = df_listings.rename(columns={"id": "listing_id"})

# Convert the price text to float: drop every character that is not a digit or
# a decimal point, then cast. This is one vectorized pass over the column
# instead of a Python-level loop over iterrows(), and a missing price stays
# NaN rather than raising.
df_listings['price'] = (
    df_listings['price']
    .str.replace(r'[^\d.]', '', regex=True)
    .astype(float)
)

## Creating features target dataset

In [7]:
# From the listings only the merge key and the price are needed
listing_columns = ['listing_id', 'price']
df_listings = df_listings[listing_columns]

In [8]:
# From the reviews only the merge key and the review text are needed
review_columns = ['listing_id', 'comments']
df_rev_eng = df_rev_eng[review_columns]

In [9]:
# Attach each listing's price to its reviews, drop rows that contain any
# missing value, and index the result by listing_id
df_temp = (
    pd.merge(left=df_rev_eng, right=df_listings, on='listing_id')
    .dropna()
    .set_index('listing_id')
)

In [10]:
df_temp.head()

Unnamed: 0_level_0,comments,price
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2060,"very nice neighborhood,close enough to ""A"" tra...",100.0
2595,I've stayed with my friend at the Midtown Cast...,225.0
2595,"We've been staying here for about 9 nights, en...",225.0
2595,We had a wonderful stay at Jennifer's charming...,225.0
2595,Hi to everyone!\r\nWould say our greatest comp...,225.0


## Text pre-processing

### Cleaning

In [11]:
def text_cleaner(s):
    '''Normalize a review text before vectorization.

    Punctuation/symbols, underscores and digits are deleted (not replaced by
    a space, so "don't" becomes "dont"), every run of whitespace is collapsed
    to a single space, and the result is lower-cased and stripped.

    Parameters
    ----------
    s : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; empty if nothing but removed characters was present.
    '''
    # Delete non-word characters, underscores and digits in one pass. Raw
    # strings keep \w, \s and \d from being parsed as (invalid) Python string
    # escapes.
    stripped = re.sub(r'[^\w\s]|[_\d]', '', s)

    # Collapse whitespace only after the deletions above, because removing a
    # token such as a number leaves two adjacent spaces behind. `\s+` also
    # folds tabs and line breaks into a single space.
    stripped = re.sub(r'\s+', ' ', stripped)

    # Lower-case, then trim leading and trailing whitespace
    return stripped.lower().strip()

In [12]:
# Clean every review. Mapping over the single `comments` column gives the same
# values as a row-wise DataFrame.apply(axis=1) without building a Series
# object for each row.
df_temp['cleaned_comments'] = df_temp['comments'].map(text_cleaner)

In [35]:
# df_temp.to_csv('df_temp.csv')

### Tokenization 

In [14]:
# df_temp['tokenized_comments'] = df_temp.apply(lambda row: nltk.word_tokenize(row['cleaned_comments']), axis=1)

In [15]:
# df_temp.head()

### Removing Stopwords

In [16]:
# stop_words = set(nltk.corpus.stopwords.words('english'))

In [17]:
# df_temp['tokenized_comments_nostop'] = df_temp['tokenized_comments'].apply(lambda x:
#                                                                                   [item for item in x 
#                                                                                    if item not in stop_words])

In [18]:
# df_temp.head()

In [19]:
# df_temp['comments_clean']

In [4]:
# Reload the cleaned reviews from the on-disk cache so the slow cleaning step
# can be skipped after a kernel restart. If the cache file does not exist yet,
# create it from the frame built by the cells above and read it back, so both
# paths yield the same layout (listing_id as a regular column).
try:
    df_temp = pd.read_csv('df_temp.csv')
except FileNotFoundError:
    df_temp.to_csv('df_temp.csv')
    df_temp = pd.read_csv('df_temp.csv')

## Splitting training and testing

In [5]:
# Hold out 20% of the reviews for testing. The fixed seed makes the split, and
# everything derived from it, reproducible across kernel restarts.
Xr_train, Xr_test, yr_train, yr_test = tts(
    df_temp.cleaned_comments,
    df_temp.price,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Training texts and prices side by side. ignore_index=True discards the
# original column names: the text ends up in column 0 and the price in column 1.
train_parts = [Xr_train, yr_train]
training_data = pd.concat(train_parts, axis=1, ignore_index=True)

In [7]:
# Test texts and prices side by side. ignore_index=True discards the original
# column names: the text ends up in column 0 and the price in column 1.
test_parts = [Xr_test, yr_test]
test_data = pd.concat(test_parts, axis=1, ignore_index=True)

In [8]:
training_data.shape , test_data.shape

((807666, 2), (201917, 2))

In [9]:
test_data[0].head()

145214    i stayed at olivias for a pretty long time it ...
201031    great fairly central location thats super easy...
493775    is nice location marcs was very friendly excel...
557839    my weekend stay in ny was complemented with a ...
309164    the apartment is exactly as listed  very cute ...
Name: 0, dtype: object

## TfidfVectorizer

In [43]:
# TF-IDF over 4-grams only. n-grams found in more than 80% of the reviews
# (max_df) or in fewer than 10 reviews (min_df) are dropped. Lower-casing is
# switched off here; the cleaning step already lower-cases the text.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(4, 4),
    stop_words='english',
    lowercase=False,
    max_df=0.8,
    min_df=10,
    use_idf=True,
)

In [44]:
# Learn the vocabulary and IDF weights from every cleaned review and build the
# sparse document-term matrix in one step.
# NOTE(review): this fits on the whole corpus rather than only on Xr_train —
# confirm that is intended before these features are used to evaluate a model.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(df_temp['cleaned_comments'])

In [45]:
# Number of n-gram features kept by the fitted vectorizer. The fitted
# vocabulary_ mapping has one entry per feature and is available on every
# scikit-learn version, whereas get_feature_names() no longer exists in newer
# releases.
len(tfidf_vectorizer.vocabulary_)

55468

In [46]:
# Split the document-term matrix into ten consecutive row blocks: nine blocks
# of 100,000 rows and a last block holding all remaining rows. Every block
# starts exactly where the previous one ends, so no review is skipped
# (starting the blocks at 100001, 200001, ... would silently drop rows
# 100000, 200000, ...).
chunk_rows = 100000
chunk_edges = [k * chunk_rows for k in range(10)] + [tfidf_vectorizer_vectors.shape[0]]
X_0, X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9 = (
    tfidf_vectorizer_vectors[lo:hi]
    for lo, hi in zip(chunk_edges[:-1], chunk_edges[1:])
)

In [51]:
# Wrap each row block in a DataFrame while keeping the data sparse. Calling
# .todense() on a 100,000-row block with tens of thousands of n-gram columns
# would allocate tens of GB per block; sparse-backed columns hold the same
# values in a fraction of that memory.
df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9 = (
    pd.DataFrame.sparse.from_spmatrix(block)
    for block in (X_0, X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9)
)

***

Trying to get the top terms for each review
Source: https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix

In [41]:
def top_n_idx_sparse(matrix, n):
    '''Return, for each row of a sparse matrix, the column indices of its n
    largest stored values.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Any format that provides ``tocsr()``; the rows are read from the CSR
        representation.
    n : int
        Maximum number of indices to return per row. Must be >= 0.

    Returns
    -------
    list of numpy.ndarray
        One array per row holding at most ``n`` column indices. A row with
        fewer than ``n`` stored values yields all of its stored indices, and
        an empty row (or ``n == 0``) yields an empty array. The indices
        within a row are NOT ordered by value.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    '''
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))

    # indptr/indices/data only describe rows in CSR layout; convert so that
    # other sparse formats are not silently misread.
    csr = matrix.tocsr()

    top_n_idx = []
    for le, ri in zip(csr.indptr[:-1], csr.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick == 0:
            # Nothing to pick. Handled explicitly because a `[-0:]` slice
            # would select every element instead of none.
            top_n_idx.append(csr.indices[le:le])
            continue
        # argpartition moves the n_row_pick largest values to the tail of the
        # row without fully sorting it
        largest = np.argpartition(csr.data[le:ri], -n_row_pick)[-n_row_pick:]
        top_n_idx.append(csr.indices[le + largest])
    return top_n_idx

In [42]:
# Preview the top-10 feature indices of the first review only. Passing a
# one-row slice avoids computing the top terms of every row in the block just
# to display the first.
top_n_idx_sparse(X_0[:1], 10)[0]

array([ 44456, 224405,  21904,  40234,  34087, 213353,  83353, 236094,
       192935,  22546], dtype=int32)