In [3]:
import pandas as pd
import datetime
import re
from re import sub
import warnings

from sklearn import preprocessing
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression

import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

warnings.filterwarnings("ignore")

import seaborn as sns
# import matplotlib as plt
import matplotlib.pyplot as plt

from yellowbrick.features import rank2d
from yellowbrick.classifier import confusion_matrix
from yellowbrick.classifier import classification_report
from yellowbrick.regressor import prediction_error, ResidualsPlot

# Raise the pandas display limits so wide/long frames are shown without truncation
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielacollaguazo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importing Data

In [None]:
# Listings table (one row per listing; provides `id` and `price` used below)
listings_csv = '../data/new-york-city-airbnb-open-data/listings.csv'
df_listings = pd.read_csv(listings_csv)

# Review texts that were previously annotated with polarity and detected language
reviews_csv = '../variable_exploration/dc/output/reviews_with_sentiment_and_lang.csv'
df_reviews_w_polarity = pd.read_csv(reviews_csv)

## Selecting reviews only in English

In [None]:
# Keep only the reviews whose detected language is English
is_english = df_reviews_w_polarity['review_lang'] == 'en'
df_rev_eng = df_reviews_w_polarity.loc[is_english]

# slicing DF
# df_rev_pol = df_rev_eng.loc[:,['id','listing_id','polarity']]

In [None]:
df_rev_eng.review_lang.value_counts(dropna=False)

In [None]:
df_rev_eng.shape

## Formatting Price

In [None]:
# Rename the listings key to `listing_id` so it matches the reviews table for the merge.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
df_listings = df_listings.rename(columns={"id": "listing_id"})

# Convert price strings such as "$1,200.00" to floats in one vectorized pass.
# The guard makes the cell idempotent: once the column is numeric there is
# nothing left to strip, and regex-cleaning floats would only risk corrupting them.
if not pd.api.types.is_numeric_dtype(df_listings['price']):
    price_text = (
        df_listings['price']
        .astype(str)                                # tolerate NaN / non-string cells
        .str.replace(r'[^\d.]', '', regex=True)     # keep digits and the decimal point only
    )
    # Values with nothing parseable left (missing or malformed prices) become NaN
    # instead of raising, so they can be dropped together with other nulls later.
    df_listings['price'] = pd.to_numeric(price_text, errors='coerce')

## Creating features target dataset

In [None]:
# Only the join key and the target (price) are needed from the listings table
df_listings = df_listings.loc[:, ['listing_id', 'price']]

In [None]:
# Only the join key and the review text are needed from the reviews table
df_rev_eng = df_rev_eng.loc[:, ['listing_id', 'comments']]

In [None]:
# Attach each listing's price to its English review comments (default inner join on
# `listing_id`), discard rows with any null value, and index the result by listing.
df_temp = (
    pd.merge(left=df_rev_eng, right=df_listings, on='listing_id')
    .dropna()
    .set_index('listing_id')
)

In [None]:
df_temp.head()

## Text pre-processing

### Cleaning

In [None]:
def text_cleaner(s):
    """Normalize one review comment for vectorization.

    Removes punctuation/symbols, underscores and digits, lowercases the text,
    collapses every run of whitespace (spaces, tabs, newlines) into a single
    space, and trims leading/trailing whitespace.

    Parameters
    ----------
    s : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment; may be an empty string if nothing but
        punctuation, digits or whitespace was present.

    Raises
    ------
    TypeError
        If ``s`` is not a string (e.g. NaN); drop or fill nulls beforehand.
    """
    # Drop everything that is neither a word character nor whitespace
    stripped = re.sub(r'[^\w\s]', '', s)
    # Underscore counts as a word character, so it has to be removed separately
    stripped = re.sub(r'_', '', stripped)

    # Remove digits
    stripped = re.sub(r'\d', '', stripped)

    # To lowercase
    stripped = stripped.lower()

    # Collapse whitespace last: removing punctuation and digits can leave
    # several adjacent whitespace characters behind (e.g. "room 3 where").
    stripped = re.sub(r'\s+', ' ', stripped)

    # Remove start and end white spaces
    return stripped.strip()

In [None]:
# Clean every comment. Mapping over the single column avoids building a Series
# object per row, which DataFrame.apply(axis=1) does, and yields the same values.
df_temp['cleaned_comments'] = df_temp['comments'].map(text_cleaner)

In [4]:
# df_temp.to_csv('df_temp.csv')

In [5]:
# Reload the cleaned reviews from the CSV cache (written by the to_csv call that is
# commented out in the previous cell), so the steps above need not be re-run.
df_temp = pd.read_csv('df_temp.csv')

# A comment that was cleaned down to an empty string is stored as an empty CSV field,
# which read_csv parses back as NaN. Restore '' so every document is a string again;
# the vectorizer cannot process NaN documents.
df_temp['cleaned_comments'] = df_temp['cleaned_comments'].fillna('')

## Splitting training and testing

In [6]:
# Hold out 20% of the reviews for testing. A fixed random_state makes the split
# identical on every Restart & Run All, so downstream results are reproducible.
Xr_train, Xr_test, yr_train, yr_test = tts(
    df_temp.cleaned_comments,
    df_temp.price,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Put the training text and its price side by side; with axis=1, ignore_index=True
# relabels the two columns as 0 (cleaned comment) and 1 (price).
training_data = pd.concat((Xr_train, yr_train), axis=1, ignore_index=True)

In [8]:
# Put the test text and its price side by side; with axis=1, ignore_index=True
# relabels the two columns as 0 (cleaned comment) and 1 (price).
test_data = pd.concat((Xr_test, yr_test), axis=1, ignore_index=True)

In [9]:
training_data.shape , test_data.shape

((807666, 2), (201917, 2))

In [10]:
test_data[0].head()

966440    amazing place for groups  the space was much m...
903474    wonderful location thoughtful host and the per...
237069    great balcony and fun place to stay in william...
257454    very clean space stayed in room  where there w...
705064    the place is great and spacious it easily acco...
Name: 0, dtype: object

## TfidfVectorizer

In [11]:
# TF-IDF settings:
#   ngram_range=(4, 4)   -> only four-word phrases are used as features
#   stop_words='english' -> scikit-learn's built-in English stop-word list is applied
#   lowercase=False      -> no lowercasing here; text_cleaner already lowercased the text
#   max_df=0.8           -> drop phrases that appear in more than 80% of the documents
#   min_df=20            -> drop phrases that appear in fewer than 20 documents
tfidf_options = dict(
    use_idf=True,
    ngram_range=(4, 4),
    stop_words='english',
    lowercase=False,
    max_df=0.8,
    min_df=20,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Learn the vocabulary/IDF weights and build the sparse document-term matrix.
# NOTE(review): this fits on every row of df_temp, not only on the training split
# created earlier (Xr_train) - confirm that is intended.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(df_temp['cleaned_comments'])

In [None]:
# Number of learned features. `vocabulary_` maps each term to its column index, so its
# length equals the feature count; unlike get_feature_names() (removed in recent
# scikit-learn releases) it is available in every version.
len(tfidf_vectorizer.vocabulary_)

In [None]:
# Split the TF-IDF matrix into ten consecutive row chunks of (up to) 100,000 rows.
# Slices exclude their stop index, so each chunk must start exactly at the previous
# chunk's stop value; starting one later would silently skip a row at every boundary.
# The last chunk is open-ended so it takes all remaining rows regardless of the
# dataset size.
X_0 = tfidf_vectorizer_vectors[0:100000]
X_1 = tfidf_vectorizer_vectors[100000:200000]
X_2 = tfidf_vectorizer_vectors[200000:300000]
X_3 = tfidf_vectorizer_vectors[300000:400000]
X_4 = tfidf_vectorizer_vectors[400000:500000]
X_5 = tfidf_vectorizer_vectors[500000:600000]
X_6 = tfidf_vectorizer_vectors[600000:700000]
X_7 = tfidf_vectorizer_vectors[700000:800000]
X_8 = tfidf_vectorizer_vectors[800000:900000]
X_9 = tfidf_vectorizer_vectors[900000:]

In [None]:
# Densify each sparse chunk and wrap it in its own DataFrame (df_0 ... df_9).
# NOTE(review): dense chunks can be very large in memory - confirm this fits.
sparse_chunks = (X_0, X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9)
df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9 = (
    pd.DataFrame(chunk.todense()) for chunk in sparse_chunks
)

In [None]:
# Stack the per-chunk frames back into one table.
frames = [df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9]
# Every chunk frame has its own 0-based index; ignore_index=True gives the stacked
# frame a single unique 0..N-1 index instead of labels repeated once per chunk.
result = pd.concat(frames, ignore_index=True)

In [None]:
result.head()