## Predicting Airbnb Listing Price | Analysis

In [53]:
import warnings
import numpy as np
import pandas as pd
import multiprocessing as mp
import xgboost as xgb

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
from tqdm import tqdm

In [2]:
SEED = 1
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")
cores = mp.cpu_count()

In [3]:
directory = '/Users/limesncoconuts2/datasets/airbnb/'
df = pd.read_csv(directory + 'df_clean.csv', nrows=1000)

In [4]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 353 entries, accommodates to reviews
dtypes: bool(259), float64(21), int64(72), object(1)
memory usage: 17.5 MB


## Feature Engineering

In [5]:
df.reviews = df.reviews.str.replace("[","").str.replace("]","") \
                    .str.replace("\\","").str.replace("\"","") \
                    .str.replace("\'","").str.replace("rn","").str.strip()

In [6]:
def get_review_sentiment(review, analyzer):

    try:
        score = analyzer.polarity_scores(review)
        score = score['pos']
    
    except: # if nan
        score = 0
    
    return score

In [7]:
%%time
analyzer = SentimentIntensityAnalyzer()

pool = mp.Pool(cores)
results = [pool.apply(get_review_sentiment, args=(row, analyzer)) for row in df.reviews]
pool.close()    

df['pos_score'] = results

CPU times: user 3.75 s, sys: 1.39 s, total: 5.13 s
Wall time: 15min 27s


In [8]:
df_sorted = df[['reviews', 'pos_score']].dropna().sort_values(by=['pos_score'], ascending=False)

In [9]:
most_pos = list(df_sorted.head(5).reviews.values)
print("Most positive reviews:")
print("-----------------------")
print("")
for i in most_pos:
    print(i)
    print("")

Most positive reviews:
-----------------------

Great

Perfect 👌🏼

Marcy was great very sweet!

Amazing spectacular experience , highly recommend

Awesome host. Great communication and accommodation skills. Made sure I was taking care of.



In [13]:
least_pos = df_sorted.tail(20).reviews.values
print("Least positive reviews:")
print("-----------------------")
print("")
for i in least_pos:
    print(i)
    print("")

Least positive reviews:
-----------------------



The host canceled this reservation the day before arrival. This is an automated posting.

房子在半山腰,风景很不错。房间有独立卫生间,床也比较舒服。有车很方便,吃饭8分钟就可以到。, Las fotografías se quedan cortas con lo bello que es el apartamento, realmente es hermoso.nLas atenciones y servicios son los indicados para tener una estancia cómoda y agradable.nA pesar de que es la renta de la habitación en un hogar compartido, tienes privacidad :)nnEncantador lugar, te lo recomiendo al 100% y aconsejable la renta de un coche porque está en colina., 房子很好很漂亮，大大的落地窗视野很好.....房东很好, 房子位于东部Monterey park，华人聚集区，在半山坡上，阳台的景观很不错。除了洛杉矶的交通堵塞以外，住的挺舒服的。, 房子在半山腰，风景很不错。房间有独立卫生间，床也比较舒服。有车很方便，吃饭8分钟就可以到。, 房东会说中文，方便沟通。及时回复信息。地点准确，方便停车。房东人很好，会解答我们一切的信息。并非常有帮助。可以使用冰箱和厨具。并尊重房房客。

非常安静美丽的街区，主人态度特别好，也很热情提供帮助。房子设备齐全干净，感觉很温馨。院子很漂亮，还有各种植物。孩子们很喜欢。

建物は古いけどとても便利な場所でお部屋は広く、トイレットペーパーや洗剤など揃っていて色々と行き届いた家でした。それにDanielleさんはとても素晴らしいホストです。質問した事は全てすぐに答えてくれて問題は何もありませんでした。チェックインやガレージも分かりやすく説明がありました。素晴らしい時間が過ごせて感謝しています。

图片

## Modeling


In [14]:
X = df.drop(columns=['price_USD', 'reviews'])
y = df.price_USD

In [None]:
'''
X_sm = X.sample(frac=0.1, replace=False, random_state=SEED)
y_sm = y.sample(frac=0.1, replace=False, random_state=SEED)
'''

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=SEED)

In [21]:
rf = RandomForestRegressor(n_estimators=800,
                          max_depth=5,
                          min_samples_leaf=0.1,
                          n_jobs=-1,
                          random_state=SEED)

In [22]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=0.1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [23]:
y_pred = rf.predict(X_test)

In [36]:
rmse_test = mean_squared_error(y_test, y_pred)**(1/2)
rmse_test

76.15139525970618

---

In [47]:
gbt = GradientBoostingRegressor(n_estimators=800,
                               max_depth=7,
                               random_state=SEED)

In [48]:
gbt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=800, n_iter_no_change=None, presort='auto',
             random_state=1, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
y_pred = gbt.predict(X_test)

In [50]:
rmse_test = mean_squared_error(y_test, y_pred)**(1/2)
rmse_test

39.73126167986596

---

In [116]:
xg_reg = xgb.XGBRegressor(n_jobs=cores, random_state=SEED)

In [None]:
gbm_param_grid = {
    'xgb_model__subsample': ,
    'xgb_model__max_depth': ,
    'xgb_model__colsample_bytree': ,
    'xgb_model__gamma': 
}

In [None]:
randomized_neg_mse = RandomizedSearchCV(estimator=xgb_reg,
                                       params_distributions=gbm_param_grid,
                                       n_iter=10,
                                       scoring='neg_mean_squared_error',
                                       cv=4)

In [None]:
randomized_neg_mse.fit(X_train, y_train)

In [None]:
print("Best rmse: ", 
      np.sqrt(np.abs(randomized_neg_mse.best.score_)))

print("Best model: ",
     randomized_neg_mse.best_estimator_)

### AUC/ROC and Precision/Recall