## Predicting Airbnb Listing Price | Analysis

In [1]:
import warnings
import numpy as np
import pandas as pd
import multiprocessing as mp
import xgboost as xgb

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [2]:
SEED = 1
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")
cores = mp.cpu_count()

In [3]:
directory = '/Users/limesncoconuts2/datasets/airbnb/'
df = pd.read_csv(directory + 'df_clean.csv', nrows=1000)

In [4]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 353 entries, accommodates to reviews
dtypes: bool(259), float64(21), int64(72), object(1)
memory usage: 17.5 MB


## Feature Engineering

In [5]:
df.reviews = df.reviews.str.replace("[","").str.replace("]","") \
                    .str.replace("\\","").str.replace("\"","") \
                    .str.replace("\'","").str.replace("rn","").str.strip()

In [6]:
def get_review_sentiment(review, analyzer):

    try:
        score = analyzer.polarity_scores(review)
        score = score['pos']
    
    except: # if nan
        score = 0
    
    return score

In [None]:
%%time
analyzer = SentimentIntensityAnalyzer()

pool = mp.Pool(cores)
results = [pool.apply(get_review_sentiment, args=(row, analyzer)) for row in df.reviews]
pool.close()    

df['pos_score'] = results

In [None]:
df_sorted = df[['reviews', 'pos_score']].dropna().sort_values(by=['pos_score'], ascending=False)

In [None]:
most_pos = list(df_sorted.head(5).reviews.values)
print("Most positive reviews:")
print("-----------------------")
print("")
for i in most_pos:
    print(i)
    print("")

In [None]:
least_pos = df_sorted.tail(5).reviews.values
print("Least positive reviews:")
print("-----------------------")
print("")
for i in least_pos:
    print(i)
    print("")

## Modeling


In [None]:
X = df.drop(columns=['price_USD', 'reviews'])
y = df.price_USD

In [None]:
'''
X_sm = X.sample(frac=0.1, replace=False, random_state=SEED)
y_sm = y.sample(frac=0.1, replace=False, random_state=SEED)
'''

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=SEED)

In [None]:
rf = RandomForestRegressor(n_estimators=800,
                          max_depth=5,
                          min_samples_leaf=0.1,
                          n_jobs=-1,
                          random_state=SEED)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
rf.score(X_test, y_test)

---

In [116]:
xg_reg = xgb.XGBRegressor(n_jobs=cores, random_state=SEED)

In [117]:
gbm_param_grid = {
    'xgb_model__subsample': ,
    'xgb_model__max_depth': ,
    'xgb_model__colsample_bytree': ,
    'xgb_model__gamma': 
}

SyntaxError: invalid syntax (<ipython-input-117-fb76b1cce224>, line 2)

In [None]:
randomized_neg_mse = RandomizedSearchCV(estimator=xgb_reg,
                                       params_distributions=gbm_param_grid,
                                       n_iter=10,
                                       scoring='neg_mean_squared_error',
                                       cv=4)

In [None]:
randomized_neg_mse.fit(X_train, y_train)

In [None]:
print("Best rmse: ", 
      np.sqrt(np.abs(randomized_neg_mse.best.score_)))

print("Best model: ",
     randomized_neg_mse.best_estimator_)

### AUC/ROC and Precision/Recall