In [2]:
import pandas as pd
import numpy as np
import openpyxl
import re
from re import sub
from decimal import Decimal
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from numpy import unique, where
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import r2_score, silhouette_samples, silhouette_score , completeness_score , homogeneity_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from IPython.display import display, HTML, display_html
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


In [3]:
from Generating_Feature_Importance_tuning_PCA_model import PCA_My_Airbnb_Capstone_Model, PCA_pre_processing_pipeline
from Generating_Feature_Importance_tuning_raw_feature_model import My_Airbnb_Capstone_Model, pre_processing_pipeline
from scipy.stats import spearmanr
def spearmanr_score(truth, pred):
    """Return the Spearman rank-correlation coefficient between ``truth`` and ``pred``.

    Only the correlation statistic is returned; the second element of the
    ``spearmanr`` result is discarded.
    """
    # NOTE(review): this is defined right before the model is unpickled;
    # presumably the pickled object refers to it by name — confirm before
    # renaming or moving it.
    correlation, _unused = spearmanr(truth, pred)
    return correlation
# Load the tuned raw-feature model from disk.  The context manager closes the
# file handle as soon as unpickling finishes instead of leaving it open until
# garbage collection.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only unpickle artifacts produced by this project.
with open('./Feature_Importance_tuning_raw_feature_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)




In [4]:
# Show the best parameter combination stored under key 0.5 of the model's
# grid-search dictionary (keys are treated as quantiles elsewhere in this notebook).
model.grid_search_dict[0.5].best_params_

{'model__learning_rate': 0.2, 'model__max_depth': 11, 'model__num_leaves': 60}

In [5]:
# For every entry of the grid-search dictionary, collect the best estimator's
# (feature name, importance) pairs ordered from most to least important.
feature_importance_dict = {}
for quantile_level, quantile_search in model.grid_search_dict.items():
    fitted_estimator = quantile_search.best_estimator_['model']
    importance_by_feature = dict(
        zip(fitted_estimator.feature_name_, fitted_estimator.feature_importances_)
    )
    # Negating the importance sorts in descending order while keeping the
    # original relative order of features whose importances tie.
    feature_importance_dict[quantile_level] = sorted(
        importance_by_feature.items(),
        key=lambda name_and_score: -name_and_score[1],
    )
    

In [15]:
# Display the ranked (feature, importance) lists for every key of the dictionary.
# NOTE(review): this renders every feature for every key; consider slicing
# each list (e.g. the top 20) to keep the output readable.
feature_importance_dict

{0.05: [('Location_mean_area_accommodates_price', 87),
  ('Amenities_amenities_count', 73),
  ('Amenities_minimum_nights', 70),
  ('Amenities_property_type_code', 61),
  ('Amenities_bathrooms_count', 36),
  ('Location_mean_area_beds_price', 32),
  ('Amenities_maximum_nights', 30),
  ('Amenities_room_type_code', 28),
  ('NLP_관광을_원하시면_믿을수있는_여행사를', 27),
  ('Amenities_bedrooms', 23),
  ('Amenities_beds', 20),
  ('Location_transport_most_close_dis', 19),
  ('Location_real_estate', 19),
  ('Location_Los_Angeles_International_Airport', 18),
  ('NLP_s163', 16),
  ('NLP_s0', 15),
  ('Amenities_pool', 14),
  ('Location_Angeles_Stadium', 14),
  ('NLP_s23', 14),
  ('NLP_s196', 14),
  ('NLP_s232', 14),
  ('NLP_s250', 14),
  ('Amenities_bathrooms_type_code', 13),
  ('Location_transport_1000m_num', 13),
  ('Location_Santa_Monica_Pier', 13),
  ('Location_Dodgers_Stadium', 13),
  ('NLP_하실수', 13),
  ('NLP_s182', 13),
  ('NLP_s204', 13),
  ('NLP_s16', 12),
  ('NLP_s180', 12),
  ('NLP_s268', 12),
  ('NLP_

In [7]:
# Persist the ranked feature importances.  The context manager guarantees the
# file is flushed and closed as soon as the dump finishes.
with open('Feature_importance_best_models_tuned_raw_features.pkl', 'wb') as importance_file:
    pickle.dump(feature_importance_dict, importance_file)

In [9]:
# Load the stored cross-validation results.  The context manager closes the
# file handle deterministically instead of leaving it open.
# NOTE(review): `cv_results` is not read by any cell visible in this part of
# the notebook — confirm it is still needed.
with open('Feature_Importance_tuning_raw_feature_cv_results.pkl', 'rb') as cv_results_file:
    cv_results = pickle.load(cv_results_file)

In [10]:
# One cv_results_ frame per grid-search entry, each tagged with the key
# (quantile) it belongs to, then stacked into a single frame.
cv_list = [
    pd.DataFrame(quantile_search.cv_results_).assign(quantile=quantile_level)
    for quantile_level, quantile_search in model.grid_search_dict.items()
]
cv_df = pd.concat(cv_list)

In [11]:
# Display the combined cross-validation results for all quantiles.
cv_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,...,split2_test_neg_mean_squared_error,mean_test_neg_mean_squared_error,std_test_neg_mean_squared_error,rank_test_neg_mean_squared_error,quantile
0,96.965366,1.187508,35.444568,0.862262,0.05,...,-15068.820462,-15012.164474,112.474696,39,0.05
1,91.380604,2.124902,32.490757,2.791599,0.05,...,-14826.992277,-14941.204262,144.551669,36,0.05
2,85.670443,3.577855,33.182628,1.072194,0.05,...,-14826.992277,-14941.204262,144.551669,36,0.05
3,86.535251,1.280136,33.661026,0.607365,0.05,...,-14826.992277,-14941.204262,144.551669,36,0.05
4,96.820449,1.267528,33.377338,0.662668,0.05,...,-15349.240581,-15514.453015,153.907455,43,0.05
...,...,...,...,...,...,...,...,...,...,...,...
43,95.473408,2.185573,33.632637,0.568856,0.2,...,-14073.124719,-14309.824805,356.426658,2,0.95
44,92.551192,5.092576,32.180233,1.605049,0.2,...,-15157.140086,-14919.890465,277.370680,7,0.95
45,90.277786,2.576923,32.261571,2.913026,0.2,...,-15015.897146,-14751.274797,200.399995,5,0.95
46,92.202851,5.222686,33.511101,2.697780,0.2,...,-14829.933953,-14433.931411,310.110076,3,0.95


In [12]:
# Order rows by quantile (ascending) and, within each quantile, by mean test
# negative MSE (descending), then write them to CSV without the index.
ranked_cv_df = cv_df.sort_values(
    by=['quantile', 'mean_test_neg_mean_squared_error'],
    ascending=[True, False],
)
ranked_cv_df.to_csv('GridSearch_results_raw_data.csv', index=False)
    

In [13]:
# Sanity check: the 0.5-quantile estimator should expose as many feature names
# as feature importances.
median_estimator = model.grid_search_dict[0.5].best_estimator_['model']
len(median_estimator.feature_name_), len(median_estimator.feature_importances_)


(1810, 1810)

In [3]:
# Load the LA Airbnb detailed-listings CSV.  Note that `df` was previously used
# as a loop temporary for cross-validation frames; from here on it holds listings.
df = pd.read_csv('../../Data/LA_Airbnb/listings_detailed.csv')

In [4]:
# Substrings removed from `property_type`; ' in ' keeps its surrounding spaces
# so that only the standalone word is matched.
prohibitedWords = ['private', 'shared', ' in ', 'entire', 'room']
# NOTE(review): the pattern is case-sensitive, so a capitalised value such as
# 'Private room' is reduced to 'Private' rather than ''.  Confirm whether
# re.IGNORECASE was intended; downstream encoders may depend on the current output.
big_regex = re.compile('|'.join(re.escape(word) for word in prohibitedWords))
stripped_property_type = df['property_type'].apply(
    lambda raw_value: big_regex.sub("", raw_value).strip()
)
# Fall back to the room type when nothing is left after stripping.
nothing_left = stripped_property_type == ''
df['property_type_clean'] = np.where(nothing_left, df['room_type'], stripped_property_type)


In [52]:
from collections import Counter

# Share of listings per cleaned property type, most common first.
a = Counter()
a.update(df['property_type_clean'].values)
sorted_property_types = sorted(a.items(), key=lambda x: -x[1])
all_sum = np.nansum([count for _, count in sorted_property_types])
# Distinct loop names avoid shadowing the Counter `a` inside the comprehension.
sorted_property_types = [
    (type_name, float(count) / all_sum) for type_name, count in sorted_property_types
]
# The context manager flushes and closes the pickle file deterministically.
with open('./utiles/sorted_property_types.pkl', 'wb') as out_file:
    pickle.dump(sorted_property_types, out_file)


In [57]:
# Share of listings per room type, most common first.
a = Counter()
a.update(df['room_type'].values)
sorted_room_types = sorted(a.items(), key=lambda x: -x[1])
all_sum = np.nansum([count for _, count in sorted_room_types])
# Distinct loop names avoid shadowing the Counter `a` inside the comprehension.
sorted_room_types = [
    (type_name, float(count) / all_sum) for type_name, count in sorted_room_types
]
# The context manager flushes and closes the pickle file deterministically.
with open('./utiles/sorted_room_types.pkl', 'wb') as out_file:
    pickle.dump(sorted_room_types, out_file)

In [58]:
# Amenity names to keep, one per line of the text file.  The context manager
# closes the file once it has been read.
with open(r"./utiles/amenities_universe.txt", "r", encoding='unicode escape') as universe_file:
    a_u = universe_file.read().split('\n')

# Keep only the columns whose name contains 'Amenities' and drop the
# 'Amenities_' prefix so the names can be compared with the universe list.
amenity_features = pd.read_csv('./final_features/LA_extracted_all_features_imputed.csv')
amenity_features = amenity_features[[col for col in amenity_features.columns if 'Amenities' in col]]
amenity_features.columns = [col.split('Amenities_')[1] for col in amenity_features.columns]

# A set makes the per-column membership test constant-time.
known_amenities = set(a_u)
amenity_totals = amenity_features[
    [col for col in amenity_features.columns if col in known_amenities]
].sum(axis=0)

# Rank amenities by their column total (descending) and convert each total to
# its share of the overall sum.
bb = sorted(dict(amenity_totals).items(), key=lambda x: -x[1])
all_sum = np.nansum([total for _, total in bb])
bb = [(amenity_name, float(total) / all_sum) for amenity_name, total in bb]

# The context manager flushes and closes the pickle file deterministically.
with open('./utiles/sorted_amenities.pkl', 'wb') as out_file:
    pickle.dump(bb, out_file)
