In [464]:
import pickle
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier as rf
# from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, train_test_split
import joblib
import pandas as pd
import numpy as np


In [465]:
with open('../src/main/app/data.pickle', 'rb') as f:
    dataset = pickle.load(f)

In [466]:
pd.options.display.max_rows = 400
boroughs = ['manhattan', 'queens', 'brooklyn', 'bronx', 'staten']
dataset['price'].loc[dataset['price'].isnull()] = '$$'
dataset = dataset.rename({'alias': 'borough'}, axis=1)
dataset['borough'].loc[dataset['borough'].str.contains('manhattan')] = 'manhattan'
dataset['borough'].loc[dataset['borough'].str.contains('queens')] = 'queens'
dataset['borough'].loc[dataset['borough'].str.contains('brooklyn')] = 'brooklyn'
dataset['borough'].loc[dataset['borough'].str.contains('bronx')] = 'bronx'
dataset['borough'].loc[dataset['borough'].str.contains('staten')] = 'staten'
dataset['borough'].loc[~dataset['borough'].str.contains('|'.join(boroughs))] = 'unknown'

dataset['latitude'] = ''
dataset['longitude'] = ''

for i in range(dataset.shape[0]):
    if dataset['transactions'].iloc[i] != None:
        dataset['transactions'].iloc[i] = sorted(dataset['transactions'].iloc[i])
    if dataset['price'].iloc[i] == '$':
        dataset['price'].iloc[i] = '1'
    elif dataset['price'].iloc[i] == '$$':
        dataset['price'].iloc[i] = '2'
    elif dataset['price'].iloc[i] == '$$$':
        dataset['price'].iloc[i] = '3'
    elif dataset['price'].iloc[i] == '$$$$':
        dataset['price'].iloc[i] = '4'

    if dataset['coordinates'].iloc[i] != None:
        dataset['latitude'].iloc[i] = f"{dataset['coordinates'].iloc[i]['latitude']}"
        dataset['longitude'].iloc[i] = f"{dataset['coordinates'].iloc[i]['longitude']}"
    categories = []
    if dataset['categories'].iloc[i] != None:
        for j in dataset['categories'].iloc[i]:
            categories.append(j['alias'])
            dataset['categories'].iloc[i] = sorted(categories)

dataset['rating'] = np.where(dataset['rating'] > 4.0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['price'].loc[dataset['price'].isnull()] = '$$'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['borough'].loc[dataset['borough'].str.contains('manhattan')] = 'manhattan'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['borough'].loc[dataset['borough'].str.contains('queens')] = 'queens'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

In [467]:
df2 = pd.DataFrame(dataset['categories'].values.tolist(), index=dataset.index)
df2.columns = ['category1', 'category2', 'category3', 'category4']
df3 = [dataset, df2]
result = pd.concat(df3, axis=1)

In [468]:
# include = ['borough', 'review_count', 'category1', 'category2', 'category3', 'category4', 'price', 'rating', 'delivery', 'pickup', 'reservations']
include = ['latitude', 'longitude', 'borough', 'category1', 'category2', 'category3', 'category4', 'price', 'rating']
df = result[include]

In [482]:
with open ('./business/data1.pickle','wb')as f:
    pickle.dump(df, f)
f.close()

In [470]:
target = df['rating']
df = df.drop(columns=['rating'])

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(df)
categorical_columns = categorical_columns_selector(df)

['latitude',
 'longitude',
 'borough',
 'category1',
 'category2',
 'category3',
 'category4',
 'price']

In [471]:
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

In [472]:
x_train, x_test, y_train, y_test = train_test_split(df, target, random_state=42)

In [473]:
%%time
model.fit(x_train, y_train)

CPU times: total: 6.62 s
Wall time: 1.02 s


In [474]:
model.predict(x_test)
model.score(x_test, y_test)

0.7112060669053394

In [475]:
cv = cross_validate(model, df, target, cv=10)
cv


{'fit_time': array([0.91721988, 0.94599915, 1.02303004, 1.01100159, 0.91485238,
        0.89999866, 0.88899875, 0.93111396, 0.92400074, 0.89100027]),
 'score_time': array([0.11999774, 0.11999869, 0.11699963, 0.13399887, 0.11599088,
        0.1120019 , 0.11699915, 0.11499929, 0.11599946, 0.11200094]),
 'test_score': array([0.55721297, 0.50562987, 0.60023097, 0.63622366, 0.64507747,
        0.60629391, 0.64132422, 0.61649504, 0.63176131, 0.64456208])}

In [476]:
joblib.dump(model, 'model.pkl')

['model.pkl']