In [1]:
from typing import Optional
import re
import pandas as pd

In [2]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 50

In [3]:
def read_and_rename(path: str) -> Optional[pd.DataFrame]:
    try:
        df = pd.read_csv(path)
    except(FileNotFoundError):
        print("File does not exists!")
        return None
    
    #basic columns renaming
    rename_mapper = {k:re.sub("[^A-Z|_]", "", k.lower().replace(" ", "_") ,0,re.IGNORECASE) for k in df.columns}
    return df.rename(columns=rename_mapper)
    
    

### create target

In [4]:
df = read_and_rename("../bc_data/BrightonPerformanceData.csv")

geo_columns = ['latitude', 'longitude', 'zipcode', 'city']
constants = ['scraped_during_month', 'country_code', 'currency_native']
not_useful = ['property_type', 'airbnb_host_id', 'last_seen']
cols_to_drop = geo_columns + constants + not_useful
df = df.drop(cols_to_drop, axis=1)


In [5]:

df.sort_values(by=['airbnb_property_id', 'reporting_month'], inplace=True)

# Shift the reporting_month column by one row for each airbnb_property_id
df['next_reporting_month'] = df.groupby('airbnb_property_id')['reporting_month'].shift(-1)

# Create a new column that is True if the next month's row exists for that airbnb_property_id
df['target'] = ~df['next_reporting_month'].isnull()

df  = df[df['reporting_month'] != '2023-10-01'].drop(["next_reporting_month"], axis=1)


In [10]:
df

Unnamed: 0,listing_type,bedrooms,bathrooms,airbnb_property_id,cleaning_fee,reporting_month,blocked_days,available_days,occupancy_rate,reservation_days,adr_usd,adr_native,number_of_reservation,revenue_usd,revenue_native,target,event_timestamp
23507,entire_home,2,2,74819,46.0,2022-11,1,30,52.0,13,229,181,8,3345,2643,True,2022-11-01
22179,entire_home,2,2,74819,46.0,2022-12,0,31,61.9,16,279,220,9,4878,3854,True,2022-12-01
20740,entire_home,2,2,74819,46.0,2023-01,0,31,100.0,31,255,201,5,8135,6427,True,2023-01-01
19052,entire_home,2,2,74819,46.0,2023-02,3,28,100.0,28,225,178,5,6530,5159,True,2023-02-01
17150,entire_home,2,2,74819,46.0,2023-03,4,27,100.0,24,230,182,8,5888,4652,True,2023-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,entire_home,2,2,985326724713973239,,2023-09,28,3,80.0,2,146,115,0,292,231,True,2023-09-01
2316,entire_home,2,2,985326724713973239,,2023-10,0,31,54.2,14,147,116,3,2058,1626,False,2023-10-01
4992,entire_home,1,1,985389662447322939,,2023-09,28,3,80.0,2,238,188,0,476,376,True,2023-09-01
2317,entire_home,1,1,985389662447322939,,2023-10,0,31,31.0,8,160,126,3,1280,1011,False,2023-10-01


In [6]:
# df['event_timestamp'] = pd.datetime(df.reporting_month)
df['event_timestamp'] = pd.to_datetime(df['reporting_month'])

In [7]:
data_df1 = df[['airbnb_property_id','event_timestamp'] + ['bedrooms', 'bathrooms']]
data_df2 = df[['airbnb_property_id','event_timestamp'] + ['blocked_days', 'available_days', 'occupancy_rate', 'reservation_days']]

target_df = df[['airbnb_property_id', 'target','event_timestamp']]

# Creating timestamps for the data
# timestamps = pd.date_range(
#     end=pd.Timestamp.now(), 
#     periods=len(df), 
#     freq='D').to_frame(name="event_timestamp", index=False)

# # Adding the timestamp column to each DataFrame
# target_df = pd.concat(objs=[target_df, timestamps], axis=1)
# data_df1 = pd.concat(objs=[data_df1, timestamps], axis=1)
# data_df2 = pd.concat(objs=[data_df2, timestamps], axis=1)

In [8]:
import os
DATA_DIR ="src/feast/airbnb/data"

In [41]:
data_df1.to_parquet(path=os.path.join(DATA_DIR, 'data_df1.parquet'))
data_df2.to_parquet(path=os.path.join(DATA_DIR, 'data_df2.parquet'))
target_df.to_parquet(path=os.path.join(DATA_DIR, 'target_df.parquet'))

In [31]:
import sys,os

filename = sys.argv[0]

In [32]:
filename

'/home/gianmaria/repos/airbnb-bc/bcenv/lib/python3.10/site-packages/ipykernel_launcher.py'

In [30]:
os.path.dirname(os.getcwd())

'/home/gianmaria/repos/airbnb-bc'

In [11]:
training_df = df.copy()
data_df1['bedrooms'] = data_df1['bedrooms'].replace("Studio", "1").astype("int64")


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

def create_model():
    class PreprocessDF():
        def __init__(self):
            self.encoders = {
                'sex': OrdinalEncoder(),
                'job': OrdinalEncoder(),
                'car_type': OrdinalEncoder()
            }
            # ensure the order and needed columns
            self.needed_columns = ['bedrooms', 'bathrooms',
       'cleaning_fee', 'blocked_days', 'available_days',
       'occupancy_rate', 'reservation_days', 'adr_usd', 'adr_native',
       'number_of_reservation', 'revenue_usd', 'revenue_native']
    
        def fit(self, df, y = None):
            for column in df.columns:
                if column in self.encoders:
                    self.encoders[column].fit(df[[column]])
            return self

        def transform(self, input_df, y = None):
            df = input_df.copy() # creating a copy to avoid changes to original dataset
            # for column in self.encoders:
            #     df[column] = self.encoders[column].transform(df[[column]])
            return df[self.needed_columns].astype('float32')
        
    # it guarantees that model and preprocessing needed are always togheter
    model = Pipeline(steps=[
            ('preprocess', PreprocessDF()),
            ('classifier', RandomForestClassifier())
        ])
    
    search_params = {'classifier__criterion':['gini'], 'classifier__max_depth':[50, 100], 'classifier__n_estimators': [10, 80]}
    # best model with f1, other metrics are only monitored
    clf = GridSearchCV(model, search_params, scoring=['f1', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'roc_auc'], refit='f1', cv=3)
    return clf

In [13]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.model_selection import GridSearchCV

search_params = {'classifier__criterion':['gini'], 'classifier__max_depth':[50, 100], 'classifier__n_estimators': [10, 80]}
clf = create_model()
import tempfile
import warnings


In [16]:

import os
experiment_name = 'drivers'
existing_exp = mlflow.get_experiment_by_name(experiment_name)
if not existing_exp:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_exp.experiment_id

timestamp = datetime.now().isoformat().split(".")[0].replace(":", ".")
with mlflow.start_run(experiment_id=experiment_id, run_name=timestamp) as run:
    clf.fit(training_df, training_df['target'])
    cv_results = clf.cv_results_
    best_index = clf.best_index_
    for score_name in [score for score in cv_results if "mean_test" in score]:
        mlflow.log_metric(score_name, cv_results[score_name][best_index])
        mlflow.log_metric(score_name.replace("mean","std"), cv_results[score_name.replace("mean","std")][best_index])

    tempdir = tempfile.TemporaryDirectory().name
    os.mkdir(tempdir)
    filename = "%s-%s-cv_results.csv" % ('RandomForest', timestamp)
    csv = os.path.join(tempdir, filename)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pd.DataFrame(cv_results).to_csv(csv, index=False)
    
    mlflow.log_artifact(csv, "cv_results")

In [19]:
import mlflow.sklearn
from mlflow.models import infer_signature

with mlflow.start_run() as run:
    
    clf.fit(training_df, training_df['target'])
    # Infer the model signature
    y_pred = clf.predict(training_df)
    signature = infer_signature(training_df, y_pred)

    # Log parameters and metrics using the MLflow APIs
    # mlflow.log_params(params)
    cv_results = clf.cv_results_
    best_index = clf.best_index_
    for score_name in [score for score in cv_results if "mean_test" in score]:
        mlflow.log_metric(score_name, cv_results[score_name][best_index])
        mlflow.log_metric(score_name.replace("mean","std"), cv_results[score_name.replace("mean","std")][best_index])

    # Log the sklearn model and register as version 1
    mlflow.sklearn.log_model(
        sk_model=clf.best_estimator_,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="sk-learn-random-forest-clf-model",
    )

Successfully registered model 'sk-learn-random-forest-clf-model'.
Created version '1' of model 'sk-learn-random-forest-clf-model'.


In [21]:
from mlflow import MlflowClient

client = MlflowClient()

# create "champion" alias for version 1 of model "example-model"
client.set_registered_model_alias("sk-learn-random-forest-clf-model", "champion", 1)

# reassign the "Champion" alias to version 2
# client.set_registered_model_alias("example-model", "Champion", 2)

# get a model version by alias
client.get_model_version_by_alias("sk-learn-random-forest-clf-model", "champion")

<ModelVersion: aliases=['champion'], creation_timestamp=1716395698332, current_stage='None', description=None, last_updated_timestamp=1716395698332, name='sk-learn-random-forest-clf-model', run_id='e613a1e7c2e44dc5b462a6f1b71c9429', run_link=None, source='file:///home/gianmaria/repos/airbnb-bc/notebooks/mlruns/0/e613a1e7c2e44dc5b462a6f1b71c9429/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [22]:
import mlflow.pyfunc

model_name = "sk-learn-random-forest-clf-model"
model_version = 1

model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

model.predict(training_df)

array([ True,  True,  True, ...,  True, False, False])