In [1]:
from typing import Optional
import re
import pandas as pd

In [2]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 50

In [3]:
def read_and_rename(path: str) -> Optional[pd.DataFrame]:
    try:
        df = pd.read_csv(path)
    except(FileNotFoundError):
        print("File does not exists!")
        return None
    
    #basic columns renaming
    rename_mapper = {k:re.sub("[^A-Z|_]", "", k.lower().replace(" ", "_") ,0,re.IGNORECASE) for k in df.columns}
    return df.rename(columns=rename_mapper)
    
    

### create target

In [6]:
df = read_and_rename("../bc_data/BrightonPerformanceData.csv")

geo_columns = ['latitude', 'longitude', 'zipcode', 'city']
constants = ['scraped_during_month', 'country_code', 'currency_native']
not_useful = ['property_type', 'airbnb_host_id', 'last_seen']
cols_to_drop = geo_columns + constants + not_useful
df = df.drop(cols_to_drop, axis=1)


In [7]:

df.sort_values(by=['airbnb_property_id', 'reporting_month'], inplace=True)

# Shift the reporting_month column by one row for each airbnb_property_id
df['next_reporting_month'] = df.groupby('airbnb_property_id')['reporting_month'].shift(-1)

# Create a new column that is True if the next month's row exists for that airbnb_property_id
df['target'] = ~df['next_reporting_month'].isnull()

df  = df[df['reporting_month'] != '2023-10-01'].drop(["next_reporting_month"], axis=1)


In [35]:
# df['event_timestamp'] = pd.datetime(df.reporting_month)
df['event_timestamp'] = pd.to_datetime(df['reporting_month'])

In [36]:
data_df1 = df[['airbnb_property_id','event_timestamp'] + ['bedrooms', 'bathrooms']]
data_df2 = df[['airbnb_property_id','event_timestamp'] + ['blocked_days', 'available_days', 'occupancy_rate', 'reservation_days']]

target_df = df[['airbnb_property_id', 'target','event_timestamp']]

# Creating timestamps for the data
# timestamps = pd.date_range(
#     end=pd.Timestamp.now(), 
#     periods=len(df), 
#     freq='D').to_frame(name="event_timestamp", index=False)

# # Adding the timestamp column to each DataFrame
# target_df = pd.concat(objs=[target_df, timestamps], axis=1)
# data_df1 = pd.concat(objs=[data_df1, timestamps], axis=1)
# data_df2 = pd.concat(objs=[data_df2, timestamps], axis=1)

In [37]:
import os
DATA_DIR ="src/feast/airbnb/data"

In [41]:
data_df1.to_parquet(path=os.path.join(DATA_DIR, 'data_df1.parquet'))
data_df2.to_parquet(path=os.path.join(DATA_DIR, 'data_df2.parquet'))
target_df.to_parquet(path=os.path.join(DATA_DIR, 'target_df.parquet'))

In [13]:
training_df = df[keep_cols]
training_df['bedrooms'] = training_df['bedrooms'].replace("Studio", "1").astype("int64")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['bedrooms'] = training_df['bedrooms'].replace("Studio", "1").astype("int64")


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

def create_model():
    class PreprocessDF():
        def __init__(self):
            self.encoders = {
                'sex': OrdinalEncoder(),
                'job': OrdinalEncoder(),
                'car_type': OrdinalEncoder()
            }
            # ensure the order and needed columns
            self.needed_columns = ['bedrooms', 'bathrooms',
       'cleaning_fee', 'blocked_days', 'available_days',
       'occupancy_rate', 'reservation_days', 'adr_usd', 'adr_native',
       'number_of_reservation', 'revenue_usd', 'revenue_native']
    
        def fit(self, df, y = None):
            for column in df.columns:
                if column in self.encoders:
                    self.encoders[column].fit(df[[column]])
            return self

        def transform(self, input_df, y = None):
            df = input_df.copy() # creating a copy to avoid changes to original dataset
            # for column in self.encoders:
            #     df[column] = self.encoders[column].transform(df[[column]])
            return df[self.needed_columns].astype('float32')
        
    # it guarantees that model and preprocessing needed are always togheter
    model = Pipeline(steps=[
            ('preprocess', PreprocessDF()),
            ('classifier', RandomForestClassifier())
        ])
    
    search_params = {'classifier__criterion':['gini'], 'classifier__max_depth':[50, 100], 'classifier__n_estimators': [10, 80]}
    # best model with f1, other metrics are only monitored
    clf = GridSearchCV(model, search_params, scoring=['f1', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'roc_auc'], refit='f1', cv=3)
    return clf

In [22]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.model_selection import GridSearchCV

search_params = {'classifier__criterion':['gini'], 'classifier__max_depth':[50, 100], 'classifier__n_estimators': [10, 80]}
model = RandomForestClassifier()
clf = create_model()
import tempfile
import warnings


In [25]:

import os
experiment_name = 'drivers'
existing_exp = mlflow.get_experiment_by_name(experiment_name)
if not existing_exp:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_exp.experiment_id

timestamp = datetime.now().isoformat().split(".")[0].replace(":", ".")
with mlflow.start_run(experiment_id=experiment_id, run_name=timestamp) as run:
    clf.fit(training_df, training_df['target'])
    cv_results = clf.cv_results_
    best_index = clf.best_index_
    for score_name in [score for score in cv_results if "mean_test" in score]:
        mlflow.log_metric(score_name, cv_results[score_name][best_index])
        mlflow.log_metric(score_name.replace("mean","std"), cv_results[score_name.replace("mean","std")][best_index])

    tempdir = tempfile.TemporaryDirectory().name
    os.mkdir(tempdir)
    filename = "%s-%s-cv_results.csv" % ('RandomForest', timestamp)
    csv = os.path.join(tempdir, filename)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pd.DataFrame(cv_results).to_csv(csv, index=False)
    
    mlflow.log_artifact(csv, "cv_results")