In [1]:
from typing import Optional
import re
import pandas as pd

In [2]:
# Notebook display limits: show up to 30 columns and 50 rows when rendering frames.
pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 50)

In [3]:
def read_and_rename(path: str) -> Optional[pd.DataFrame]:
    """Load a CSV file and normalise its column names to snake_case.

    Each column name is lower-cased, spaces are turned into underscores and
    every character other than ``a``-``z`` and ``_`` is removed (digits and
    punctuation included), e.g. ``"ADR (USD)"`` becomes ``"adr_usd"``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame with renamed columns, or ``None`` (after printing
        a message) when ``path`` does not exist.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print("File does not exists!")
        return None

    # Basic column renaming.
    # The names are lower-cased before filtering, so a plain a-z class is enough.
    # The previous class "[^A-Z|_]" also kept a literal "|" (inside [...] a pipe
    # is not alternation), and passed count/flags to re.sub positionally, which
    # is deprecated since Python 3.13.
    disallowed_chars = re.compile(r"[^a-z_]")
    rename_mapper = {
        column: disallowed_chars.sub("", column.lower().replace(" ", "_"))
        for column in df.columns
    }
    return df.rename(columns=rename_mapper)
    
    

### Load the data and create the target

In [4]:
# Load the raw CSV with normalised column names.
df = read_and_rename("BrightonPerformanceData.csv")
if df is None:
    # read_and_rename returns None when the file is missing; stop here with a
    # clear error instead of an AttributeError on `None.drop` further down.
    raise FileNotFoundError("BrightonPerformanceData.csv")

# Columns removed before building the target, grouped by the reason
# suggested by each list's name.
geo_columns = ['latitude', 'longitude', 'zipcode', 'city']
constants = ['scraped_during_month', 'country_code', 'currency_native']
not_useful = ['property_type', 'airbnb_host_id', 'last_seen']
cols_to_drop = geo_columns + constants + not_useful
df = df.drop(columns=cols_to_drop)


In [5]:

# Order the rows by property and then by month, so that "the following row"
# within a property is its next recorded reporting month.
df = df.sort_values(by=['airbnb_property_id', 'reporting_month'])

# For every row, fetch the reporting month of the same property's following row
# (missing when the row is the property's last one).
months_per_property = df.groupby('airbnb_property_id')['reporting_month']
df['next_reporting_month'] = months_per_property.shift(-1)

# target is True when the property has a later row, False otherwise.
# NOTE(review): this checks for the next *recorded* row, not the next calendar
# month - a property that skips a month still gets True. Confirm this is intended.
df['target'] = df['next_reporting_month'].notnull()

# Remove the 2023-10-01 rows and the helper columns.
# NOTE(review): presumably 2023-10-01 is the last month in the data, where no
# following row can exist - TODO confirm against the source file.
keep_row = df['reporting_month'] != '2023-10-01'
df = df.loc[keep_row].drop(columns=["reporting_month", "next_reporting_month"])


In [6]:
# Class balance of the target column.
df["target"].value_counts()

target
True     20921
False     3928
Name: count, dtype: int64

In [7]:
# (rows, columns) of df at this point in the notebook.
df.shape

(24849, 15)

In [8]:
# Split the feature columns into three groups by position.
# Assumes df has 15 columns with `target` last (see df.shape), so columns
# 0-13 are features and `target` is kept separately below.
data_df1 = df[df.columns[:5]]
data_df2 = df[df.columns[5:10]]
data_df3 = df[df.columns[10:14]]

target_df = pd.DataFrame(data=df.target, columns=["target"])

# Creating timestamps for the data: one synthetic daily timestamp per row,
# ending at the moment the cell is run.
timestamps = pd.date_range(
    end=pd.Timestamp.now(),
    periods=len(df),
    freq='D').to_frame(name="event_timestamp", index=False)

# Adding the timestamp column to each DataFrame.
# After the earlier sort and filter, df's index is a permuted, gapped subset of
# the original row labels, while `timestamps` has a fresh 0..n-1 index.
# pd.concat(axis=1) aligns on index labels (outer join), which would pair
# timestamps with the wrong rows and add NaN/NaT rows. Assign the values
# positionally instead; each frame keeps df's index, matching target_df.
event_timestamp_values = timestamps["event_timestamp"].to_numpy()
data_df1 = data_df1.assign(event_timestamp=event_timestamp_values)
data_df2 = data_df2.assign(event_timestamp=event_timestamp_values)
data_df3 = data_df3.assign(event_timestamp=event_timestamp_values)

In [9]:
import os
# Directory (relative to the working directory) where the parquet files are written.
DATA_DIR = "src/feast/airbnb/data"

In [10]:
import pyarrow

In [16]:
# Show the columns of the second feature group, including the added event_timestamp.
data_df2.columns


'blocked_days'
'available_days'
'occupancy_rate'
'reservation_days'

'adr_usd'
'event_timestamp'

Index(['blocked_days', 'available_days', 'occupancy_rate', 'reservation_days',
       'adr_usd', 'event_timestamp'],
      dtype='object')

In [12]:
# Create the output directory if needed; to_parquet does not create missing
# parent directories.
os.makedirs(DATA_DIR, exist_ok=True)

# Write every frame as <name>.parquet, using the same engine for all of them
# (previously only the first call named the engine explicitly).
frames_to_export = {
    'data_df1': data_df1,
    'data_df2': data_df2,
    'data_df3': data_df3,
    'target_df': target_df,
}
for export_name, export_frame in frames_to_export.items():
    export_frame.to_parquet(
        path=os.path.join(DATA_DIR, f'{export_name}.parquet'),
        engine='pyarrow',
    )

In [15]:
from feast import ValueType
# Exploratory display of the Feast enum member for string-typed values.
ValueType.STRING

<ValueType.STRING: 2>