In [24]:
from typing import Optional
import re
import pandas as pd

In [25]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 50

In [32]:
def read_and_rename(path: str) -> Optional[pd.DataFrame]:
    try:
        df = pd.read_csv(path)
    except(FileNotFoundError):
        print("File does not exists!")
        return None
    
    #basic columns renaming
    rename_mapper = {k:re.sub("[^A-Z|_]", "", k.lower().replace(" ", "_") ,0,re.IGNORECASE) for k in df.columns}
    return df.rename(columns=rename_mapper)
    
    

### create target

In [33]:
df = read_and_rename("BrightonPerformanceData.csv")

geo_columns = ['latitude', 'longitude', 'zipcode', 'city']
constants = ['scraped_during_month', 'country_code', 'currency_native']
not_useful = ['property_type', 'airbnb_host_id', 'last_seen']
cols_to_drop = geo_columns + constants + not_useful
df = df.drop(cols_to_drop, axis=1)


In [34]:

df.sort_values(by=['airbnb_property_id', 'reporting_month'], inplace=True)

# Shift the reporting_month column by one row for each airbnb_property_id
df['next_reporting_month'] = df.groupby('airbnb_property_id')['reporting_month'].shift(-1)

# Create a new column that is True if the next month's row exists for that airbnb_property_id
df['target'] = ~df['next_reporting_month'].isnull()

df  = df[df['reporting_month'] != '2023-10-01'].drop(["next_reporting_month"], axis=1)


In [35]:
# df['event_timestamp'] = pd.datetime(df.reporting_month)
df['event_timestamp'] = pd.to_datetime(df['reporting_month'])

In [36]:
data_df1 = df[['airbnb_property_id','event_timestamp'] + ['bedrooms', 'bathrooms']]
data_df2 = df[['airbnb_property_id','event_timestamp'] + ['blocked_days', 'available_days', 'occupancy_rate', 'reservation_days']]

target_df = df[['airbnb_property_id', 'target','event_timestamp']]

# Creating timestamps for the data
# timestamps = pd.date_range(
#     end=pd.Timestamp.now(), 
#     periods=len(df), 
#     freq='D').to_frame(name="event_timestamp", index=False)

# # Adding the timestamp column to each DataFrame
# target_df = pd.concat(objs=[target_df, timestamps], axis=1)
# data_df1 = pd.concat(objs=[data_df1, timestamps], axis=1)
# data_df2 = pd.concat(objs=[data_df2, timestamps], axis=1)

In [37]:
import os
DATA_DIR ="src/feast/airbnb/data"

In [38]:
import pyarrow

In [39]:
data_df1

Unnamed: 0,airbnb_property_id,event_timestamp,bedrooms,bathrooms
23507,74819,2022-11-01,2,2
22179,74819,2022-12-01,2,2
20740,74819,2023-01-01,2,2
19052,74819,2023-02-01,2,2
17150,74819,2023-03-01,2,2
...,...,...,...,...
4991,985326724713973239,2023-09-01,2,2
2316,985326724713973239,2023-10-01,2,2
4992,985389662447322939,2023-09-01,1,1
2317,985389662447322939,2023-10-01,1,1


In [40]:
target_df

Unnamed: 0,airbnb_property_id,target,event_timestamp
23507,74819,True,2022-11-01
22179,74819,True,2022-12-01
20740,74819,True,2023-01-01
19052,74819,True,2023-02-01
17150,74819,True,2023-03-01
...,...,...,...
4991,985326724713973239,True,2023-09-01
2316,985326724713973239,False,2023-10-01
4992,985389662447322939,True,2023-09-01
2317,985389662447322939,False,2023-10-01


In [41]:
data_df1.to_parquet(path=os.path.join(DATA_DIR, 'data_df1.parquet'), engine='pyarrow')
data_df2.to_parquet(path=os.path.join(DATA_DIR, 'data_df2.parquet'))
target_df.to_parquet(path=os.path.join(DATA_DIR, 'target_df.parquet'))