In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler


In [121]:
def add_features(X, start_year=None):
    """
    Return a copy of ``X`` with two engineered columns added.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; must contain a ``day`` and a ``year`` column.
    start_year : scalar, optional
        Reference year subtracted from ``year``. When ``None`` (the default)
        the minimum of ``X["year"]`` is used, so the baseline depends on the
        frame being transformed. Pass a fixed value (for example through
        ``FunctionTransformer(add_features, kw_args={"start_year": ...})``)
        to apply the same baseline to every frame.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with the extra columns ``week_of_month``
        (categorical, labels ``week1`` .. ``week4``) and
        ``years_since_start`` (``year - start_year``). ``X`` itself is not
        modified.
    """
    X = X.copy()

    # Week-of-month categorical. pd.cut uses right-closed intervals by
    # default: (0, 7] -> week1, (7, 14] -> week2, (14, 21] -> week3,
    # (21, inf] -> week4. Values outside the bins (e.g. day <= 0) become NaN.
    X["week_of_month"] = pd.cut(
        X["day"],
        bins=[0, 7, 14, 21, np.inf],
        labels=["week1", "week2", "week3", "week4"]
    )

    # Years since start numeric. Fall back to this frame's own minimum only
    # when the caller did not pin a baseline.
    if start_year is None:
        start_year = X["year"].min()
    X["years_since_start"] = X["year"] - start_year

    return X


In [122]:
# Load the processed dataset, discard every row that contains a missing
# value, and renumber the remaining rows with a fresh default index.
df = (
    pd.read_csv("data/processed_Data.csv")
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)



In [123]:
# Column roles shared by the split and the preprocessing pipeline.
strata = "year"                             # column the train/test split is stratified on
label_attrib = ["price"]                    # presumably the prediction target -- confirm
raw_attrib = ["day"]                        # raw input column(s)
num_features = ["years_since_start"]        # columns routed to the numeric pipeline
cat_attribs = ["month", "week_of_month"]    # columns routed to the categorical pipeline

In [124]:
def stratifiedshufflesplitter(df, bins, strata_col, test_size, random_state=24):
    """
    Split ``df`` into a train and a test frame, stratified on quantile bins
    of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. It is not modified.
    bins : int or sequence of quantiles
        Passed to ``pd.qcut`` as ``q``; duplicate bin edges are dropped, so
        fewer strata than requested may result.
    strata_col : str
        Name of the column whose quantile bins define the strata.
    test_size : float or int
        Forwarded to ``StratifiedShuffleSplit``.
    random_state : int, optional
        Seed for the shuffle; defaults to 24 (the value that used to be
        hard-coded).

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Row subsets of ``df`` that keep their original index labels and
        columns.
    """
    # Keep the bin codes in a standalone Series so no temporary column has
    # to be added to (and later dropped from) the frame.
    strata_codes = pd.qcut(df[strata_col], q=bins, labels=False, duplicates="drop")

    split = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state
    )

    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_idx, test_idx = next(split.split(df, strata_codes))

    # The splitter returns *positional* indices; .iloc keeps the selection
    # correct even when df's index is not a clean 0..n-1 range.
    train = df.iloc[train_idx].copy()
    test = df.iloc[test_idx].copy()

    return train, test


In [125]:
# 80/20 split, stratified on up to 10 quantile bins of the strata column.
train, test = stratifiedshufflesplitter(
    df,
    bins=10,
    strata_col=strata,
    test_size=0.2,
)

In [126]:
# Stateless step that appends the engineered columns produced by add_features.
feature_adder = FunctionTransformer(func=add_features)

# Numeric columns are rescaled into the [-1, 1] range.
num_pipeline = Pipeline(steps=[
    ("min-max", MinMaxScaler(feature_range=(-1, 1))),
])

# Categorical columns are one-hot encoded; categories not seen during fit
# are ignored at transform time instead of raising.
cat_pipeline = Pipeline(steps=[
    ("encoding", OneHotEncoder(handle_unknown="ignore")),
])

# Feature engineering first, then per-column-group preprocessing. Columns not
# listed in num_features / cat_attribs are dropped by the ColumnTransformer
# (its default remainder behaviour).
full_pipeline = Pipeline(steps=[
    ("feature_adder", feature_adder),
    (
        "preprocessor",
        ColumnTransformer(transformers=[
            ("num", num_pipeline, num_features),
            ("cat", cat_pipeline, cat_attribs),
        ]),
    ),
])

In [127]:
# Fit every pipeline step on the training split and transform it in one pass.
X = full_pipeline.fit_transform(train)
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5889 stored elements and shape (1963, 17)>

In [128]:

# Dig the fitted one-hot encoder out of the nested pipeline so the generated
# dummy-column names can be recovered.
cat_encoder = (
    full_pipeline.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .named_steps["encoding"]
)
cat_encoded_attribs = cat_encoder.get_feature_names_out(cat_attribs)

# Despite its name this holds ALL output column names: numeric features
# first, then the one-hot columns, matching the ColumnTransformer order.
num_encoded_attribs = [*num_features, *cat_encoded_attribs]

# num_encoded_attribs


In [129]:
# Densify the sparse pipeline output and label it with the recovered feature
# names, keeping the training rows' original index.
X_prepared = pd.DataFrame(
    data=X.toarray(),
    index=train.index,
    columns=num_encoded_attribs,
)
X_prepared
# print(train)
# print(cat_attribs)
# print(num_features)

Unnamed: 0,years_since_start,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,week_of_month_week1,week_of_month_week2,week_of_month_week3,week_of_month_week4
83,0.714286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1865,-0.714286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
878,0.142857,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
868,0.142857,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
167,0.714286,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,-0.714286,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
344,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2203,-1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
989,0.142857,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
