In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeRegressor
import os

from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
import joblib

In [10]:
def add_features(X, start_year=None):
    """Add the engineered calendar features consumed by the preprocessing step.

    The input frame is copied, so the caller's data is never modified.

    Two columns are added:

    * ``week_of_month`` -- categorical bucket of ``day``. For integer days,
      1-7 -> "week1", 8-14 -> "week2", 15-21 -> "week3", 22 and above ->
      "week4". Values that fall outside the bins (``day <= 0``) become NaN.
    * ``years_since_start`` -- ``year`` minus a reference year.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns ``day`` and ``year``.
    start_year : number, optional
        Reference year subtracted from ``year``. When omitted, the minimum
        ``year`` found in ``X`` is used (the original behaviour). Pass the
        training set's first year explicitly when transforming new data,
        otherwise each batch is measured from its own earliest year and the
        feature is not comparable with the values seen during training.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the two extra columns appended.
    """
    X = X.copy()

    # Week-of-month categorical
    X["week_of_month"] = pd.cut(
        X["day"],
        bins=[0, 7, 14, 21, np.inf],
        labels=["week1", "week2", "week3", "week4"],
    )

    # Years since start numeric; fall back to the batch minimum when the
    # caller does not pin the origin.
    origin = X["year"].min() if start_year is None else start_year
    X["years_since_start"] = X["year"] - origin

    return X
def stratifiedshufflesplitter(df, bins, strata_col, test_size):
    """
    Split ``df`` into a train and a test frame, stratified on ``strata_col``.

    ``strata_col`` is discretised into (at most) ``bins`` quantile buckets with
    ``pd.qcut``; duplicate bin edges are dropped, so fewer buckets may result.
    A single ``StratifiedShuffleSplit`` (fixed ``random_state=42``) is then
    drawn on those buckets.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. It is copied; the caller's frame is not modified.
    bins : int or sequence
        Passed to ``pd.qcut`` as ``q``.
    strata_col : str
        Name of the numeric column used to build the strata.
    test_size : float or int
        Passed to ``StratifiedShuffleSplit`` as ``test_size``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` with the original columns and original index labels.
    """
    df = df.copy()
    strata_cat = "__strata__"
    df[strata_cat] = pd.qcut(df[strata_col], q=bins, labels=False, duplicates="drop")

    split = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=42
    )

    # split() yields *positional* row numbers, so rows must be picked with
    # .iloc; .loc would only be correct for a default 0..n-1 index.
    train_idx, test_idx = next(split.split(df, df[strata_cat]))
    train = df.iloc[train_idx].drop(columns=strata_cat)
    test = df.iloc[test_idx].drop(columns=strata_cat)

    return train, test
def build_pipeline(cat_attribs, num_attribs):
    """Assemble the (unfitted) feature-engineering and encoding pipeline.

    Stage 1 ("feature_adder") runs ``add_features`` on the incoming frame.
    Stage 2 ("preprocessor") one-hot encodes ``cat_attribs`` (categories not
    seen during fit are ignored) and passes ``num_attribs`` through unchanged.
    Columns not listed in either argument are not forwarded by stage 2.

    Parameters
    ----------
    cat_attribs : list of str
        Categorical columns to one-hot encode.
    num_attribs : list of str
        Numeric columns to pass through as-is.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    one_hot = Pipeline(steps=[
        ("encoding", OneHotEncoder(handle_unknown="ignore")),
    ])

    column_stage = ColumnTransformer(transformers=[
        ("cat", one_hot, cat_attribs),
        ("num", "passthrough", num_attribs),
    ])

    return Pipeline(steps=[
        ("feature_adder", FunctionTransformer(add_features)),
        ("preprocessor", column_stage),
    ])


def fit_in_pipeline(df,full_pipeline,num_features,cat_attribs):
    """Fit ``full_pipeline`` on ``df`` and return the result as a labelled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training rows handed to ``full_pipeline.fit_transform``.
    full_pipeline : sklearn.pipeline.Pipeline
        Pipeline with a "preprocessor" step whose "cat" transformer has an
        "encoding" step exposing ``get_feature_names_out`` (the layout this
        function reads to recover the one-hot column names).
    num_features : list of str
        Names of the pass-through numeric columns; they follow the encoded
        categorical columns in the output.
    cat_attribs : list of str
        Categorical input columns given to ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Dense frame indexed like ``df`` with one column per encoded category
        followed by the numeric features.

    Note: this fits ``full_pipeline`` in place as a side effect.
    """
    X = full_pipeline.fit_transform(df)

    cat_encoder = (
        full_pipeline.named_steps["preprocessor"]
        .named_transformers_["cat"]
        .named_steps["encoding"]
    )
    output_columns = list(cat_encoder.get_feature_names_out(cat_attribs)) + list(num_features)

    # The preprocessor may return either a scipy sparse matrix or a dense
    # ndarray depending on the overall density; only the former has toarray().
    dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    X_prepared = pd.DataFrame(dense, columns=output_columns, index=df.index)
    return X_prepared

In [11]:
# Load the processed dataset, drop every row that has a missing value and
# renumber the index from 0 so positional and label indexing agree.
df = (
    pd.read_csv("data/processed_Data.csv")
    .dropna(how="any", axis=0)
    .reset_index(drop=True)
)

# Column roles used by the rest of the notebook.
label_attrib = ["price"]                      # target column (one-element list)
raw_attrib = ["day"]                          # not referenced by the cells shown here
strata = "year"                               # column the train/test split is stratified on
cat_attribs = ["month", "week_of_month"]      # one-hot encoded; week_of_month comes from add_features
num_features = ["years_since_start"]          # passed through; created by add_features

In [12]:
 
# Locations of the persisted artifacts and of the inference input/output.
MODEL_FILE = "model_files/model.pkl"
PIPELINE_FILE = "model_files/pipeline.pkl"
PREDICTION_FOLDER = "predictions/"
INPUT_FILE = os.path.join(PREDICTION_FOLDER, "input.csv")
OUTPUT_FILE = os.path.join(PREDICTION_FOLDER, "output.csv")

# Make sure both target directories exist before anything is written.
os.makedirs(os.path.dirname(MODEL_FILE), exist_ok=True)
os.makedirs(os.path.dirname(PREDICTION_FOLDER), exist_ok=True)

# Unfitted preprocessing pipeline; it is fitted later on the training split.
full_pipeline = build_pipeline(cat_attribs, num_features)


In [13]:

# Hold out 20% of the rows, stratified on 10 quantile bins of `strata`.
# The splitter copies its input, so `df` itself is left untouched.
strat_train_set, strat_test_set = stratifiedshufflesplitter(df, 10, strata, 0.2)

# Persist the held-out rows (label column included) as the inference input.
# The label column keeps its own name: the inference cell drops and reorders
# `label_attrib[0]`, so renaming it here would break that cell for any label
# other than "price".
strat_test_set.to_csv(INPUT_FILE, index=False)



In [14]:
# Targets for the training split (kept as a one-column DataFrame).
label_data = strat_train_set[label_attrib]

# Fit the preprocessing pipeline on the training split only and keep the
# encoded features as a labelled, dense DataFrame.
processed_data = fit_in_pipeline(
    strat_train_set,
    full_pipeline,
    num_features,
    cat_attribs,
).copy()

processed_data.head()


Unnamed: 0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,week_of_month_week1,week_of_month_week2,week_of_month_week3,week_of_month_week4,years_since_start
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
255,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
318,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
# Train only when a complete artifact pair is not already on disk. Both files
# are checked because the inference cell loads both; a model without its
# pipeline would otherwise skip training and then fail on load.
# NOTE: existing artifacts are reused as-is even if the data or features have
# changed since they were written -- delete them to force a retrain.
if not (os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)):
    print("Training LinearRegression model...")

    model = LinearRegression()
    model.fit(processed_data, label_data)

    # Persist the model together with the pipeline that was fitted on the
    # same training split so the two always match.
    joblib.dump(model, MODEL_FILE)
    joblib.dump(full_pipeline, PIPELINE_FILE)

    print("Model trained and saved.\n")



In [16]:

# =========================
# Inference
# =========================
# Reload the persisted artifacts so this cell exercises exactly what is on disk.
model = joblib.load(MODEL_FILE)
pipeline = joblib.load(PIPELINE_FILE)
input_data = pd.read_csv(INPUT_FILE)

label_col = label_attrib[0]
predicted_col = "predicted_" + label_col

# The label is removed before the features are transformed.
# NOTE(review): add_features measures years_since_start from the minimum year
# of the frame it receives, i.e. from this batch's earliest year -- confirm
# that matches the origin used when the pipeline was fitted.
X_input = pipeline.transform(input_data.drop(label_col, axis=1))
predictions = model.predict(X_input)

# Attach predictions
input_data[predicted_col] = predictions

# Reorder columns for clarity: features first, then actual and predicted label
trailing = [label_col, predicted_col]
cols = [c for c in input_data.columns if c not in trailing]
input_data = input_data[cols + trailing]

input_data.to_csv(OUTPUT_FILE, index=False)

print("Inference complete. Results saved to output.csv")


# =========================
# Total Runtime
# =========================



Inference complete. Results saved to output.csv


