In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Hyperparameters of the best model. The keys appear to follow the
# `<pipeline step>__<parameter>` naming convention; in the cells shown here only
# `feature_selector__n_features` is read back.
best_parameters = dict(
    feature_selector__n_features=10,
    regressor__max_depth=20,
    regressor__min_samples_leaf=10,
    regressor__min_samples_split=20,
    scaler=None,
)

In [3]:
# Read the competition CSVs: `df` holds the test split, `train_df` the training split.
df, train_df = (
    pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv'),
    pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv'),
)

In [None]:
!pip install dagshub mlflow
import dagshub
import mlflow
dagshub.init(repo_owner='dimna21', repo_name='ML_Assignment1', mlflow=True)

# Pipeline methods

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaner(BaseEstimator, TransformerMixin):
    """Fill missing values column by column.

    ``fit`` splits the columns of the training frame into numeric and
    non-numeric ones. ``transform`` then fills missing values with ``0`` in the
    numeric columns and with the placeholder string ``'No<column name>'`` in the
    non-numeric ones.

    Attributes:
        cat_cols: Names of the non-numeric columns seen by ``fit``.
        num_cols: Names of the numeric columns seen by ``fit``.
    """

    def __init__(self):
        self.cat_cols = []
        self.num_cols = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` are numeric and which are not.

        Args:
            X: DataFrame whose column dtypes decide the split.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            The fitted cleaner (``self``).
        """
        cat_cols, num_cols = [], []
        # Classify by "is numeric" instead of `dtype == 'object'`: text columns are not
        # always stored as `object` (pandas also has dedicated string dtypes), and such
        # columns must not land in the numeric group, where they would be filled with 0.
        for col, dtype in X.dtypes.items():
            bucket = num_cols if pd.api.types.is_numeric_dtype(dtype) else cat_cols
            bucket.append(col)

        self.cat_cols = cat_cols
        self.num_cols = num_cols
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled.

        Args:
            X: DataFrame containing every column recorded by ``fit``.

        Returns:
            A new DataFrame; ``X`` itself is left untouched.

        Raises:
            KeyError: If a column recorded by ``fit`` is missing from ``X``.
        """
        X = X.copy()
        for col in self.cat_cols:
            # A per-column placeholder keeps "missing" distinguishable between columns.
            X[col] = X[col].fillna(f'No{col}')

        for col in self.num_cols:
            X[col] = X[col].fillna(0)

        return X

In [6]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns using a layout learned at fit time.

    ``pd.get_dummies`` derives its output columns from the values present in the
    frame it is given, so encoding train and test data independently can produce
    different columns (and a different dropped baseline category). To avoid
    that, ``fit`` records the categories of every encoded column and
    ``transform`` always emits exactly the dummy columns implied by them:

    * a category missing from the transformed frame yields an all-zero column;
    * a value that was not seen during ``fit`` is encoded as all zeros.

    Every listed column is encoded, whatever its dtype. The first category of
    each column is dropped (``drop_first=True``) and dummies are returned as
    integers (0/1).

    Args:
        one_hot_columns: Names of the columns to encode. ``None`` means "every
            non-numeric column of the frame".

    Attributes:
        columns_: Columns resolved by ``fit``.
        categories_: Mapping of column name to the categories learned by ``fit``.
    """

    def __init__(self, one_hot_columns=None):
        self.one_hot_columns = one_hot_columns

    def _columns_to_encode(self, X):
        """Return the configured columns, or all non-numeric columns of ``X`` if none were given."""
        if self.one_hot_columns is not None:
            return list(self.one_hot_columns)
        return [col for col, dtype in X.dtypes.items()
                if not pd.api.types.is_numeric_dtype(dtype)]

    @staticmethod
    def _observed_categories(X, columns):
        """Map each column to the distinct non-null values found in ``X``."""
        return {col: pd.Categorical(X[col]).categories for col in columns}

    def fit(self, X, y=None):
        """Learn the categories of every column that will be encoded.

        Args:
            X: Training DataFrame.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            The fitted encoder (``self``).
        """
        self.columns_ = self._columns_to_encode(X)
        self.categories_ = self._observed_categories(X, self.columns_)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by dummies.

        Args:
            X: DataFrame containing the columns to encode.

        Returns:
            A new DataFrame: the untouched columns first, then the dummy columns.
        """
        X = X.copy()

        if hasattr(self, 'categories_'):
            columns, categories = self.columns_, self.categories_
        else:
            # Not fitted (for example an instance that was pickled before categories
            # were tracked): derive the layout from X itself, which is what a plain
            # `pd.get_dummies` call on X would do.
            columns = self._columns_to_encode(X)
            categories = self._observed_categories(X, columns)

        if not columns:
            return X

        # Casting to a categorical dtype with fixed categories makes get_dummies emit
        # one column per *learned* category, independent of the values present in X.
        typed = pd.DataFrame(
            {col: pd.Categorical(X[col], categories=categories[col]) for col in columns},
            index=X.index,
        )
        # get_dummies may return booleans; models downstream get plain 0/1 integers.
        encoded = pd.get_dummies(typed, drop_first=True).astype(int)

        return pd.concat([X.drop(columns=columns), encoded], axis=1)

In [7]:
class CorrelationFilter(BaseEstimator, TransformerMixin):
    """Drop one feature out of every pair of highly correlated features.

    Two features form a pair when the absolute value of their correlation
    exceeds ``threshold``. From each pair, the feature whose absolute
    correlation with the target is lower is marked for removal (the second
    feature of the pair on ties).

    Args:
        threshold: Absolute correlation above which two features count as
            redundant.

    Attributes:
        features_to_drop: Names of the features removed by ``transform``, in
            the order in which they were first marked.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.features_to_drop = []

    def fit(self, X, y):
        """Determine which features to drop.

        Args:
            X: DataFrame of features.
            y: Target values. A Series is aligned with ``X`` by index; any other
                array-like is matched to the rows of ``X`` by position.

        Returns:
            The fitted filter (``self``).
        """
        # Series.corr needs a Series on the other side.
        if not isinstance(y, pd.Series):
            y = pd.Series(np.ravel(y), index=X.index)

        corr_matrix = X.corr().abs()
        names = corr_matrix.columns

        # Strict upper triangle: every unordered pair once, diagonal excluded.
        # np.nonzero walks the matrix row by row, i.e. in (i, j > i) order.
        above = np.triu(corr_matrix.to_numpy() > self.threshold, k=1)
        first_idx, second_idx = np.nonzero(above)

        # A feature can appear in several pairs; compute its target correlation once.
        target_corr = {}

        def target_strength(name):
            if name not in target_corr:
                target_corr[name] = abs(X[name].corr(y))
            return target_corr[name]

        # A dict is used as an ordered set so the resulting list is deterministic.
        marked = {}
        for i, j in zip(first_idx, second_idx):
            feat1, feat2 = names[i], names[j]
            weaker = feat1 if target_strength(feat1) < target_strength(feat2) else feat2
            marked[weaker] = None

        self.features_to_drop = list(marked)
        return self

    def transform(self, X):
        """Return ``X`` without the features selected for removal by ``fit``.

        Raises:
            KeyError: If one of the features to drop is not a column of ``X``.
        """
        return X.drop(columns=self.features_to_drop)

In [8]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

class FeatureSelectorRFE(BaseEstimator, TransformerMixin):
    """Keep only the columns chosen by recursive feature elimination (RFE).

    Args:
        n_features: Number of features RFE is asked to keep.
        estimator: Estimator handed to RFE; a ``DecisionTreeRegressor`` with
            default settings is used when omitted.

    Attributes:
        selected_features: Names of the kept columns, or ``None`` before the
            first ``fit``.

    NOTE(review): the default tree is created without a ``random_state``, so the
    selection may differ between runs — confirm whether that is acceptable.
    """

    def __init__(self, n_features=30, estimator=None):
        self.n_features = n_features
        if estimator is None:
            estimator = DecisionTreeRegressor()
        self.estimator = estimator
        self.selected_features = None

    def fit(self, X, y):
        """Run RFE on ``(X, y)`` and remember the names of the surviving columns.

        A selection made by an earlier ``fit`` on this instance is kept as-is;
        RFE is not run again.
        """
        if self.selected_features is not None:
            return self

        selector = RFE(self.estimator, n_features_to_select=self.n_features)
        selector.fit(X, y)
        kept_columns = X.columns[selector.support_]
        self.selected_features = kept_columns.tolist()
        return self

    def transform(self, X):
        """Return the selected columns of ``X``, in the order they were stored."""
        return X.loc[:, self.selected_features]

# Preparing the train and test features for the best model

In [9]:
# Split the training frame into target and features and strip the identifier column
# from both feature frames. `train_df` and `df` (the test split) are the frames read
# in the data-loading cell above; they are not modified in between, so train.csv does
# not need to be read a second time.
y_train = train_df['SalePrice']
X_train = train_df.drop(columns=['SalePrice', 'Id'])
X_test = df.drop(columns=['Id'])

# Columns to one-hot encode: everything that is not numeric. Testing for "numeric"
# rather than `dtype == 'object'` also catches text columns stored in one of pandas'
# dedicated string dtypes.
cat_cols = [col for col, dtype in X_train.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)]

In [10]:
# Preprocessing: every step is fitted on the training data only, then the fitted
# steps are applied to the test data.
cleaner = DataCleaner()
encoder = CategoricalEncoder(one_hot_columns=cat_cols)
corr_filter = CorrelationFilter()
feature_selector = FeatureSelectorRFE(n_features=best_parameters['feature_selector__n_features'])

X_train_clean = cleaner.fit_transform(X_train)
X_train_encoded = encoder.fit_transform(X_train_clean)
X_train_corr_filtered = corr_filter.fit_transform(X_train_encoded, y_train)
X_train_selected = feature_selector.fit_transform(X_train_corr_filtered, y_train)

X_test_clean = cleaner.transform(X_test)
# A category that occurs in only one of the two splits would give the test frame a
# different set of dummy columns. Align it to the training layout: dummies missing from
# the test data become all-zero columns, dummies unknown to training are dropped. The
# steps below then always find the columns they were fitted on.
X_test_encoded = encoder.transform(X_test_clean).reindex(
    columns=X_train_encoded.columns, fill_value=0
)
# Go through the fitted transformers instead of re-implementing their logic by hand.
X_test_corr_filtered = corr_filter.transform(X_test_encoded)
X_test_selected = feature_selector.transform(X_test_corr_filtered)

# Predicting on the test set with the best model

In [11]:
from sklearn.pipeline import Pipeline
import mlflow.sklearn
# Load the pipeline that was logged to MLflow under the run referenced below.
logged_model = 'runs:/7702289a9828469bad55790437f20e5b/best_model'
pipeline = mlflow.sklearn.load_model(logged_model)

# Pull out the step named 'regressor'; a Pipeline can be indexed by step name.
regressor = pipeline['regressor']

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Predict on the prepared test features.
preds = regressor.predict(X_test_selected)

# Pair every prediction with the Id of its test row (columns: Id, SalePrice).
submission = df[['Id']].assign(SalePrice=preds)

# Write the result without the DataFrame index column.
submission.to_csv('submission_file.csv', index=False)