In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install dagshub mlflow

In [None]:
import dagshub
# Initialise the DagsHub client for the gchit21/ML repository with the MLflow integration switched on.
# NOTE(review): presumably this points MLflow tracking at the DagsHub repo and may prompt for credentials — confirm.
dagshub.init(repo_owner="gchit21",repo_name = "ML",mlflow=True)

In [None]:

# Notebook-wide pandas display settings: no limit on the number of columns shown,
# no truncation of cell contents, and no wrapping of wide frames across several blocks.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

# Dataset Info Gathering

Data information gathering

In [None]:
# Load the competition training set (comma-separated) into a DataFrame.
df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df["Alley"].isnull().sum()

In [None]:
df["PoolQC"].isnull().sum()

In [None]:
df["Fence"].isnull().sum()

In [None]:
df["MiscFeature"].isnull().sum()

Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features WITHOUT mutating df, so this cell can be
# re-run on the same kernel (an in-place drop makes df["SalePrice"] fail the second time).
y = df["SalePrice"]
X_full = df.drop(columns="SalePrice")

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.2, random_state=42)

# Correlation function code

Function for identifying highly correlated columns to remove

In [None]:
def corr_matrix(X, y, threshold=0.8):
    """Find highly correlated feature pairs and choose one feature per pair to drop.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Only numeric and boolean columns take part in the
        correlation analysis; other columns are ignored.
    y : pandas.Series
        Target values. They are correlated with individual features via
        ``Series.corr``, which aligns the two series on their index.
    threshold : float, default 0.8
        A pair is reported when the absolute value of its pairwise
        correlation is strictly greater than this value.

    Returns
    -------
    features_to_drop : list
        Unique feature names, in order of first appearance. For each reported
        pair, the feature whose absolute correlation with ``y`` is lower is
        listed; when the comparison is not strictly lower (a tie, or a NaN
        correlation) the second feature of the pair is listed.
    high_corr_pairs : list of tuple
        ``(feature_a, feature_b, correlation)`` for every reported pair, with
        ``feature_a`` appearing before ``feature_b`` in the column order.
    """
    # Restrict to numeric/bool columns so string columns cannot make .corr() raise.
    numeric = X.select_dtypes(include=["number", "bool"])
    pairwise = numeric.corr()
    names = pairwise.columns
    values = pairwise.to_numpy()
    n_cols = len(names)

    # Scan the upper triangle only: each unordered pair is visited exactly once.
    high_corr_pairs = [
        (names[i], names[j], values[i, j])
        for i in range(n_cols)
        for j in range(i + 1, n_cols)
        if abs(values[i, j]) > threshold
    ]

    # Correlation with the target is computed once per feature, not once per pair.
    target_corr = {}
    for feat1, feat2, _ in high_corr_pairs:
        for feat in (feat1, feat2):
            if feat not in target_corr:
                target_corr[feat] = abs(numeric[feat].corr(y))

    # From each pair, mark the feature that is less correlated with the target.
    weaker = [
        feat1 if target_corr[feat1] < target_corr[feat2] else feat2
        for feat1, feat2, _ in high_corr_pairs
    ]

    # De-duplicate while keeping a deterministic (first-seen) order.
    features_to_drop = list(dict.fromkeys(weaker))

    return features_to_drop, high_corr_pairs


# data cleaning

Cleaning and feature engineering

In [None]:
# Impute missing values with statistics learned from the TRAINING split only and
# apply the same fill values to the test split, so no information from the
# held-out rows leaks into preprocessing.
fill_values = {}
for col in X_train.columns:
    col_dtype = X_train[col].dtype
    if col_dtype == "object":
        # Categorical column: fill with the most frequent training value.
        modes = X_train[col].mode()
        if not modes.empty:  # an all-NaN column has no mode; leave it untouched
            fill_values[col] = modes.iloc[0]
    elif col_dtype in ["int64", "float64"]:
        # Numeric column: fill with the training mean.
        fill_values[col] = X_train[col].mean()

# fillna returns new frames, so the split outputs are not mutated in place.
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)
        

In [None]:
pd.set_option('display.max_rows', 10)
X_train.isnull().sum()

Drop unnecessary columns (those that are more than 95% NaN or that give no additional information).

In [None]:
# Columns removed from both splits. The list is defined once and applied to the
# train and test sets alike, so the two frames cannot drift apart.
COLUMNS_TO_DROP = [
    "Alley", "PoolQC",
    "Fence", "MiscFeature",
    "MasVnrType", "MasVnrArea",
    "FireplaceQu", "Fireplaces",
    "Id",
    "MiscVal",
    "LowQualFinSF",
    "PoolArea",
    "3SsnPorch",
]

# errors="ignore" keeps the cell re-runnable: columns already removed by a
# previous execution are skipped instead of raising KeyError.
X_train = X_train.drop(columns=COLUMNS_TO_DROP, errors="ignore")
X_test = X_test.drop(columns=COLUMNS_TO_DROP, errors="ignore")


**Get categorical columns**

Get categorical columns

In [None]:
# Split the object-dtype columns (dtype as recorded in df) that are present in
# X_train into two groups by the number of distinct values seen in X_train:
# at most 3 distinct values, or more than 3.
Cat_columns_less3, Cat_columns_more3 = [], []
for col in X_train.columns:
    if df[col].dtype != 'object':
        continue
    if X_train[col].nunique() <= 3:
        Cat_columns_less3.append(col)
    else:
        Cat_columns_more3.append(col)

In [None]:
len(Cat_columns_less3)

In [None]:
len(Cat_columns_more3)

**One-Hot Encoding and WOE Encoding**

In [None]:
from category_encoders.woe import WOEEncoder

In [None]:
# WOE encoding needs a binary target: label each training price 1 when it is
# at or above the training median and 0 otherwise.
median = y_train.median()
y_train_cp = y_train.ge(median).astype(int)

WOE encoder

In [None]:
woe_encoder = WOEEncoder(cols=Cat_columns_more3)

# Encode into copies so X_train / X_test keep their original categories and this
# cell gives the same result every time it is executed.
X_train_woe = X_train.copy()
X_test_woe = X_test.copy()

# The encoder is fitted on the training split (against the binarised target)
# and then only applied to the test split.
X_train_woe[Cat_columns_more3] = woe_encoder.fit_transform(X_train[Cat_columns_more3], y_train_cp)
X_test_woe[Cat_columns_more3] = woe_encoder.transform(X_test[Cat_columns_more3])

# One-hot encode the low-cardinality categoricals; drop_first removes the first
# level of each column.
train_encoded = pd.get_dummies(X_train_woe, columns=Cat_columns_less3, drop_first=True, dtype=int)
test_encoded = pd.get_dummies(X_test_woe, columns=Cat_columns_less3, drop_first=True, dtype=int)


In [None]:
# get_dummies created this column; according to the author it is filled with zeros
# and carries no information, so it is dropped from the training frame.
# errors="ignore" lets the cell be re-run after the column is already gone.
train_encoded = train_encoded.drop(columns="Utilities_NoSeWa", errors="ignore")

In [None]:
train_encoded.corr().abs()

In [None]:
# Pick, from every feature pair whose absolute correlation exceeds 0.7, the
# feature that is less correlated with the target, and remove it from training.
features, high = corr_matrix(train_encoded, y_train, threshold=0.7)
train_encoded_cp = train_encoded.drop(columns=features)

# Align the test frame to exactly the training columns (same set, same order).
# A dummy column that never occurred in the test split is created and filled
# with 0; any column that exists only in the test split is discarded.
test_encoded_cp = test_encoded.reindex(columns=train_encoded_cp.columns, fill_value=0)


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Scaler applied to the encoded train/test feature frames in the next cell.
scaler = StandardScaler()
# Alternative min-max scaler; not referenced by any other cell visible in this notebook.
scalerMinMax = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same transform
# to the test features. The original row index is carried over so the scaled
# frames stay index-aligned with y_train / y_test.
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_encoded_cp),
    columns=train_encoded_cp.columns,
    index=train_encoded_cp.index,
)
test_scaled = pd.DataFrame(
    scaler.transform(test_encoded_cp),
    columns=test_encoded_cp.columns,
    index=test_encoded_cp.index,
)


# Training and Mlflow logging

In [None]:
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,mean_absolute_percentage_error
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_selection import RFE

**Linear Model**

Training and Mlflow Logging

In [None]:
experiment_name = "LinearRegression"
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name="Linear_Regression"):

    # Plain least-squares model; used as the RFE estimator and as the final model.
    linModel = LinearRegression()

    # Recursive feature elimination down to 15 features, removing 2 per iteration.
    rfeLin = RFE(estimator=linModel, n_features_to_select=15, step=2)
    train_rfe = rfeLin.fit_transform(train_scaled, y_train)
    test_rfe = rfeLin.transform(test_scaled)
    rfe_selected_features = train_scaled.columns[rfeLin.support_].tolist()

    # Fit on the selected training features and predict the held-out rows.
    linModel.fit(train_rfe, y_train)
    y_pred = linModel.predict(test_rfe)

    # Evaluation metrics on the held-out split. Predictions go through np.abs
    # before the log-based metric so that no negative value reaches it.
    r2 = r2_score(y_test, y_pred)
    rmsle = np.sqrt(mean_squared_log_error(y_test, np.abs(y_pred)))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Run parameters, logged one by one in this order.
    run_params = {
        "model_type": "Linear_Regression",
        "scaling_method": "StandardScaler",
        "correlation_threshold": 0.7,
        "selected_features_n": 15,
        "rfe_step": 2,
        "selected_features": rfe_selected_features,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Run metrics, logged one by one in this order.
    run_metrics = {"R2": r2, "RMSLE": rmsle, "RMSE": rmse, "MAPE": mape}
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted model as a run artifact and register it.
    mlflow.sklearn.log_model(linModel, artifact_path="model", registered_model_name="house_price_best_model")

   

**Lasso Model**

In [None]:
from sklearn.linear_model import Lasso
alphas = [10, 5, 1]
experiment_name = "LinearRegression"
mlflow.set_experiment(experiment_name)

# Settings are named once so the values logged to MLflow are exactly the values
# the model and RFE are built with (previously max_iter was logged as 1000
# while the model used 10000).
lasso_max_iter = 10000
lasso_random_state = 42
lasso_n_features = 15
lasso_rfe_step = 1

# One MLflow run per regularisation strength.
for alpha in alphas:
    with mlflow.start_run(run_name=f"Lasso_alpha_{alpha}"):

        #model
        lasModel = Lasso(alpha=alpha, random_state=lasso_random_state, max_iter=lasso_max_iter)

        #RFE
        rfeLas = RFE(estimator=lasModel, n_features_to_select=lasso_n_features, step=lasso_rfe_step)
        train_rfe = rfeLas.fit_transform(train_scaled, y_train)
        test_rfe = rfeLas.transform(test_scaled)
        rfe_selected_features = train_scaled.columns[rfeLas.support_].tolist()

        #model fit and predict
        lasModel.fit(train_rfe, y_train)
        y_pred = lasModel.predict(test_rfe)

        #metrics (predictions go through np.abs so no negative value reaches the log-based metric)
        r2 = r2_score(y_test, y_pred)
        rmsle = np.sqrt(mean_squared_log_error(y_test, np.abs(y_pred)))
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)

        #Mlflow logs -- keys follow the lower-case naming used by the Linear_Regression run
        mlflow.log_param("model_type", "Lasso")
        mlflow.log_param("max_iter", lasso_max_iter)
        mlflow.log_param("random_state", lasso_random_state)
        mlflow.log_param("scaling_method", "StandardScaler")
        mlflow.log_param("alpha", alpha)
        # 0.7 is the threshold passed to corr_matrix when the feature set was built.
        mlflow.log_param("correlation_threshold", 0.7)
        mlflow.log_param("selected_features_n", lasso_n_features)
        mlflow.log_param("rfe_step", lasso_rfe_step)
        mlflow.log_param("selected_features", rfe_selected_features)
        mlflow.log_metric("R2", r2)
        mlflow.log_metric("RMSLE", rmsle)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAPE", mape)


       

    

**Ridge Model**

In [None]:
from sklearn.linear_model import Ridge
alphas = [0.1, 0.05, 0.01]
experiment_name = "LinearRegression"
mlflow.set_experiment(experiment_name)

# Settings are named once so the values logged to MLflow are exactly the values
# the model and RFE are built with (previously max_iter was logged as 1000
# while the model used 10000).
ridge_max_iter = 10000
ridge_random_state = 42
ridge_n_features = 15
ridge_rfe_step = 1

# One MLflow run per regularisation strength.
for alpha in alphas:
    with mlflow.start_run(run_name=f"Ridge_alpha_{alpha}"):

        #model
        ridgeModel = Ridge(alpha=alpha, random_state=ridge_random_state, max_iter=ridge_max_iter)

        #RFE
        rfeRidge = RFE(estimator=ridgeModel, n_features_to_select=ridge_n_features, step=ridge_rfe_step)
        train_rfe = rfeRidge.fit_transform(train_scaled, y_train)
        test_rfe = rfeRidge.transform(test_scaled)
        rfe_selected_features = train_scaled.columns[rfeRidge.support_].tolist()

        #model fit and predict
        ridgeModel.fit(train_rfe, y_train)
        y_pred = ridgeModel.predict(test_rfe)

        #metrics (predictions go through np.abs so no negative value reaches the log-based metric)
        r2 = r2_score(y_test, y_pred)
        rmsle = np.sqrt(mean_squared_log_error(y_test, np.abs(y_pred)))
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)

        #Mlflow Logs -- keys follow the lower-case naming used by the Linear_Regression run
        mlflow.log_param("model_type", "Ridge")
        mlflow.log_param("max_iter", ridge_max_iter)
        mlflow.log_param("random_state", ridge_random_state)
        mlflow.log_param("scaling_method", "StandardScaler")
        mlflow.log_param("alpha", alpha)
        # 0.7 is the threshold passed to corr_matrix when the feature set was built.
        mlflow.log_param("correlation_threshold", 0.7)
        mlflow.log_param("selected_features_n", ridge_n_features)
        mlflow.log_param("rfe_step", ridge_rfe_step)
        mlflow.log_param("selected_features", rfe_selected_features)
        mlflow.log_metric("R2", r2)
        mlflow.log_metric("RMSLE", rmsle)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAPE", mape)