In [None]:
# Import Data Manipulation Libraries
import numpy as np
import pandas as pd

# Import Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Import Filter Warning Libraries
import warnings
warnings.filterwarnings(action = 'ignore')

# Import Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder,RobustScaler
from sklearn.decomposition import PCA

# Import Metrics for Regression and Classification
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Import Pipelines and Column Transformers
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Import OrderedDict for maintaining the order of columns in ColumnTransformer
from collections import OrderedDict

# Import scipy and statsmodels for statistical tests
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Import Logging
import logging
logging.basicConfig(level = logging.INFO,
                    format = '%(asctime)s - %(levelname)s - %(message)s',
                    filemode = 'w',
                    filename = 'model.log',force = True)

# Import Machine Learning Models
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor
from sklearn.cluster import KMeans


: 

In [None]:
# Load the dataset with pandas.
# URL taken from GitHub. NOTE: `url` is still a blank placeholder; fill in the
# raw CSV location before running, otherwise read_csv cannot find the file.
url = '  '
df = pd.read_csv(url)

# Shuffle the rows once and KEEP the result: DataFrame.sample returns a new
# frame, so without the assignment the shuffle is thrown away after display.
# A fixed seed makes Restart & Run All reproduce the same row order, and
# reset_index(drop=True) keeps a clean 0..n-1 index so the later index-based
# join of the PCA frame with df['target'] still lines up row for row.
df = df.sample(frac=1, random_state=3).reset_index(drop=True)

# Preview only, instead of rendering the entire frame.
df.head()

In [None]:
# Checking Dataset Information: print the summary of df produced by DataFrame.info()
df.info()

In [None]:
# Descriptive statistics and IQR-based outlier flag for every numeric column.
#
# The rows are collected in `stats_rows` (not `stats`) so the `stats` module
# imported from scipy in the first cell is not shadowed by a list.
# Columns are picked with select_dtypes(include='number'), the same selector the
# outlier-replacement cell uses, so booleans, datetimes, categoricals and string
# dtypes never reach mean()/quantile().
REPORT_COLUMNS = [
    'Feature', 'Minimum', 'Maximum', 'Mean', 'Mode', '25%', '75%', 'IQR',
    'Standard Deviation', 'Skewness', 'Kurtosis', 'Outlier Comment',
]

stats_rows = []

for col in df.select_dtypes(include='number').columns:
    series = df[col]

    # Each quantile is computed once and reused for the IQR and the whiskers.
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr   # LW : Lower Whisker Line
    upper_whisker = q3 + 1.5 * iqr   # UW : Upper Whisker Line
    has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

    # mode() can return several values (ties) or none (all-NaN column);
    # report the first one, or None when there is none.
    modes = series.mode()

    # A plain dict keeps insertion order, which fixes the column order below.
    stats_rows.append({
        'Feature': col,
        'Minimum': series.min(),
        'Maximum': series.max(),
        'Mean': series.mean(),
        'Mode': modes.iloc[0] if not modes.empty else None,
        '25%': q1,
        '75%': q3,
        'IQR': iqr,
        'Standard Deviation': series.std(),
        'Skewness': series.skew(),
        'Kurtosis': series.kurt(),
        'Outlier Comment': "Has Outliers" if has_outliers else "No Outliers",
    })

# Build the frame once; passing the column list keeps the header even when the
# dataset has no numeric columns at all.
report = pd.DataFrame(stats_rows, columns=REPORT_COLUMNS)

# Checking Report
report

In [None]:
# Boxen plot of df, drawn on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxenplot(data=df, ax=ax)

ax.set_title("Boxen Plot of Numerical Features")
plt.xticks(rotation=90)  # turn the x tick labels vertical
fig.tight_layout()
plt.show()

In [None]:
# Replace outliers with the column median (IQR rule).
# NOTE: this cell rewrites `df` in place, so running it a second time works on
# the already-treated data and may flag a different set of points.

for col in df.select_dtypes(include='number').columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Boolean mask of the rows lying outside the whiskers
    outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
    outlier_count = outliers.sum()

    if outlier_count > 0:
        replacement = df[col].median()

        # An integer column cannot hold a fractional median (e.g. 2.5). Writing
        # it anyway is an incompatible-dtype assignment, which pandas has
        # deprecated, and the warning is hidden by the notebook's warning filter.
        # Convert the column to float first so the assignment is well defined.
        if pd.api.types.is_integer_dtype(df[col]) and not float(replacement).is_integer():
            df[col] = df[col].astype('float64')

        df.loc[outliers, col] = replacement
        print(f"Replaced {outlier_count} outliers in '{col}' with median.")
    else:
        print(f"No outliers found in '{col}'.")

In [None]:
# Horizontal boxplot of df after the outlier treatment, on an explicit axes
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df, orient='h', palette='Set2', ax=ax)

ax.set_title('Boxplot After Outlier Treatment')
fig.tight_layout()
plt.show()

In [None]:
# Checking Dataset Columns: display the column labels of df as the cell output
df.columns

In [None]:
# Name of the target column.
# NOTE(review): this is still a blank placeholder and the visible cells never read
# it; they pass the literal 'target' instead. Confirm the real column name and
# use either this variable or the literal consistently.
target = ' '

# Checking VIF:
def calculate_vif(dataset):
    """Return the variance inflation factor of every column of `dataset`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are scored; `dataset.values` is handed to
        `variance_inflation_factor` together with each column position.

    Returns
    -------
    pandas.DataFrame
        Columns 'features' and 'VIF_Values' (rounded to 2 decimals), sorted
        by 'VIF_Values' from highest to lowest.

    NOTE(review): the matrix is passed as-is, with no constant column added;
    confirm that VIFs without an intercept are what is intended here.
    """
    column_count = dataset.shape[1]
    scores = [variance_inflation_factor(dataset.values, position) for position in range(column_count)]

    table = pd.DataFrame({'features': dataset.columns, 'VIF_Values': scores})
    table['VIF_Values'] = table['VIF_Values'].round(2)
    return table.sort_values(by='VIF_Values', ascending=False)

# Display the VIF table for every column of df except the one literally named 'target'
calculate_vif(df.drop('target',axis = 1))

In [None]:
# Step 1: Scale the features to the [0, 1] range (MinMaxScaler)
#
# Only the predictors go through scaling and PCA. Fitting on the whole frame
# would mix the 'target' column into the principal components and leak it into
# every model later trained on pca_df.
# NOTE(review): the scaler and PCA are still fitted on all rows before the
# train/test split happens downstream; confirm whether that is acceptable.
features = df.drop(columns='target')

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Step 2: Determine number of components to retain 90% variance
#
# A single full PCA fit yields the variance ratio of every component, so the
# cumulative curve is computed once instead of refitting per candidate size.
VARIANCE_TO_KEEP = 0.90

cumulative_evr = np.cumsum(PCA().fit(X_scaled).explained_variance_ratio_)
# Index of the first cumulative value >= threshold, converted to a count.
# Capped at the number of available components as a safeguard.
pcs = min(int(np.searchsorted(cumulative_evr, VARIANCE_TO_KEEP)) + 1, len(cumulative_evr))
evr = cumulative_evr[:pcs]   # cumulative ratios of the retained components

print("Explained Variance Ratio:", evr)
print("Number of components selected:", pcs)

# Step 3: Apply PCA

pca = PCA(n_components=pcs)
pca_data = pca.fit_transform(X_scaled)

# Step 4: Create DataFrame
# Reuse df's index so the join below matches rows by construction rather than
# relying on df happening to carry a default 0..n-1 index.

pca_columns = [f'PC{j+1}' for j in range(pcs)]
pca_df = pd.DataFrame(pca_data, columns=pca_columns, index=df.index)

# Step 5: Join Target Column with PCA:

pca_df = pca_df.join(df['target'], how = 'left')

pca_df.head()

In [None]:
def train_and_test_split(data, tcol, testSize=0.3, randomState=3):
    """Separate `data` into features/target and delegate to train_test_split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    tcol : str
        Label of the target column; it is removed from the features.
    testSize : float, default 0.3
        Forwarded as `test_size`.
    randomState : int, default 3
        Forwarded as `random_state`.

    Returns
    -------
    Whatever `train_test_split` returns for (features, target).
    """
    target_values = data[tcol]
    feature_frame = data.drop(tcol, axis=1)
    return train_test_split(
        feature_frame,
        target_values,
        test_size=testSize,
        random_state=randomState,
    )

def model_builder(model_name, model, data, t_col):
    """Fit `model` on a train split of `data` and score it on the test split.

    Parameters
    ----------
    model_name : str
        Label placed in the first slot of the result.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted in place.
    data : pandas.DataFrame
        Frame passed to `train_and_test_split` together with `t_col`.
    t_col : str
        Label of the target column.

    Returns
    -------
    list
        `[model_name, rmse, r2]` computed on the test split.
    """
    features_train, features_test, target_train, target_test = train_and_test_split(data, t_col)

    model.fit(features_train, target_train)
    predictions = model.predict(features_test)

    r2_value = r2_score(target_test, predictions)
    rmse_value = np.sqrt(mean_squared_error(target_test, predictions))
    return [model_name, rmse_value, r2_value]

# Single-model check: display [name, rmse, r2] for a plain LinearRegression fitted on pca_df
model_builder(model_name='LinearRegression',model=LinearRegression(),data=pca_df,t_col='target')

In [None]:
def multiple_models(data, t_col):
    """Fit a fixed set of regressors via `model_builder` and rank them by R².

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the features and the target column.
    t_col : str
        Label of the target column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model Name', 'rmse', 'r2_score', one row per model, sorted by
        'r2_score' from highest to lowest.
    """
    # (display name, estimator class). Each estimator is instantiated with its
    # defaults right before it is fitted, in this order.
    candidates = [
        ('Linear Regression', LinearRegression),
        ('Lasso', Lasso),
        ('Ridge', Ridge),
        ('Decision Tree', DecisionTreeRegressor),
        ('SVR', SVR),
        ('KNN', KNeighborsRegressor),
        ('Random Forest', RandomForestRegressor),
        ('Gradient Boost', GradientBoostingRegressor),
        ('ADA Boost', AdaBoostRegressor),
        ('XG Boost', XGBRegressor),
    ]

    # Collect all rows first and build the frame once, rather than enlarging
    # an empty frame row by row. This also gives the metric columns a numeric
    # dtype from the start.
    rows = [model_builder(name, estimator_cls(), data, t_col) for name, estimator_cls in candidates]
    result = pd.DataFrame(rows, columns=['Model Name', 'rmse', 'r2_score'])
    return result.sort_values(by = 'r2_score', ascending=False)

# Display the ranking table produced by multiple_models for pca_df with target column 'target'
multiple_models(pca_df, 'target')

In [None]:
def k_fold_cv(X, y , fold= 10):
    """Cross-validate a fixed set of default-configured regressors.

    Parameters
    ----------
    X, y
        Features and target, forwarded unchanged to `cross_val_score`.
    fold : int, default 10
        Forwarded as `cv`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model Name', 'CV Accuracy' (mean of the fold scores) and
        'CV STD' (their standard deviation), sorted by 'CV Accuracy' from
        highest to lowest.
    """
    # (label shown in the table, estimator class); evaluated in this order.
    lineup = [
        ('Linear Regression', LinearRegression),
        ('Lasso', Lasso),
        ('Ridge', Ridge),
        ('DTR', DecisionTreeRegressor),
        ('SVR', SVR),
        ('KNN', KNeighborsRegressor),
        ('Random Forest', RandomForestRegressor),
        ('Gradient Boost', GradientBoostingRegressor),
        ('ADA Boost', AdaBoostRegressor),
        ('XG', XGBRegressor),
    ]

    # Run every cross-validation first, then summarise each score array.
    fold_scores = [cross_val_score(estimator_cls(), X, y, cv = fold) for _, estimator_cls in lineup]

    rows = [
        [label, np.mean(scores), np.std(scores)]
        for (label, _), scores in zip(lineup, fold_scores)
    ]
    summary = pd.DataFrame(rows, columns = ['Model Name','CV Accuracy','CV STD'])
    return summary.sort_values('CV Accuracy', ascending = False)


# Display the k-fold summary for the PCA features (all pca_df columns except 'target') against 'target'
k_fold_cv(pca_df.drop('target',axis=1), pca_df['target'])

In [None]:
def tuning(X, y , fold = 10):
    """Grid-search a fixed parameter grid for each regressor and report the winners.

    Parameters
    ----------
    X, y
        Features and target, forwarded unchanged to `GridSearchCV.fit`.
    fold : int, default 10
        Forwarded to `GridSearchCV` as `cv`.

    Returns
    -------
    dict
        Maps each model label ('Lasso', 'Ridge', 'KNN', 'DTR', 'SVR', 'ADBR',
        'GBR', 'XGBR', 'RFR') to that search's `best_params_`. The same
        information is also printed as each search finishes.
    """
    # Parameter grids for tuning
    # NOTE(review): the alpha grid contains 0 and is shared by Lasso and Ridge;
    # confirm that an unregularised Lasso candidate is intended.
    param_LAS = {'alpha' : [1e-15, 1e-13, 1e-11, 1e-9,1e-7, 1e-5,1e-3, 1e-1, 0,1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,200,300,400,500]}
    param_KNN = {'n_neighbors' : [1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100]}
    # max_features: None (use all features) replaces the former 'auto' option,
    # which current scikit-learn releases no longer accept for these regressors.
    # NOTE(review): the integer choices assume X has at least 6 columns.
    param_DTR = {'max_depth' : [3,5,7,9,10,12,14,16] , 'max_features' : [None, 'log2', 'sqrt', 2,3,4,5,6]}
    param_SVR = {'gamma' : ['scale' , 'auto'], 'C' : [0.5 , 1]}
    param_ADB = {'learning_rate' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}
    # NOTE(review): per the scikit-learn docs `alpha` is only used by the huber
    # and quantile losses; confirm this grid does what is intended with the
    # default loss.
    param_GB = {'alpha' : [0.1,0.3,0.5,0.9]}
    param_XGB = {'eta' : [0.1,0.2,0.3,0.4,0.5], 'max_depth' : [3,5,7,9,10,12,14,15,16], 'gamma' : [0,10,20,30,40,50,60,70,80,90,100,200,300,400,500], 'reg_lambda' : [0,1]}
    param_RF = {'max_depth' : [3,5,7,8,9,10,12,14,16], 'max_features' : [None, 'log2', 'sqrt', 2,3,4,5,6]}

    # (label, estimator class, grid); searched in this order.
    searches = [
        ('Lasso', Lasso, param_LAS),
        ('Ridge', Ridge, param_LAS),
        ('KNN', KNeighborsRegressor, param_KNN),
        ('DTR', DecisionTreeRegressor, param_DTR),
        ('SVR', SVR, param_SVR),
        ('ADBR', AdaBoostRegressor, param_ADB),
        ('GBR', GradientBoostingRegressor, param_GB),
        ('XGBR', XGBRegressor, param_XGB),
        ('RFR', RandomForestRegressor, param_RF),
    ]

    best_params = {}
    for label, estimator_cls, grid in searches:
        # Hyperparameter tuning: fit one grid search per model and report it
        # immediately, so progress is visible during the long-running searches.
        search = GridSearchCV(estimator_cls(), grid, cv = fold)
        search.fit(X, y)
        best_params[label] = search.best_params_
        print('models:', label)
        print('best parameters :', search.best_params_)

    return best_params

    
# Run the grid searches on the PCA features (all pca_df columns except 'target') against 'target'
tuning(pca_df.drop('target',axis=1), pca_df['target'])

In [None]:
def cv_post_hpt(X,y, fold = 5):
    """Cross-validate the regressors with the hyperparameters hard-coded below.

    Parameters
    ----------
    X, y
        Features and target, forwarded unchanged to `cross_val_score`.
    fold : int, default 5
        Forwarded as `cv`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model Name', 'CV Accuracy' (mean of the fold scores) and
        'CV STD' (their standard deviation), sorted by 'CV Accuracy' from
        highest to lowest.

    NOTE(review): the hyperparameter values are literals in this function;
    presumably they were copied from an earlier tuning run. Verify they are
    still current.
    """
    # (label shown in the table, estimator class, constructor keyword arguments);
    # evaluated in this order.
    lineup = [
        ('Linear Regression', LinearRegression, {}),
        ('Lasso', Lasso, {'alpha': 0.1}),
        ('Ridge', Ridge, {'alpha': 6}),
        ('DTR', DecisionTreeRegressor, {'max_depth': 16}),
        ('SVR', SVR, {'C': 1}),
        ('Random Forest', RandomForestRegressor, {'max_depth': 14, 'max_features': 4}),
        ('KNN', KNeighborsRegressor, {'n_neighbors': 4}),
        ('Gboost', GradientBoostingRegressor, {'alpha': 0.9}),
        ('XGBoost', XGBRegressor, {'eta': 0.2, 'max_depth': 5, 'reg_lambda': 0, 'gamma': 0}),
        ('AdaBoost', AdaBoostRegressor, {'learning_rate': 1}),
    ]

    # Run every cross-validation first, then summarise each score array.
    fold_scores = [
        cross_val_score(estimator_cls(**settings), X, y, cv = fold)
        for _, estimator_cls, settings in lineup
    ]

    rows = [
        [entry[0], np.mean(scores), np.std(scores)]
        for entry, scores in zip(lineup, fold_scores)
    ]
    summary = pd.DataFrame(rows, columns = ['Model Name' , 'CV Accuracy' , 'CV STD'])
    return summary.sort_values('CV Accuracy', ascending = False)


# Display the cross-validation summary for the hard-coded hyperparameters on the PCA features vs. 'target'
cv_post_hpt(pca_df.drop('target',axis=1), pca_df['target'])

In [None]:
# Plotting learning curves: training vs. cross-validation R² as the training
# set grows, one panel per model.
# NOTE(review): the hyperparameters below are literals; confirm they are the
# values that should be inspected.
models = {
    'Lasso': Lasso(alpha=1),
    'Ridge': Ridge(alpha=1),
    'KNN': KNeighborsRegressor(n_neighbors=5),
    'DTR': DecisionTreeRegressor(max_depth=5),
    'SVR': SVR(C=1, gamma='scale'),
    'ADBR': AdaBoostRegressor(learning_rate=1),
    'GBR': GradientBoostingRegressor(alpha=0.1),
    'XGBR': XGBRegressor(eta=0.1, max_depth=3, verbosity=0),
    'RFR': RandomForestRegressor(max_depth=5, max_features='sqrt')
}

X = pca_df.drop('target', axis=1)
y = pca_df['target']

# Two panels per row; the row count follows the number of models so adding or
# removing an entry above does not break the grid.
n_cols = 2
n_rows = (len(models) + n_cols - 1) // n_cols

plt.figure(figsize=(18, 20))
for i, (name, model) in enumerate(models.items()):
    ax = plt.subplot(n_rows, n_cols, i + 1)

    # Scores are arrays of shape (len(train_sizes), n_cv_folds).
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5),
        scoring='r2'
    )

    # Average over the folds to get one point per training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    ax.plot(train_sizes, train_scores_mean, 'o-', label='Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', label='Cross-validation score')
    ax.set_title(f'Learning Curve: {name}')
    ax.set_xlabel('Training Set Size')
    ax.set_ylabel('R² Score')   # was the mis-encoded 'RÂ² Score'
    ax.legend(loc='best')
    ax.grid()

plt.tight_layout()
plt.show()