In [403]:
import pandas as pd
import numpy as np

# =========================
#  Utility Functions
# =========================

def handle_missing_mean(df):
    """Return a copy of ``df`` with NaNs in non-object columns replaced by the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy, in line
        with the other helpers in this cell (``min_max_scale``,
        ``remove_outliers_iqr``).

    Returns
    -------
    pandas.DataFrame
        A new frame. Columns whose dtype is ``object`` are left untouched.
    """
    # Copy first so the caller's frame (possibly a view/slice of another
    # frame) is never written to.
    filled = df.copy()
    for col in filled.columns:
        if filled[col].dtype != 'object':
            filled[col] = filled[col].fillna(filled[col].mean())
    return filled


def min_max_scale(df):
    """Return a copy of ``df`` with every non-object column rescaled to [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Scaled copy. Object-dtype columns are passed through unchanged. A
        column whose values are all equal is mapped to 0.0 (NaNs stay NaN).
    """
    scaled_df = df.copy()
    for col in scaled_df.columns:
        if scaled_df[col].dtype != 'object':
            min_val = scaled_df[col].min()
            max_val = scaled_df[col].max()
            value_range = max_val - min_val
            if value_range == 0:
                # Constant column: the usual formula would compute 0/0 and
                # turn every value into NaN. Map it to 0.0 instead.
                scaled_df[col] = (scaled_df[col] - min_val).astype(float)
            else:
                scaled_df[col] = (scaled_df[col] - min_val) / value_range
    return scaled_df

def remove_outliers_iqr(df):
    """Drop rows lying outside the 1.5 * IQR fences of any non-object column.

    Columns are processed one after another in column order, and each
    column's quartiles are computed on the rows that survived the previous
    columns' filters. The input frame is left untouched.

    Returns the filtered copy; the surviving rows keep their original index
    labels.
    """
    kept = df.copy()
    # Row filtering does not change dtypes, so the candidate columns can be
    # decided once up front.
    candidates = [name for name in kept.columns if kept[name].dtype != 'object']
    for name in candidates:
        values = kept[name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = 1.5 * (q3 - q1)
        inside = (values >= q1 - margin) & (values <= q3 + margin)
        kept = kept[inside]
    return kept

def mse(y_true, y_pred):
    """Mean squared error between two equally shaped array-likes.

    Inputs are converted to float NumPy arrays first, so plain lists are
    accepted and pandas Series are compared position by position rather than
    being aligned on their index labels.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

def r2_score(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    Inputs are converted to float NumPy arrays, so lists are accepted and
    pandas Series are compared by position.

    When ``y_true`` is constant (``SS_tot == 0``) the ratio is undefined.
    Instead of returning NaN or -inf, this returns 1.0 if the predictions are
    exact and 0.0 otherwise.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Zero variance in the target: avoid dividing by zero.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

# Gradient Descent Implementation
def gradient_descent(X, y, lr=0.01, epochs=1000):
    """Fit linear-regression coefficients with full-batch gradient descent.

    ``X`` is a 2-D array (any intercept column must already be part of it),
    ``y`` the matching targets. Starting from an all-zero vector, each epoch
    moves the coefficients by ``lr * (1/m) * X^T (X @ theta - y)`` where
    ``m`` is the number of rows. Returns the coefficient vector.
    """
    n_samples, n_features = X.shape
    theta = np.zeros(n_features)
    for _epoch in range(epochs):
        residual = X.dot(theta) - y
        theta -= lr * (1 / n_samples) * X.T.dot(residual)
    return theta

def predict(X, theta):
    """Return the linear model's outputs: the product of ``X`` and ``theta``."""
    fitted = X.dot(theta)
    return fitted

# =========================
#  HOUSING DATASET
# =========================
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# where that exact file exists.
housing = pd.read_csv(r"C:\Users\LENOVO\Downloads\Housing.csv") 

# Drop categorical variables for simple regression
# (only numeric columns are kept; non-numeric columns are discarded, not encoded)
housing = housing.select_dtypes(include=[np.number])

# Fill NaNs with column means, then min-max scale every remaining column.
# The target `price` is scaled too, so the MSE printed below is in scaled
# units rather than the original price units.
housing = handle_missing_mean(housing)
housing_scaled = min_max_scale(housing)

# Split features & target
X = housing_scaled.drop(columns=["price"]).values
y = housing_scaled["price"].values
X = np.c_[np.ones(X.shape[0]), X]  # bias term

# Train before removing outliers
# The model is fitted and scored on the same rows (no train/test split here),
# so these are training-set metrics.
theta_before = gradient_descent(X, y, lr=0.1, epochs=2000)
y_pred_before = predict(X, theta_before)

print("Before Outlier Removal -> MSE:", mse(y, y_pred_before), "R²:", r2_score(y, y_pred_before))

# Remove outliers
# The filter runs over every column of housing_scaled, including the target
# `price`, and the surviving rows are not rescaled afterwards.
housing_no_outliers = remove_outliers_iqr(housing_scaled)

# Re-split
X_no = housing_no_outliers.drop(columns=["price"]).values
y_no = housing_no_outliers["price"].values
X_no = np.c_[np.ones(X_no.shape[0]), X_no]

theta_after = gradient_descent(X_no, y_no, lr=0.1, epochs=2000)
y_pred_after = predict(X_no, theta_after)

# NOTE(review): these metrics are computed on a different (filtered) set of
# rows than the "before" metrics, so the two lines are not a like-for-like
# comparison.
print("After Outlier Removal  -> MSE:", mse(y_no, y_pred_after), "R²:", r2_score(y_no, y_pred_after))


Before Outlier Removal -> MSE: 0.011478378238310955 R²: 0.5615150232988959
After Outlier Removal  -> MSE: 0.00740010503628017 R²: 0.3635936809174899


In [405]:
import pandas as pd
import numpy as np

# ------------------------
# 1. Load the dataset
# ------------------------
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# where that exact file exists.
df = pd.read_csv(r"C:\Users\LENOVO\Downloads\advertising.csv")  # Ensure CSV has 'TV', 'Radio', 'Newspaper', 'Sales'

print("First 5 rows:\n", df.head())
print("\nDataset Info:")
# df.info() writes its report itself and returns None, so wrapping it in
# print() also emits a trailing "None" line (visible in the cell output).
print(df.info())

# ------------------------
# 2. Handle missing values (replace with mean)
# ------------------------
for col in df.columns:
    if df[col].isnull().sum() > 0:
        # Assign the filled column back to the frame. Calling
        # fillna(..., inplace=True) on df[col] acts on an intermediate object
        # and does not reliably update `df` (deprecated chained assignment;
        # a no-op under pandas Copy-on-Write).
        df[col] = df[col].fillna(df[col].mean())

# ------------------------
# 3. Min-Max Normalization (0-1 scaling)
# ------------------------
def min_max_scale(col):
    """Rescale a numeric column to [0, 1] via ``(x - min) / (max - min)``.

    ``col`` is a pandas Series (or NumPy array). If every value is identical
    the range is zero and the formula would yield NaN for the whole column;
    in that case the column is mapped to 0.0 instead.
    """
    lowest = col.min()
    span = col.max() - lowest
    if span == 0:
        # Constant column: avoid 0/0.
        return (col - lowest).astype(float)
    return (col - lowest) / span

# Scale a copy so the unscaled `df` stays available.
df_scaled = df.copy()
# columns[:-1] selects every column except the last one, so this relies on the
# target being the final column of the CSV; the target itself is left unscaled.
for col in df_scaled.columns[:-1]:  # Exclude target column 'Sales'
    df_scaled[col] = min_max_scale(df_scaled[col])

# ------------------------
# Helper functions for Linear Regression from scratch
# ------------------------
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split ``X`` and ``y`` into train and test parts.

    Parameters
    ----------
    X, y : array-like
        Features and targets with the same number of rows. They are converted
        to NumPy arrays so positional indexing works for any array-like.
    test_size : float, default 0.2
        Fraction of rows placed in the test part; the train part receives
        ``int(len(X) * (1 - test_size))`` rows.
    random_state : int or None, default None
        Seed for a reproducible shuffle. With ``None`` the global NumPy
        random state is used, as before, so results differ between runs
        unless the caller seeds it.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    idx = np.arange(len(X))
    if random_state is None:
        np.random.shuffle(idx)
    else:
        # A dedicated generator keeps the split reproducible without touching
        # the global random state.
        np.random.default_rng(random_state).shuffle(idx)
    split = int(len(X) * (1 - test_size))
    return X[idx[:split]], X[idx[split:]], y[idx[:split]], y[idx[split:]]

def mse(y_true, y_pred):
    """Mean squared error between two equally shaped array-likes.

    Inputs are converted to float NumPy arrays first, so plain lists are
    accepted and pandas Series are compared position by position rather than
    being aligned on their index labels.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

def r2_score(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_total``.

    Inputs are converted to float NumPy arrays, so lists are accepted and
    pandas Series are compared by position.

    When ``y_true`` is constant (``SS_total == 0``) the ratio is undefined.
    Instead of returning NaN or -inf, this returns 1.0 if the predictions are
    exact and 0.0 otherwise.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_res = np.sum((y_true - y_pred) ** 2)
    if ss_total == 0:
        # Zero variance in the target: avoid dividing by zero.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_total)

def gradient_descent(X, y, lr=0.01, epochs=1000):
    """Fit a linear model ``X @ weights + bias`` with full-batch gradient descent.

    Both parameters start at zero. Each epoch computes the gradients
    ``(-2/m) * X^T (y - y_hat)`` and ``(-2/m) * sum(y - y_hat)`` (``m`` is the
    number of rows) and moves each parameter by ``lr`` times its gradient.

    Returns the tuple ``(weights, bias)``.
    """
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0

    for _epoch in range(epochs):
        residual = y - (np.dot(X, weights) + bias)
        grad_w = (-2 / n_samples) * np.dot(X.T, residual)
        grad_b = (-2 / n_samples) * np.sum(residual)

        weights -= lr * grad_w
        bias -= lr * grad_b

    return weights, bias

def predict(X, weights, bias):
    """Return the linear model's outputs: ``X`` times ``weights``, plus ``bias``."""
    linear_term = np.dot(X, weights)
    return linear_term + bias

# ------------------------
# 4. Train before outlier removal
# ------------------------
# Features are the min-max scaled TV/Radio/Newspaper columns. 'Sales' was
# excluded from scaling, so the MSE below is in squared 'Sales' units.
X = df_scaled[['TV', 'Radio', 'Newspaper']].values
y = df_scaled['Sales'].values

# The split shuffles with NumPy's global random state and no seed is set in
# the visible code, so the train/test rows (and the metrics) vary per run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

weights, bias = gradient_descent(X_train, y_train, lr=0.01, epochs=5000)
y_pred = predict(X_test, weights, bias)

print("\nBefore Outlier Removal -> MSE:", mse(y_test, y_pred), "R²:", r2_score(y_test, y_pred))

# ------------------------
# 5. Detect and Remove Outliers using IQR
# ------------------------
def remove_outliers(df):
    """Drop rows outside the 1.5 * IQR fences of any column except the last.

    The last column is treated as the target and never filtered on. The
    remaining columns are handled in order, and each column's quartiles are
    computed on the rows that survived the previous columns' filters. The
    input frame is not modified; the filtered copy is returned.
    """
    kept = df.copy()
    feature_names = kept.columns[:-1]  # Exclude target
    for name in feature_names:
        values = kept[name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = 1.5 * (q3 - q1)
        inside = (values >= q1 - margin) & (values <= q3 + margin)
        kept = kept[inside]
    return kept

# Filter on the scaled feature columns only; the last column ('Sales') is
# skipped by remove_outliers.
df_no_outliers = remove_outliers(df_scaled)

# ------------------------
# 6. Train after outlier removal
# ------------------------
# X, y, the split variables, weights, bias and y_pred are all rebound here,
# replacing the values from the "before" run.
X = df_no_outliers[['TV', 'Radio', 'Newspaper']].values
y = df_no_outliers['Sales'].values

# NOTE(review): this is a fresh random shuffle, so the "after" metrics are
# computed on different test rows than the "before" metrics; part of any
# difference between the two printed lines may come from the split itself.
X_train, X_test, y_train, y_test = train_test_split(X, y)

weights, bias = gradient_descent(X_train, y_train, lr=0.01, epochs=5000)
y_pred = predict(X_test, weights, bias)

print("After Outlier Removal  -> MSE:", mse(y_test, y_pred), "R²:", r2_score(y_test, y_pred))


First 5 rows:
       TV  Radio  Newspaper  Sales
0  230.1   37.8       69.2   22.1
1   44.5   39.3       45.1   10.4
2   17.2   45.9       69.3   12.0
3  151.5   41.3       58.5   16.5
4  180.8   10.8       58.4   17.9

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB
None

Before Outlier Removal -> MSE: 2.552028658496755 R²: 0.9114486959562746
After Outlier Removal  -> MSE: 2.549316884744545 R²: 0.9219527784297882
