In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

def least_square_beta(X, y):
    X = np.insert(X, 0, 1, axis=1)
    beta = np.linalg.inv(X.T @ X) @ X.T @ y
    return beta

df = pd.read_csv('/content/USA_Housing.csv')

X = df.drop('Price', axis=1).values
y = df['Price'].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = []
betas = []
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    beta = least_square_beta(X_train, y_train)
    betas.append(beta)
    y_pred = np.insert(X_test, 0, 1, axis=1) @ beta
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"R2 Score: {r2}")

best_beta = betas[np.argmax(r2_scores)]
print(f"\nBest Beta: {best_beta}")

X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
y_pred_final = np.insert(X_test_final, 0, 1, axis=1) @ best_beta
final_r2 = r2_score(y_test_final, y_pred_final)
print(f"Final R2 Score on 30% test data with best beta: {final_r2}")


R2 Score: 0.9179971706985147
R2 Score: 0.9145677884802818
R2 Score: 0.9116116385364478
R2 Score: 0.9193091764960816
R2 Score: 0.9243869413350316

Best Beta: [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]
Final R2 Score on 30% test data with best beta: 0.9147458156636434


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score


def gradient_descent(X, y, learning_rate, iterations):
    m, n = X.shape
    X_b = np.insert(X, 0, 1, axis=1)  # Add intercept term
    beta = np.zeros(n + 1)

    for _ in range(iterations):
        gradients = (2/m) * X_b.T @ (X_b @ beta - y)
        beta -= learning_rate * gradients

    return beta

df = pd.read_csv('/content/USA_Housing.csv')


X = df.drop('Price', axis=1).values
y = df['Price'].values


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.68, random_state=42)

learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    print(f"\nTraining with learning rate: {lr}")
    beta = gradient_descent(X_train, y_train, lr, 1000)

    X_val_b = np.insert(X_val, 0, 1, axis=1)
    y_pred_val = X_val_b @ beta
    r2_val = r2_score(y_val, y_pred_val)

    X_test_b = np.insert(X_test, 0, 1, axis=1)
    y_pred_test = X_test_b @ beta
    r2_test = r2_score(y_test, y_pred_test)

    results.append({'learning_rate': lr, 'beta': beta, 'R2_val': r2_val, 'R2_test': r2_test})
    print(f"R2 Score on Validation Set: {r2_val}")
    print(f"R2 Score on Test Set: {r2_test}")

best_result = max(results, key=lambda x: x['R2_val'])
print("\nBest Regression Coefficients based on Validation Set R2 Score:")
print(f"Learning Rate: {best_result['learning_rate']}")
print(f"Coefficients (Beta): {best_result['beta']}")
print(f"R2 Score on Validation Set: {best_result['R2_val']}")
print(f"R2 Score on Test Set: {best_result['R2_test']}")


Training with learning rate: 0.001
R2 Score on Validation Set: 0.6467117844424869
R2 Score on Test Set: 0.6531360260800088

Training with learning rate: 0.01
R2 Score on Validation Set: 0.9202206893493433
R2 Score on Test Set: 0.9133419052066929

Training with learning rate: 0.1
R2 Score on Validation Set: 0.9202207766800662
R2 Score on Test Set: 0.9133419747998835

Training with learning rate: 1
R2 Score on Validation Set: -inf
R2 Score on Test Set: -inf

Best Regression Coefficients based on Validation Set R2 Score:
Learning Rate: 0.1
Coefficients (Beta): [1232180.27200919  230645.88389435  165328.94019375  120045.00851908
    2945.02108903  151375.22971285]
R2 Score on Validation Set: 0.9202207766800662
R2 Score on Test Set: 0.9133419747998835


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
                "num_doors", "body_style", "drive_wheels", "engine_location",
                "wheel_base", "length", "width", "height", "curb_weight",
                "engine_type", "num_cylinders", "engine_size", "fuel_system",
                "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
                "city_mpg", "highway_mpg", "price"]

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data',
                 names=column_names, na_values='?')

for col in ['normalized_losses', 'horsepower', 'peak_rpm', 'price', 'bore', 'stroke']:
    imputer = SimpleImputer(strategy='mean')
    df[col] = imputer.fit_transform(df[[col]])

df.dropna(subset=['price'], inplace=True)

door_map = {'two': 2, 'four': 4}
df['num_doors'] = df['num_doors'].map(door_map).fillna(4)

cylinder_map = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
df['num_cylinders'] = df['num_cylinders'].map(cylinder_map)

df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'], drop_first=True)

label_encoder = LabelEncoder()
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    df[col] = label_encoder.fit_transform(df[col])

df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x) else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in str(x) else 0)

X = df.drop('price', axis=1)
y = df['price']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)
r2_before_pca = r2_score(y_test, y_pred)
print(f"R2 Score before PCA: {r2_before_pca}")

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

linear_reg_pca = LinearRegression()
linear_reg_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = linear_reg_pca.predict(X_test_pca)
r2_after_pca = r2_score(y_test_pca, y_pred_pca)
print(f"R2 Score after PCA: {r2_after_pca}")

if r2_after_pca > r2_before_pca:
    print("\nPerformance improved after PCA.")
elif r2_after_pca < r2_before_pca:
    print("\nPerformance decreased after PCA.")
else:
    print("\nPerformance remained the same after PCA.")


R2 Score before PCA: 0.804442243576259
R2 Score after PCA: 0.7500675882701553

Performance decreased after PCA.
