In [2]:
# Import linear regression from sklearn and time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import time

# Load the Ames housing data (path is relative to the notebook's working directory).
df = pd.read_csv('../data/AmesHousing.csv')


# Prepare X (feature) and y (target).
# The double brackets keep X two-dimensional (one column), which is the
# feature-matrix layout scikit-learn estimators expect.
X = df[['Overall Qual']].values
y = df['SalePrice'].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create model and measure training time
model = LinearRegression()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is a wall clock that can be too coarse for a fit that
# finishes in well under a millisecond.
start_time = time.perf_counter()
model.fit(X_train, y_train)
end_time = time.perf_counter()

training_time = end_time - start_time

# Get the score (R²) on both partitions so over/under-fitting is visible.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Training time: {training_time:.4f} seconds")
print(f"Training R² score: {train_score:.3f}")
print(f"Testing R² score: {test_score:.3f}")

Training time: 0.0009 seconds
Training R² score: 0.632
Testing R² score: 0.651


In [3]:
import numpy as np
import time

class SimpleLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` the coefficients are stored in ``self.weights``: the first
    entry is the intercept and the remaining entries are the per-feature
    coefficients, in column order of ``X``.
    """

    @staticmethod
    def _add_bias(X):
        """Return ``X`` with a leading column of ones (the intercept term)."""
        X = np.asarray(X)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate the least-squares weights for ``X`` -> ``y``.

        Parameters
        ----------
        X : array-like, one row per sample.
        y : array-like of targets, one per sample.

        Returns
        -------
        self, so calls can be chained.
        """
        X_b = self._add_bias(X)

        # Solve min ||X_b w - y|| directly instead of evaluating the normal
        # equation w = (X^T X)^(-1) X^T y. Explicitly inverting X^T X raises
        # LinAlgError when the design matrix is rank deficient (e.g. a
        # constant feature duplicating the bias column) and amplifies
        # rounding error; lstsq returns the minimum-norm solution instead.
        self.weights = np.linalg.lstsq(X_b, np.asarray(y), rcond=None)[0]
        return self

    def predict(self, X):
        """Return predictions for ``X`` using the weights learned by ``fit``."""
        return self._add_bias(X).dot(self.weights)

    def score(self, X, y):
        """Return the coefficient of determination R² of the predictions.

        Computed as ``1 - u / v`` where ``u`` is the residual sum of squares
        and ``v`` is the total sum of squares of ``y`` around its mean. If
        ``y`` is constant, ``v`` is zero and the result is not finite.
        """
        y = np.asarray(y)
        y_pred = self.predict(X)
        u = ((y - y_pred) ** 2).sum()
        v = ((y - y.mean()) ** 2).sum()
        return 1 - (u/v)

# Split data function
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition ``X`` and ``y`` into train and test subsets.

    Note: this definition shadows the ``train_test_split`` imported from
    scikit-learn earlier in the notebook.

    Parameters
    ----------
    X, y : array-likes with the same number of rows.
    test_size : fraction of rows placed in the test subset; the test count
        is ``int(n_samples * test_size)`` (rounded down).
    random_state : optional seed. When given, the shuffle is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test as NumPy arrays.

    Raises
    ------
    ValueError if ``X`` and ``y`` have different lengths.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # A private RandomState yields the same permutation that
    # np.random.seed(random_state) would, without reseeding the process-wide
    # NumPy generator as a side effect. With no seed, keep drawing from the
    # global generator as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_samples = len(X)
    n_test = int(n_samples * test_size)
    indices = rng.permutation(n_samples)
    test_idx, train_idx = indices[:n_test], indices[n_test:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Use the same data preparation as before (relies on `df` loaded in the earlier cell)
X = df[['Overall Qual']].values
y = df['SalePrice'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train model
model = SimpleLinearRegression()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is a wall clock that can be too coarse for a fit that
# finishes in well under a millisecond.
start_time = time.perf_counter()
model.fit(X_train, y_train)
end_time = time.perf_counter()

training_time = end_time - start_time

# Get the scores
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Training time: {training_time:.4f} seconds")
print(f"Training R² score: {train_score:.3f}")
print(f"Testing R² score: {test_score:.3f}")

Training time: 0.0007 seconds
Training R² score: 0.632
Testing R² score: 0.651


In [4]:
import random
import time

def train_test_split(X, y, test_size=0.2, random_state=None):
    """Shuffle paired samples and split them into train and test lists.

    Pure-Python version; it redefines the ``train_test_split`` name used by
    earlier cells of the notebook.

    Parameters
    ----------
    X, y : iterables of equal length; element ``i`` of each forms one sample.
    test_size : fraction of samples placed in the test lists. The first
        ``int(n * (1 - test_size))`` shuffled samples become the train set.
    random_state : optional seed. When given, the shuffle is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test as plain lists.

    Raises
    ------
    ValueError if ``X`` and ``y`` have different lengths.
    """
    X = list(X)
    y = list(y)
    # zip() would silently drop the surplus of the longer input.
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # A private Random instance shuffles exactly as random.seed(random_state)
    # followed by random.shuffle would, without reseeding the module-level
    # generator as a side effect. With no seed, keep using the global one.
    rng = random if random_state is None else random.Random(random_state)

    data = list(zip(X, y))
    rng.shuffle(data)

    split_idx = int(len(data) * (1 - test_size))
    train_data = data[:split_idx]
    test_data = data[split_idx:]

    X_train = [x_val for x_val, _ in train_data]
    y_train = [y_val for _, y_val in train_data]
    X_test = [x_val for x_val, _ in test_data]
    y_test = [y_val for _, y_val in test_data]

    return X_train, X_test, y_train, y_test

class LinearRegression:
    """Single-feature least-squares regression written in pure Python.

    Note: this definition shadows the ``LinearRegression`` imported from
    scikit-learn earlier in the notebook.

    After ``fit`` the model is ``y = slope * x + intercept`` with the
    coefficients stored in ``self.slope`` and ``self.intercept``.
    """

    def fit(self, X, y):
        """Fit slope and intercept from paired sequences ``X`` and ``y``.

        Returns ``self`` so calls can be chained.

        Raises
        ------
        ZeroDivisionError (with plain Python numbers) if the input is empty
        or every value in ``X`` is identical, since the slope is undefined.
        """
        # Calculate means
        x_mean = sum(X) / len(X)
        y_mean = sum(y) / len(y)

        # Calculate coefficients: slope = cov(x, y) / var(x).
        # Loop variables are named xi/yi so they do not shadow the y argument.
        numerator = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(X, y))
        denominator = sum((xi - x_mean) ** 2 for xi in X)

        self.slope = numerator / denominator
        self.intercept = y_mean - self.slope * x_mean
        return self

    def predict(self, X):
        """Return a list with the predicted value for each element of ``X``."""
        return [self.slope * x + self.intercept for x in X]

    def score(self, X, y):
        """Return the coefficient of determination R² for ``X`` against ``y``.

        Computed as ``1 - ss_res / ss_tot``. Raises ZeroDivisionError (with
        plain Python numbers) when ``y`` is constant or empty.
        """
        y_pred = self.predict(X)
        # Compute the mean once; evaluating it inside the generator would make
        # the total-sum-of-squares pass quadratic in len(y).
        y_mean = sum(y) / len(y)
        ss_res = sum((yi - fi) ** 2 for yi, fi in zip(y, y_pred))
        ss_tot = sum((yi - y_mean) ** 2 for yi in y)
        return 1 - (ss_res / ss_tot)

# These are pandas Series taken from `df` (loaded in an earlier cell), not lists;
# the pure-Python helpers above only iterate over them, and train_test_split
# hands back plain lists for the model to consume.
X = df['Overall Qual']  # feature values
y = df['SalePrice']     # target values

# Split the data.
# NOTE: this split is shuffled with the `random` module rather than NumPy, so
# the train/test partition (and hence the test score) need not match the
# earlier cells even though the seed is the same.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create model and measure training time
model = LinearRegression()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is a wall clock that can be too coarse for a fit that
# finishes in well under a millisecond.
start_time = time.perf_counter()
model.fit(X_train, y_train)
end_time = time.perf_counter()

training_time = end_time - start_time

# Get the score (R²)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Training time: {training_time:.4f} seconds")
print(f"Training R² score: {train_score:.3f}")
print(f"Testing R² score: {test_score:.3f}")

Training time: 0.0006 seconds
Training R² score: 0.632
Testing R² score: 0.664
