In [5]:
import pandas as pd
import numpy as np
import math

training_set = pd.read_csv('crime-train.txt', delimiter = '\t')
testing_set = pd.read_csv('crime-test.txt', delimiter = '\t')

print("training_set shape:", training_set.shape)
print("testing_set shape:", testing_set.shape)

training_set shape: (1595, 96)
testing_set shape: (399, 96)


In [8]:
# expected outcomes (1st column) in y_train
y_train = training_set['ViolentCrimesPerPop']
print("y_train shape:", y_train.shape)

# rest for prediction in x_train
x_train = training_set.drop('ViolentCrimesPerPop',axis=1)
print("x_train shape:", x_train.shape)

# convert values from string to float
x_train = np.float64(x_train)
y_train = np.float64(y_train)

# append 1s to the end of x_train
ones = np.ones(len(x_train))
x_train = np.column_stack((x_train, ones))

y_train shape: (1595,)
x_train shape: (1595, 95)


In [11]:
y_test = training_set['ViolentCrimesPerPop']
print("y_test shape:", y_test.shape)
x_test = training_set.drop('ViolentCrimesPerPop',axis=1)
print("x_test shape:", x_test.shape)

x_test = np.float64(x_test)
y_test = np.float64(y_test)

ones = np.ones(len(x_test))
x_test = np.column_stack((x_test, ones))

y_test shape: (1595,)
x_test shape: (1595, 95)


In [12]:
# function that computs RMSE (Root Mean Square Error) using predicted outcome matrix and the actual outcome matrix
def RMSE(prediction, actual):
    n = len(prediction)
    diff = prediction - actual
    total = 0
    
    for instance in diff:
        total += instance ** 2
        
    total_error = math.sqrt(total/n)
    return total_error

In [23]:
# performs linear regression and return list of predicted outcomes that each correspond to their actual values
def problem1(samples):
    part1 = np.linalg.inv(np.dot(x_train.T, x_train))
    part2 = np.dot(x_train.T, y_train)
    theta = np.dot(part1, part2)
    prediction = []
    for xtest in samples:
        prediction.append(np.dot(xtest.T, theta))
        
    return prediction

# making predictions with linear regression and calculate RMSE for training & testing data
train_linear_prediction = problem1(x_train)
# print(train_linear_prediction)
train_linear_RMSE = RMSE(train_linear_prediction, y_train)
print("Training Linear Regression RMSE:", train_linear_RMSE)

test_linear_prediction = problem1(x_test)
# print(test_linear_prediction)
test_linear_RMSE = RMSE(test_linear_prediction, y_test)
print("Testing Linear Regression RMSE:", test_linear_RMSE)

Training Linear Regression RMSE: 0.12768967421762184
Testing Linear Regression RMSE: 0.12768967421762184


In [25]:
# performs ridge regression and return list of predicted outcomes that each correspond to their actual values
def problem2(samples):
    lambda_val = 100
    part1 = np.linalg.inv(np.dot(x_train.T, x_train) + (lambda_val * np.identity(len(x_train.T))))
    part2 = np.dot(x_train.T, y_train)
    theta = np.dot(part1, part2)
    prediction = []
    for xtest in samples:
        prediction.append(np.dot(xtest.T, theta))
        
    return prediction

# make predictions with ridge regression and calculate RMSE for training & testing data
train_ridge_prediction = problem2(x_train)
# print(train_ridge_prediction)
train_ridge_RMSE = RMSE(train_ridge_prediction, y_train)
print("Training Ridge Regression RMSE:", train_ridge_RMSE)

test_ridge_prediction = problem2(x_test)
# print(test_ridge_prediction)
test_ridge_RMSE = RMSE(test_ridge_prediction, y_test)
print("Testing Ridge Regression RMSE:", test_ridge_RMSE)

Training Ridge Regression RMSE: 0.1313432042461578
Testing Ridge Regression RMSE: 0.1313432042461578


**RMSE Values**

Linear Regression
- Training: 0.12768967421762184
- Test:     0.12768967421762184

Ridge Regression:
- Training: 0.1313432042461578
- Test:     0.1313432042461578