In [1]:
import pandas as pd
import numpy as np
from numpy.random import randint

class LReg:
    """Minimal ordinary-least-squares linear regression driven by a CSV file.

    Intended call order: ``load_data`` -> ``split_data`` -> ``fit`` -> ``predict``.
    """

    def __init__(self):
        """Create an instance with no data loaded and no fitted weights."""
        self._df_usa_house = pd.DataFrame()
        self._x_num_cols = []
        self._y_num_cols = []
        self._train_data = None
        self._test_data = None
        self._weight = None
        self._intercept = 0.0

    def load_data(self, path_to_file, y_col="Price"):
        """Read a CSV and keep only its numeric columns.

        Parameters
        ----------
        path_to_file : str or file-like
            Passed straight to ``pandas.read_csv``.
        y_col : str
            Name of the target column. Every other numeric column becomes a
            feature.

        Raises
        ------
        ValueError
            If ``y_col`` is not among the numeric columns of the file.
        """
        numeric_df = pd.read_csv(path_to_file).select_dtypes(include=np.number)
        if y_col not in numeric_df.columns:
            raise ValueError(
                "Target column %r is not a numeric column of the loaded data"
                % (y_col,)
            )

        self._df_usa_house = numeric_df
        self._x_num_cols = [name for name in numeric_df.columns if name != y_col]
        # Assign rather than append: appending made a second load_data() call
        # accumulate duplicate target columns.
        self._y_num_cols = [y_col]

    def split_data(self, ratio="8:2", seed=None):
        """Randomly split the loaded rows into train and test sets.

        Parameters
        ----------
        ratio : str
            ``"train:test"`` weights, e.g. ``"8:2"``, ``"80:20"`` or ``"3:1"``.
            The train fraction is ``train / (train + test)``.
        seed : int or None
            When given, the shuffle uses a dedicated generator seeded with
            this value, so the split is repeatable. When ``None`` the shuffle
            draws from NumPy's global random state (affected by
            ``np.random.seed``).

        Returns
        -------
        tuple of numpy.ndarray
            ``(X_train, X_test, y_train, y_test)``.

        Raises
        ------
        ValueError
            If ``ratio`` is not two non-negative numbers separated by ``:``
            or both weights are zero.
        """
        parts = ratio.split(":")
        if len(parts) != 2:
            raise ValueError("ratio must look like 'train:test', got %r" % (ratio,))
        train_w, test_w = float(parts[0]), float(parts[1])
        total_w = train_w + test_w
        if train_w < 0 or test_w < 0 or total_w <= 0:
            raise ValueError("ratio weights must be non-negative and not both zero")

        n_rows = self._df_usa_house.shape[0]
        # Normalise by the sum of both weights instead of a hard-coded 10, so
        # ratios such as "80:20" no longer put every row in the training set.
        train_part = int(n_rows * train_w / total_w)

        idxes = np.arange(n_rows)
        if seed is None:
            np.random.shuffle(idxes)
        else:
            np.random.default_rng(seed).shuffle(idxes)

        self._train_data = self._df_usa_house.iloc[idxes[:train_part]]
        self._test_data = self._df_usa_house.iloc[idxes[train_part:]]

        X_train = self._train_data[self._x_num_cols].to_numpy()
        y_train = self._train_data[self._y_num_cols].to_numpy()

        X_test = self._test_data[self._x_num_cols].to_numpy()
        y_test = self._test_data[self._y_num_cols].to_numpy()

        assert X_train.shape[0] == y_train.shape[0]
        assert X_test.shape[0] == y_test.shape[0]

        print("Train: %d" %X_train.shape[0])
        print("Test: %d" %X_test.shape[0])

        return X_train, X_test, y_train, y_test

    def fit(self, X_train, y_train, fit_intercept=False):
        """Fit weights that minimise the sum of squared errors.

        Parameters
        ----------
        X_train : array-like, shape (n_samples, n_features)
        y_train : array-like, shape (n_samples,) or (n_samples, n_targets)
        fit_intercept : bool
            When True, a constant term is estimated as well and stored in
            ``self._intercept``. The default (False) fits a model through the
            origin.
        """
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float)
        if fit_intercept:
            X = np.column_stack([np.ones(X.shape[0]), X])

        # Least-squares solve instead of explicitly inverting X^T X: same
        # solution for full-rank X, but better conditioned and it does not
        # raise when features are collinear.
        coef = np.linalg.lstsq(X, y, rcond=None)[0]

        if fit_intercept:
            self._intercept = coef[0]
            self._weight = coef[1:]
        else:
            self._intercept = 0.0
            self._weight = coef

    def predict(self, X_test, y_test):
        """Print RMSE and MAE of the fitted model on the given data.

        Nothing is returned; the two metrics are written to stdout.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self._weight is None:
            raise RuntimeError("fit() must be called before predict()")

        predicted = np.dot(X_test, self._weight) + self._intercept
        # Align y with the predictions so a 1-D y_test cannot silently
        # broadcast against a column vector into an (n, n) matrix.
        actual = np.asarray(y_test).reshape(predicted.shape)
        residuals = predicted - actual

        rmse = np.sqrt(np.mean(residuals ** 2))
        mae = np.mean(np.abs(residuals))
        print("RMSE: ", rmse)
        print("MAE: ", mae)

In [2]:
# Data can be downloaded from here:
# https://www.kaggle.com/aariyan101/usa-housingcsv
# NOTE: that URL is the dataset's web page rather than a direct link to the
# CSV, so download the file from the page and save it at DATA_PATH.
DATA_PATH = "/tmp/USA_Housing.csv"
SEED = 42

# split_data shuffles rows with NumPy's global random state; seeding it makes
# the train/test split, and therefore the printed metrics, repeatable.
np.random.seed(SEED)

LR = LReg()
LR.load_data(DATA_PATH)
X_train, X_test, y_train, y_test = LR.split_data()
LR.fit(X_train, y_train)
LR.predict(X_test, y_test)

Train: 4000
Test: 1000
RMSE:  244707.12817922133
MAE:  195365.34641944533
