In [4]:
import numpy as np
import pandas as pd

In [5]:
    df = pd.read_csv("linear_regression_train.csv")
    
    # Assuming the last column is the target (y), and the rest are features (X)
    # X = df.iloc[:, 1:-1].values  # All columns except the last one
    X = df.iloc[:, 1:-1].values
    y = df.iloc[:, -1].values   # The last column
    
    # Add a column of ones to X for the intercept term (this will now be the "bias")
    
X.shape


(48000, 25)

In [6]:
X

array([[ -10.42983652,   21.68425064,   93.05632423, ...,  -99.42351759,
         -41.15065424,  113.11719747],
       [  12.47508125,   11.65201123,  -47.62188868, ...,  -97.69430387,
          13.81611278,    8.71189399],
       [   7.69618811,   24.72954661,   43.78238349, ..., -100.87486091,
         -38.8955983 ,   34.4883343 ],
       ...,
       [  13.87629202,   26.97679336,   86.95226218, ..., -102.18093317,
         -63.05664522,  109.05468792],
       [  -4.4227684 ,   19.5222766 ,   61.59642015, ...,  -85.61169692,
           2.95671083,    4.26772866],
       [   3.86605987,   15.22206782,  -54.96995473, ...,  -97.34426689,
        -136.72256202,   37.2040866 ]])

In [7]:
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X = (X - X_mean) / X_std  # Standardization
X

array([[-1.69864257,  0.25314326,  1.13869653, ..., -0.22286562,
        -0.47196937,  2.27252813],
       [ 0.67201663, -1.49897152, -2.0985691 , ...,  0.15226832,
         0.88583556, -0.21087511],
       [ 0.17740114,  0.78499939,  0.00481211, ..., -0.53771862,
        -0.41626433,  0.40224792],
       ...,
       [ 0.81704195,  1.17747749,  0.99823079, ..., -0.82105666,
        -1.01309748,  2.17589655],
       [-1.07691097, -0.1244421 ,  0.41474601, ...,  2.77345726,
         0.61758348, -0.31658482],
       [-0.21901717, -0.87546677, -2.26766168, ...,  0.228205  ,
        -2.83281419,  0.46684529]])

In [8]:
# Add a column of ones to X
X = np.hstack((np.ones((X.shape[0], 1)), X))
X

array([[ 1.        , -1.69864257,  0.25314326, ..., -0.22286562,
        -0.47196937,  2.27252813],
       [ 1.        ,  0.67201663, -1.49897152, ...,  0.15226832,
         0.88583556, -0.21087511],
       [ 1.        ,  0.17740114,  0.78499939, ..., -0.53771862,
        -0.41626433,  0.40224792],
       ...,
       [ 1.        ,  0.81704195,  1.17747749, ..., -0.82105666,
        -1.01309748,  2.17589655],
       [ 1.        , -1.07691097, -0.1244421 , ...,  2.77345726,
         0.61758348, -0.31658482],
       [ 1.        , -0.21901717, -0.87546677, ...,  0.228205  ,
        -2.83281419,  0.46684529]])

In [9]:
# Hyperparameters
learning_rate = 0.1    
epsilon = 0.00000001  # Convergence threshold
max_iterations = 10000000  # Max iterations to prevent infinite loop in case of non-convergence
m = len(y)
w = np.random.randn(X.shape[1]) * 0.01
prev_cost = 0


In [10]:
 for i in range(max_iterations):
        # Predicted values
        y_pred = X.dot(w)
        
        # Compute cost (Mean Squared Error)
        cost = (1 / (2 * m)) * np.sum((y_pred - y) ** 2)
        
        # Check for convergence: if the change in cost is smaller than epsilon, stop
        if abs( prev_cost - cost ) < epsilon:
            print(f"Convergence reached at iteration {i}, Cost: {cost}")
            break
        
        # Compute the gradient (derivative of cost function)
        gradient = (1 / m) * X.T.dot(y_pred - y)
        
        # Update weights using gradient descent rule
        w -= learning_rate * gradient
        
        # Update the previous cost for the next iteration
        prev_cost = cost
     
prediction_verification = X.dot(w)

Convergence reached at iteration 153, Cost: 58027.400244824705


In [11]:
prediction_verification

array([ 4843.77908261, -1070.69093969, -1075.54805419, ...,
       -1506.93123512,  4423.66257295, -2882.71396342])

In [12]:
y

array([ 5235.69092808, -1221.44214589, -1102.75886179, ...,
       -1703.21936555,  4551.70684165, -2800.86325604])

In [13]:
w

array([ 1.24697862e+03,  1.30449972e+02,  6.30916214e-01,  5.22752803e+02,
        7.35341341e+02, -4.23493560e-01,  8.11285139e+02,  4.42198497e+02,
       -1.00612151e+00,  6.47193307e+02,  2.25168294e+02,  7.86967238e-01,
        2.90441311e-01,  7.62095330e+02,  6.62220159e+02, -1.41989535e+00,
        5.27387198e+02,  3.16658418e+01,  4.52260720e+02,  4.12490588e+02,
        6.94742072e+02,  9.67825971e+01,  1.15875967e+00,  1.05488158e+02,
        4.92027074e+02,  8.04721146e-01])

In [14]:
w.shape

(26,)

In [15]:
X1 = pd.read_csv("linear_regression_test.csv")
X1 = X1.iloc[:, 1:].values
X1

array([[ -13.29664984,   26.94791326,   95.97781082, ...,  -93.08558458,
         -10.31051873,  -47.57996745],
       [  13.74047822,   26.6323546 ,   -1.05683434, ...,  -99.35181737,
           7.80600033,   75.69153932],
       [   5.75770087,    5.72928201,   -7.84631752, ..., -104.77186847,
         -29.85123744,   71.92018423],
       ...,
       [  -5.85261615,   12.03721863,   44.8103347 , ..., -109.65445576,
          18.41335887,   -6.97955382],
       [  20.43179344,    3.02571363,  104.60594893, ...,  -99.04817113,
         -58.06021961,    8.32577778],
       [   4.48130396,   27.63513075,   45.58004581, ...,  -97.28312767,
         -22.5433931 ,   -4.87210286]])

In [16]:
X1.shape

(12000, 25)

In [17]:
X = (X1 - X_mean) / X_std   # Standardization
X1

array([[ -13.29664984,   26.94791326,   95.97781082, ...,  -93.08558458,
         -10.31051873,  -47.57996745],
       [  13.74047822,   26.6323546 ,   -1.05683434, ...,  -99.35181737,
           7.80600033,   75.69153932],
       [   5.75770087,    5.72928201,   -7.84631752, ..., -104.77186847,
         -29.85123744,   71.92018423],
       ...,
       [  -5.85261615,   12.03721863,   44.8103347 , ..., -109.65445576,
          18.41335887,   -6.97955382],
       [  20.43179344,    3.02571363,  104.60594893, ...,  -99.04817113,
         -58.06021961,    8.32577778],
       [   4.48130396,   27.63513075,   45.58004581, ...,  -97.28312767,
         -22.5433931 ,   -4.87210286]])

In [18]:
X1 = np.hstack((np.ones((X1.shape[0], 1)), X1))
X1

array([[   1.        ,  -13.29664984,   26.94791326, ...,  -93.08558458,
         -10.31051873,  -47.57996745],
       [   1.        ,   13.74047822,   26.6323546 , ...,  -99.35181737,
           7.80600033,   75.69153932],
       [   1.        ,    5.75770087,    5.72928201, ..., -104.77186847,
         -29.85123744,   71.92018423],
       ...,
       [   1.        ,   -5.85261615,   12.03721863, ..., -109.65445576,
          18.41335887,   -6.97955382],
       [   1.        ,   20.43179344,    3.02571363, ...,  -99.04817113,
         -58.06021961,    8.32577778],
       [   1.        ,    4.48130396,   27.63513075, ...,  -97.28312767,
         -22.5433931 ,   -4.87210286]])

In [19]:
Y1 = X1.dot(w)
Y1

array([-132718.70532631,  -37952.92277601, -139521.38633766, ...,
        -71864.59228402, -234156.85457058, -193287.54471069])

In [3]:
#    !jupyter nbconvert --to script woking-model-on-multiple-regrssion.ipynb
#to make a script 