### Importing modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib notebook

### Loading dataset

In [3]:
houses_df = pd.read_csv("input.csv")

In [4]:
houses_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


### Selecting the feature vectors

In [5]:
keep_col = [
    "price",
    "bedrooms",
    "sqft_lot",
    "floors"
]

houses_df = houses_df[keep_col]
houses_df.columns= ["price", "bedrooms", "sqft", "floors"]

In [6]:
houses_df.head()

Unnamed: 0,price,bedrooms,sqft,floors
0,221900.0,3,5650,1.0
1,538000.0,3,7242,2.0
2,180000.0,2,10000,1.0
3,604000.0,4,5000,1.0
4,510000.0,3,8080,1.0


### Normalization using mean and standard deviation

In [81]:
houses_mean = houses_df.mean()
houses_std = houses_df.std()

norm_df = houses_df - houses_mean
norm_df = houses_df / houses_std

y = norm_df["price"]
norm_df = norm_df.drop(labels=["price"], axis=1)

In [82]:
norm_df.head()

Unnamed: 0,bedrooms,sqft,floors
0,3.225592,0.136406,1.85189
1,3.225592,0.174841,3.70378
2,2.150395,0.241426,1.85189
3,4.300789,0.120713,1.85189
4,3.225592,0.195072,1.85189


In [83]:
m = len(norm_df)
theta = pd.DataFrame(columns=["theta"])

for index, value in enumerate(norm_df.columns.values):
    theta.loc[index] = 0

norm_df = norm_df.astype(np.float64)

In [84]:
print(theta.shape)
print(norm_df.shape)

norm_df.head()

(3, 1)
(21613, 3)


Unnamed: 0,bedrooms,sqft,floors
0,3.225592,0.136406,1.85189
1,3.225592,0.174841,3.70378
2,2.150395,0.241426,1.85189
3,4.300789,0.120713,1.85189
4,3.225592,0.195072,1.85189


### Compute Cost function for linear equation

In [85]:
def compute_cost(norm_df, y, theta, m):
    h_x = np.dot(norm_df, theta)
    h_x = pd.DataFrame(data=h_x)
    h_x = h_x[0]
    diff_val = h_x.subtract(y)
    square_val = diff_val ** 2
    sum_val = np.sum(square_val)
    J = sum_val / (2 * m)
    return J

### Compute Gradient descent 

In [86]:
J_history = []
def gradient(norm_df, y, theta, m, alpha):
    global J_history
    h_x = np.dot(norm_df, theta)
    h_x = pd.DataFrame(data=h_x)
    h_x = h_x[0]
    diff_val = h_x.subtract(y)
    prod_val = norm_df.apply(lambda x: x * diff_val)
    prod_val = prod_val.sum()
    delta = prod_val / m
    alpha_delta = delta * alpha
    theta = theta.apply(lambda x:x - alpha_delta.values)
    J_history.append(compute_cost(norm_df, y, theta, m))
    return theta

### Perform Linear Regression finding optimum values for theta

In [87]:
alpha = 0.0001
num_iters = 400
J_alpha = []
alpha_val = []
while alpha < 10:
    J_history = []
    for i in range(num_iters):
        theta = gradient(norm_df, y, theta, m, alpha)
    J_alpha.append(J_history)
    alpha_val.append(alpha)
    alpha *= 10

In [88]:
J_alpha = pd.DataFrame(J_alpha)
J_alpha.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,1.576986,1.571921,1.566878,1.561857,1.556859,1.551883,1.546928,1.541996,1.537085,1.532196,...,0.6339003,0.6329981,0.6320999,0.6312056,0.6303153,0.629429,0.6285465,0.6276679,0.6267933,0.6259224
1,0.6173389,0.6091304,0.6012806,0.5937738,0.586595,0.5797298,0.5731646,0.5668863,0.5608822,0.5551404,...,0.4288538,0.4288525,0.4288512,0.42885,0.4288487,0.4288475,0.4288462,0.428845,0.4288437,0.4288425
2,0.4288301,0.4288179,0.428806,0.4287944,0.4287829,0.4287717,0.4287608,0.42875,0.4287394,0.4287291,...,0.4282158,0.4282158,0.4282158,0.4282158,0.4282158,0.4282158,0.4282158,0.4282157,0.4282157,0.4282157
3,0.4282157,0.4282157,0.4282156,0.4282156,0.4282156,0.4282156,0.4282156,0.4282156,0.4282155,0.4282155,...,2.3178530000000002e+32,3.39264e+32,4.965805e+32,7.268444e+32,1.063882e+33,1.557203e+33,2.279276e+33,3.336175e+33,4.8831570000000003e+33,7.147472e+33
4,3.181627e+36,1.41627e+39,6.304384e+41,2.806334e+44,1.249212e+47,5.560743e+49,2.47531e+52,1.1018599999999999e+55,4.904819e+57,2.183332e+60,...,,,,,,,,,,


In [90]:
J_alpha.replace([np.inf, -np.inf], np.nan)
J_alpha = J_alpha.fillna(0)
J_alpha.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,1.576986,1.571921,1.566878,1.561857,1.556859,1.551883,1.546928,1.541996,1.537085,1.532196,...,0.6339003,0.6329981,0.6320999,0.6312056,0.6303153,0.629429,0.6285465,0.6276679,0.6267933,0.6259224
1,0.6173389,0.6091304,0.6012806,0.5937738,0.586595,0.5797298,0.5731646,0.5668863,0.5608822,0.5551404,...,0.4288538,0.4288525,0.4288512,0.42885,0.4288487,0.4288475,0.4288462,0.428845,0.4288437,0.4288425
2,0.4288301,0.4288179,0.428806,0.4287944,0.4287829,0.4287717,0.4287608,0.42875,0.4287394,0.4287291,...,0.4282158,0.4282158,0.4282158,0.4282158,0.4282158,0.4282158,0.4282158,0.4282157,0.4282157,0.4282157
3,0.4282157,0.4282157,0.4282156,0.4282156,0.4282156,0.4282156,0.4282156,0.4282156,0.4282155,0.4282155,...,2.3178530000000002e+32,3.39264e+32,4.965805e+32,7.268444e+32,1.063882e+33,1.557203e+33,2.279276e+33,3.336175e+33,4.8831570000000003e+33,7.147472e+33
4,3.181627e+36,1.41627e+39,6.304384e+41,2.806334e+44,1.249212e+47,5.560743e+49,2.47531e+52,1.1018599999999999e+55,4.904819e+57,2.183332e+60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Plot cost function vs number of iterations for different alpha values

In [157]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True, sharey=False)
line1, = ax1.plot(np.arange(0, 400), J_alpha.iloc[0], color='r', linewidth=0.8)
line2, = ax2.plot(np.arange(0, 400), J_alpha.iloc[1], color='g', linewidth=0.8)
line3, = ax3.plot(np.arange(0, 400), J_alpha.iloc[2], color='b', linewidth=0.8)
line4, = ax4.plot(np.arange(0, 400), J_alpha.iloc[3], color='orange', linewidth=0.8)
_ = plt.figlegend(handles=[line1, line2, line3, line4], 
                  labels=["alpha: "+str(alpha_val[0]), "alpha: "+str(alpha_val[1]), "alpha: "+str(alpha_val[2]), "alpha: "+str(alpha_val[3])], 
                  ncol = 2,
                  loc = "upper right")
_ = fig.text(0.5, 0.03, 'Number of Iterations', ha='center', va='center')
_ = fig.text(0.02, 0.5, 'Cost function values', ha='center', va='center', rotation='vertical')
_ = fig.suptitle("IPL Team Performance Analysis between 2008 - 2016", fontsize=12)

<IPython.core.display.Javascript object>