### Module 01 - Assignment

***
#### Environment
`conda activate sklearn-env`
***
#### Goals
   
- [Load the data sets from the links page](#Dataset-load-from-CSV-located-on-OpenML-website)
- [Print statistics about the data](#Print-statistics-about-the-data)
- [Plot correlation and heat maps](#Plot-correlation-and-heat-maps)
- [Optional](#Optional) *
  - [Plot linear regression](#Plot-linear-regression)
  - [Predict MEDV from CRIM, RM, INDUS, NOX](#Train-model-to-predict-MEDV-from-CRIM,-RM,-INDUS,-NOX)

#### Basic Python imports for the pandas (dataframe) and seaborn (visualization) packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

In [None]:
from sklearn.datasets import fetch_openml

# The Boston housing data is published on OpenML under the name "boston",
# version 1 (https://www.openml.org/d/531).
data = fetch_openml("boston", version=1, as_frame=True)

# Coerce CHAS and RAD to numeric dtypes (presumably they arrive as
# categoricals - confirm with dataset.info()). assign() returns a new frame,
# so data.frame itself is left untouched.
dataset = data.frame.assign(
    CHAS=lambda frame: pd.to_numeric(frame['CHAS']),
    RAD=lambda frame: pd.to_numeric(frame['RAD']),
)

dataset.head()

### Print statistics about the data

#### Data description

In [None]:
# Print the free-text description that accompanies the fetched dataset.
print(data.DESCR)

#### Dataset meta information

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
dataset.info()

#### Display total count of missing values 

In [None]:
# Count the missing (NaN) entries in each column.
dataset.isna().sum()

#### Basic statistical properties

In [None]:
# Pick the statistics of interest from describe(), then transpose so each
# dataset column becomes one row of the summary table.
dataset.describe().loc[['mean', 'std', 'count', 'min', 'max']].T

### Plot correlation and heat maps

#### Correlation matrix

In [None]:
# Pairwise correlation between all columns; kept in `corr` because the
# heatmap cell below reuses it.
corr = dataset.corr()
corr

#### Visualize the correlation matrix using a seaborn heatmap plot

https://seaborn.pydata.org/examples/many_pairwise_correlations.html

In [None]:
# Mask the upper triangle (diagonal included): the correlation matrix is
# symmetric, so those cells only repeat what the lower triangle shows.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Draw on an explicit Axes instead of relying on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    cmap="Greens",
    ax=ax,
)
# The trailing semicolon keeps the cell from echoing the Text object repr.
ax.set_title("Correlation matrix of the Boston housing features");

## Optional


### Plot linear regression


https://seaborn.pydata.org/tutorial/regression.html

https://seaborn.pydata.org/generated/seaborn.pairplot.html

In [None]:
# One regression panel per predictor, each plotted against the MEDV target.
sns.pairplot(
    dataset,
    x_vars=['CRIM', 'RM', 'INDUS', 'NOX'],
    y_vars=['MEDV'],
    height=5,
    aspect=.8,
    kind="reg",
)

In [None]:
# Fit MEDV against RM with polynomials of several orders, one panel per
# order, so the fitted curves can be compared side by side.
poly_orders = (1, 2, 10)
fig, axes = plt.subplots(1, len(poly_orders), figsize=(16, 6))
for ax, order in zip(axes, poly_orders):
    sns.regplot(ax=ax, x='RM', y='MEDV', data=dataset, order=order, ci=None,
                line_kws={'color': 'red'})
    # Label each panel; otherwise the polynomial order cannot be told apart.
    ax.set_title(f"Polynomial order {order}")

#### Gradient descent and cost function

In [None]:
def costFunction(X, y, theta):
    """Return the halved mean squared error of the linear model ``X . theta``.

    Parameters
    ----------
    X : design matrix with one row per sample.
    y : target values, one per sample.
    theta : model coefficients, one per column of ``X``.

    Returns
    -------
    ``sum((X . theta - y) ** 2) / (2 * len(y))``.
    """
    sample_count = len(y)
    residuals = np.matmul(X, theta) - y
    return (1/(2*sample_count)) * np.sum(np.square(residuals))
    

def gradientDescent(X, y, theta, alpha, num_iter):
    """Fit linear-regression coefficients with batch gradient descent.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, one row per sample.
    y : array-like, shape (m,)
        Target values.
    theta : array-like, shape (n,)
        Initial coefficients; the caller's array is not modified.
    alpha : float
        Learning rate (step size).
    num_iter : int
        Number of gradient steps to perform.

    Returns
    -------
    theta : ndarray, shape (n,)
        Coefficients after the last step.
    jurnal : ndarray, shape (num_iter,)
        Cost (as computed by ``costFunction``) after each step.
    theta_jurnal : ndarray, shape (num_iter, n)
        Coefficients after each step.
    """
    # Accept lists / DataFrames as well as ndarrays.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)

    m = len(y)
    jurnal = np.zeros(num_iter)
    theta_jurnal = np.zeros((num_iter, len(theta)))
    for step in range(num_iter):
        residuals = np.matmul(X, theta) - y
        # Gradient of the halved MSE: (1/m) * X^T (X theta - y).
        gradient = np.matmul(X.T, residuals) / m
        theta = theta - alpha * gradient
        jurnal[step] = costFunction(X, y, theta)
        theta_jurnal[step] = theta
    return theta, jurnal, theta_jurnal


### Train model to predict MEDV from CRIM, RM, INDUS, NOX

In [None]:
# Keep only the target (MEDV) and the four predictors used by the model.
train_dataset = dataset[['MEDV', 'CRIM', 'RM', 'INDUS', 'NOX']]


In [None]:
# Split the target (MEDV) off from the predictors; work on a copy so that
# train_dataset itself keeps all five columns.
train_features = train_dataset.copy()
train_labels = train_features.pop('MEDV')

# Per-feature summary statistics. The mean and std columns drive the scaling
# below and are needed again when scoring new samples.
stats = train_features.describe().transpose()[['mean', 'std', 'count', 'min', 'max']]
# Only the last expression of a cell is shown automatically, so intermediate
# tables must be passed to display() explicitly.
display(stats)

# Standardize every feature: subtract its mean and divide by its std.
normalized_train_features = (train_features - stats['mean']) / stats['std']
display(normalized_train_features.tail())

# Prepend a constant column of ones so that theta[0] acts as the intercept.
normalized_ones_features = normalized_train_features.copy()
normalized_ones_features.insert(0, 'Oness', 1.0)
display(normalized_ones_features.head())

# Start from all-zero coefficients and run batch gradient descent.
theta = np.zeros(len(normalized_ones_features.columns))
alpha = 0.01
num_iters = 400
theta, jurnal, theta_jurnal = gradientDescent(
    normalized_ones_features.to_numpy(),
    train_labels.to_numpy(),
    theta,
    alpha,
    num_iters,
)
print(f"Hypothesis: h(X)= {theta[0]:.3f} {theta[1]:+.3f}*CRIM {theta[2]:+.3f}*RM {theta[3]:+.3f}*INDUS {theta[4]:+.3f}*NOX")

#### Predict MEDV from CRIM, RM, INDUS, NOX

In [None]:
# Raw feature values of the sample to score, keyed by feature name so the
# normalization aligns on labels rather than on position.
score_raw = pd.Series({'CRIM': 0.03237, 'RM': 6.998, 'INDUS': 2.18, 'NOX': 0.458})
expected_prediction = 30.319424810512324

# Apply the same standardization that was used on the training features, then
# put the values in the training column order so they line up with theta.
score_input = ((score_raw - stats['mean']) / stats['std']).loc[stats.index]
# Prepend the constant 1 that multiplies the intercept theta[0].
score_elem = np.insert(score_input.to_numpy(), 0, 1, axis=0)

predicted_medv = np.matmul(score_elem, theta)
print("Predicted MEDV:", predicted_medv, " expected value ", expected_prediction)