# Example 2 with large data set

In this example, we will use a real data set (the Boston housing data that ships with scikit-learn) instead of made-up values.

In [None]:
# import everything we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [None]:
# We are going to use the Boston housing data set.
# `boston.data` holds all the features, `boston.target` is what we want to predict.
#
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# fall back to the original data source when the built-in loader is unavailable.

# import datasets from sklearn first
from sklearn import datasets
from sklearn.utils import Bunch

try:
    boston = datasets.load_boston()  # built-in loader (scikit-learn < 1.2)
except (ImportError, AttributeError):
    # Replacement recommended by the scikit-learn deprecation notice.
    # In the raw file every record is spread over two physical lines:
    # the even rows carry the first 11 features, the odd rows carry the
    # remaining 2 features followed by the target.
    raw_boston = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        target=raw_boston.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )

# Show a compact summary instead of dumping the whole data set
print(boston.feature_names)
print(boston.data.shape, boston.target.shape)

In [None]:
# Wrap the features and the target in one DataFrame so they are easier to inspect.
# The column names follow the feature names reported when the data was loaded.
feature_columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                   'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# assign() appends the target as the last column, 'MEDV'
df = pd.DataFrame(boston.data, columns=feature_columns).assign(MEDV=boston.target)

df.head()  # first 5 rows

# Data Visualization

In [None]:
# We can see that feature 'RM', average number of rooms per dwelling,
# has a roughly linear relationship with our target 'MEDV',
# median value of owner-occupied homes in $1000's

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['RM'], df['MEDV'])  # one point per row of the data
# Label the figure so it can be read on its own
ax.set(
    title='Rooms per dwelling vs. median home value',
    xlabel='average rooms per dwelling',
    ylabel='median of owner home / $1000',
)
ax.grid()
plt.show()

## Model Training

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:
# Indexing a DataFrame with a *list* of column names returns a DataFrame (2D),
# while indexing with a single name returns a Series (1D).
# X has to be 2D, so the feature column is selected through a list.
feature_cols = ['RM']
target_col = 'MEDV'

X_train = df[feature_cols]
y_train = df[target_col]

X_train.head()

In [None]:
# X_train must be 2D; print both shapes (X first, then y) to check
print(X_train.shape, y_train.shape, sep='\n')

In [None]:
# Create the linear regression estimator, then fit it to the training data
# so that it learns the coef and intercept
lModel = linear_model.LinearRegression()
lModel.fit(X_train, y_train)

# fit() returns the estimator itself, so `model` is just another name for lModel
model = lModel

# Prediction

Now that we've trained our model, the next step is to predict the data.

Steps:

- Get predictions, $\hat{y}$, using `LinearRegression.predict()`


In [None]:
# We can do prediction based on a single value.
# The model was fitted on a DataFrame with an 'RM' column, so the new sample is
# passed the same way; a bare nested list makes scikit-learn (>= 1.0) warn that
# "X does not have valid feature names".
print(lModel.predict(pd.DataFrame({'RM': [4.67]})))

In [None]:
# Predict the target for every row the model was fitted on; the plots below reuse y_pred
y_pred = lModel.predict(X_train)

# Model Validation

We want to see how well our model fits the data. The $R^2$ score should be as close to 1 as possible.

In [None]:
# Show what the fit found, in order: the coefficient (the 'slope'),
# the intercept, and the score of the model on the training data
for fitted_value in (lModel.coef_, lModel.intercept_, lModel.score(X_train, y_train)):
    print(fitted_value)

As the $R^2$ score isn't very close to 1, our linear regression model doesn't fit the data very well.
We can visualize it to see what the model looks like.

In [None]:
# Draw the raw points and overlay the fitted line (red) on the current axes
current_ax = plt.gca()
current_ax.scatter(X_train, y_train)
current_ax.plot(X_train, y_pred, color='red')
plt.show()

In [None]:
# Plot the regression line (red) against the actual values (blue)
fig, ax = plt.subplots(figsize=(10, 10))

# Both series come from the training data, so they are labelled accordingly
ax.plot(X_train, y_pred, c='red', label='Regression line')
ax.scatter(X_train, y_train, c='blue', label='Training points')
ax.set(xlabel='average rooms per dwelling', ylabel='median of owner home / $1000')
ax.legend()  # without a legend the labels above are never displayed
ax.grid()
plt.show()



We can see that there are many scattered data points that affect what our model looks like.