## Built-in Linear Regression Function

In [None]:
from plotnine import *
import pandas as pd
from sklearn.linear_model import LinearRegression # get linear regression function
import numpy as np



### Create data for fitting the model

In [None]:
# Toy data set: study time and resulting test score for five students.
x = [3, 7, 4, 1, 5]   # hours of study
y = [7, 10, 6, 4, 8]  # test score

# Assemble the frame from a column-name -> values mapping.
df = pd.DataFrame({'x': x, 'y': y})
df

### Initialize the Linear Regression model

In [None]:
# Create an unfitted estimator with its default settings (no arguments are
# passed); its parameters are only learned once fit() is called.
lm = LinearRegression()

### After initializing, we use the fit function that takes our independent and dependent variables to get the best-fit line

In [None]:
# Call template only (placeholders, not runnable Python):
#
#     lm.fit(<independent variable>, <dependent variable>)
#
# The cells below make the same call with real data.

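### The fit function expects its inputs in a specific shape: x must be a 2-D array (one row per observation, one column per variable), so passing the data-frame columns directly raises an error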

In [None]:
# Deliberate failure demo: df['x'] is a 1-D Series, but fit() needs a 2-D
# feature array (one row per sample, one column per feature).  Catch the
# error and show its message so the notebook still runs top to bottom.
try:
    lm.fit(df['x'], df['y'])
except ValueError as err:
    print("ValueError:", err)

In [None]:
# Selecting with a list of column names keeps the result two-dimensional,
# so each array is a single column with one row per observation.
observed_x = df[['x']].to_numpy()
observed_y = df[['y']].to_numpy()
lm.fit(X=observed_x, y=observed_y)

### To get the parameters (b0 and b1) for the fitted model, we use the following commands

In [None]:
# Show the fitted parameters: coef_ (the slope term) first, then intercept_.
# Both print as arrays because the model was fitted on 2-D inputs.
print(lm.coef_,lm.intercept_)

### Printing what the equation of the best-fit line looks like

In [None]:
# Unwrap the two fitted scalars from their arrays before printing.
intercept_b0 = lm.intercept_[0]
slope_b1 = lm.coef_[0][0]

print("The equation of the line is:")
print("y = ", intercept_b0, "+", slope_b1, "x")

### Let's also see what the model predicts our test scores should be for the given data set
### We use the predict function for that

In [None]:
# Predict on the same inputs the model was fitted on (in-sample predictions),
# then display the resulting array.
predicted_y = lm.predict(observed_x)
predicted_y

### To get the R-squared value, we use the r2_score function from sklearn.metrics

In [None]:
from sklearn.metrics import r2_score

# Compare the observed scores with the model's predictions for the same rows.
r2_score(y_true=observed_y, y_pred=predicted_y)

### We can make a new data frame with x,y, and predicted y (y-hat) values

In [None]:
# Flatten each (n, 1) array to its single column and collect the three
# columns side by side: inputs, observed scores, and fitted scores.
df = pd.DataFrame({
    'x': observed_x[:, 0],
    'y': observed_y[:, 0],
    'y_hat': predicted_y[:, 0],
})
df



### And plot the data with our predicted values

In [None]:
# Observed data as points, fitted values as a line.  Explicit labels and a
# title let the figure be read on its own, without the surrounding cells.
(
    ggplot(df, aes(x='x', y='y'))
    + geom_point()
    + geom_line(aes(y='y_hat'))
    + labs(
        x='Hours of study',
        y='Test score',
        title='Observed scores and fitted regression line',
    )
    + theme_bw()
)

### Q. What will happen when we introduce outliers in the data?

### Let's run a quick example using another data set
### We first load the data and see the underlying relationships

In [None]:
# Read the advertising data from a CSV file given by a relative path (so it
# must sit in the current working directory) and preview the first rows.
d = pd.read_csv("advertising.csv")
d.head()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every column in d.  The trailing semicolon hides
# the returned array-of-Axes repr so only the figure is shown.
scatter_matrix(d, figsize=(6, 6));

### Code to run Simple Linear Regression 

### Code to plot data and fitted line

## Validation

#### Let's use a built-in dataset from the scikit-learn library

In [None]:
from sklearn import datasets

#### We can now load a data set using the matching load_* function (here, load_diabetes)

In [None]:
# Load the diabetes data set that ships with scikit-learn's datasets module.
diabetes = datasets.load_diabetes()

In [None]:
# Print the description text stored on the loaded object (its DESCR attribute).
print(diabetes.DESCR)

#### The diabetes data set isn't a data frame; it is a dictionary-like object that holds several arrays

In [None]:
# load_diabetes() returns a dictionary-like Bunch: the feature matrix lives
# in .data and the response in .target.  (.values() would return every
# stored field, not the features.)  Show only a small slice of each.
print(diabetes.data[:5])    # independent variables (first 5 rows)
print(diabetes.target[:5])  # dependent variable (first 5 values)

#### We first convert it into a data frame 

In [None]:
# Column labels for the ten feature columns, in the order they appear in
# diabetes.data.
columns = ["age", "sex", "bmi", "map", "tc", "ldl", "hdl", "tch", "ltg", "glu"]

# Wrap the raw feature array in a labelled data frame and preview it.
df = pd.DataFrame(data=diabetes.data, columns=columns)
df.head()

#### Note that the target variable is not included in this data frame, we store it separately

In [None]:
# Keep the response variable in its own array, separate from the feature
# frame df.  Note: this rebinds y, which earlier held the toy test scores.
y = diabetes.target

#### We use the train_test_split function to split our data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

#### Now we can split our data into a training set of x,y and a testing set of x,y variables

In [None]:
# Hold out 20% of the rows for testing.  Fixing random_state makes the
# shuffle (and therefore every score reported below) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

# Sanity check: row counts of the two partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

#### Let's apply a multiple linear regression model using all independent variables

#### We will fit the linear regression model using X_train and y_train, then predict the target values for X_test

In [None]:
# Fresh estimator for the multi-feature model (rebinds the earlier lm).
lm = LinearRegression()

# Learn from the training partition only; fit() hands back the estimator
# itself, so model and lm refer to the same fitted object.
model = lm.fit(X=X_train, y=y_train)

# Score the held-out rows and peek at the first five predictions.
predictions = model.predict(X_test)
predictions[:5]

#### We can also compute the R-squared value

In [None]:
# R-squared on the held-out data: true y_test versus the predictions made
# from X_test.  r2_score is the function imported in an earlier cell.
print(r2_score(y_test,predictions))

#### Q. Is this a good model?

#### How about we use a single independent variable to make our model?

#### But which variable do we select?

In [None]:
# make target part of the data 

In [None]:
# code to check something

In [None]:
# run linear regression