# Linear Regression Example

This notebook uses a sample of data taken from the BuzzFeed News GitHub repository for its analysis of surveillance planes, found [here](https://github.com/BuzzFeedNews/2016-04-federal-surveillance-planes).

In [None]:
#get the packages we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the example dataset into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv(
    filepath_or_buffer='../data/demo-notebooks-data/simple_example.csv',
)

In [None]:
# Preview the first five rows of the table
data.head(n=5)

In [None]:
# Scatter the observations: speed on the x-axis, altitude on the y-axis
fig, ax = plt.subplots()
ax.scatter(x=data['speed (x)'], y=data['altitude (y)'], c='blue')

# Set the title and both axis labels in one call
ax.set(
    title='Altitude vs Speed',
    xlabel='Speed (knots)',
    ylabel='Altitude (feet)',
)

plt.show()

# Our calculated predictions

In [None]:
# Hand-calculated line: predicted altitude = 40 * speed + 3000.
# `y_pred` stays a plain list, one prediction per row, in row order.
x = data['speed (x)']
y_pred = [(speed * 40) + 3000 for speed in x]

# Draw the hand-calculated line (red) over the observations (blue)
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(x, y_pred, c='red')
ax.scatter(x=x, y=data['altitude (y)'], c='blue')

# Set the title and both axis labels in one call
ax.set(
    title='Altitude vs Speed',
    xlabel='Speed (knots)',
    ylabel='Altitude (feet)',
)

plt.show()

In [None]:
# Add the hand-calculated predictions as a new column next to the observations,
# then preview the first five rows of the result
data = data.assign(our_predicted_y=y_pred)
data.head(n=5)

In [None]:
#plot data points, line, and error measurements
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(data['speed (x)'], data['our_predicted_y'], c = 'red')
ax.scatter(data['speed (x)'],data['altitude (y)'], c = 'blue')

#format plot titles
ax.set_title('Altitude vs Speed')
ax.set_xlabel('Speed (knots)')
ax.set_ylabel('Altitude (feet)')

# Draw one dotted segment per observation between the actual and the predicted
# altitude (the error of each prediction). Whole columns are passed to a single
# vlines call, so no per-row loop is needed and no notebook-level name such as
# `y_pred` gets rebound to a scalar, which keeps earlier cells safe to re-run.
altitude_true = data['altitude (y)']
altitude_ours = data['our_predicted_y']
ax.vlines(
    x=data['speed (x)'],
    ymin=np.minimum(altitude_true, altitude_ours),  # lower end of each segment
    ymax=np.maximum(altitude_true, altitude_ours),  # upper end of each segment
    colors='green',
    ls=':',
    lw=2,
    label='vline_single - partial height',
)

plt.show()

In [None]:
#print evaluation metrics for the hand-calculated line
# mean_squared_error already returns the MSE. Wrapping it in np.sqrt would
# report the RMSE under the "MSE" name and label, and would not be comparable
# with the MSE reported for the fitted model later in the notebook.
our_MSE = round(mean_squared_error(data['altitude (y)'], data['our_predicted_y']), 0)
our_r2 = round(r2_score(data['altitude (y)'], data['our_predicted_y']), 2)
print('The MSE for the model calculations is ' + str(our_MSE) + ' and R_squared is ' + str(our_r2))


# Linear Regression Model

In [None]:
# Build the feature (speed) and target (altitude) as column vectors of shape
# (n_samples, 1), then fit a linear regression model to them
X = data['speed (x)'].to_numpy().reshape(-1, 1)
y = data['altitude (y)'].to_numpy().reshape(-1, 1)
model = LinearRegression()
model.fit(X=X, y=y)

In [None]:
#print the slope and intercept that were calculated by the model
# The target was fitted as a column vector, so coef_ and intercept_ come back
# as arrays. Calling float()/int() directly on an array with ndim > 0 is
# deprecated in NumPy (since 1.25), so take the single element explicitly.
# np.ravel also copes with a plain scalar intercept.
slope = round(float(np.ravel(model.coef_)[0]), 1)
# Round to the nearest whole number; int() alone would truncate toward zero.
intercept = int(round(float(np.ravel(model.intercept_)[0])))

print('The slope of the linear regression model is ' + str(slope) + 
      ' and the intercept is ' + str(intercept))

In [None]:
#create a column of the model's predictions
# Predict with the fitted model itself rather than with the slope/intercept
# that were rounded for display, so the predictions (and any metric computed
# from them) are not skewed by rounding.
speed_column = data['speed (x)'].to_numpy().reshape(-1, 1)
data['model_predicted_y'] = model.predict(speed_column).ravel()

#create line from the fitted model
# Named line_x/line_y so the notebook-level `x` and `y` are left untouched.
# NOTE(review): 193 is presumably the top of the observed speed range - confirm
line_x = np.linspace(start=0, stop=193, num=100)
line_y = model.predict(line_x.reshape(-1, 1)).ravel()

#plot data points and new line
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(line_x, line_y, c = 'red')
ax.scatter(data['speed (x)'],data['altitude (y)'], c = 'blue')

#format plot titles
ax.set_title('Altitude vs Speed')
ax.set_xlabel('Speed (knots)')
ax.set_ylabel('Altitude (feet)')

# Draw one dotted segment per observation between the actual and the predicted
# altitude. Whole columns are passed to a single vlines call, so no per-row
# loop is needed and no notebook-level name such as `y_pred` gets rebound.
altitude_true = data['altitude (y)']
altitude_model = data['model_predicted_y']
ax.vlines(
    x=data['speed (x)'],
    ymin=np.minimum(altitude_true, altitude_model),  # lower end of each segment
    ymax=np.maximum(altitude_true, altitude_model),  # upper end of each segment
    colors='green',
    ls=':',
    lw=2,
    label='vline_single - partial height',
)

plt.show()

In [None]:
# Preview the first five rows, now including the prediction columns
data.head(n=5)

In [None]:
#print evaluation metrics for the fitted model
# Score against the observed altitudes read straight from the table rather
# than the notebook-level `y`, which is not guaranteed to still hold the
# observations once other cells have run (it may hold plotted line values).
altitude_true = data['altitude (y)']
model_MSE = round(mean_squared_error(altitude_true, data['model_predicted_y']), 0)
model_r2 = round(r2_score(altitude_true, data['model_predicted_y']), 2)
print('The MSE for the model calculations is ' + str(model_MSE) + ' and R_squared is ' + str(model_r2))