<a href="https://colab.research.google.com/github/gelechi/Gloze/blob/main/Copy_of_cours_32_graphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Load seaborn's "mpg" example dataset and preview its first five rows.
df = sns.load_dataset(name="mpg")
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [None]:
## Let's convert to metric style

# Conversion factors (imperial / US units -> metric).
KG_PER_LB = 0.4536               # pounds -> kilograms
L_PER_100KM_TIMES_MPG = 235.215  # l/100km = 235.215 / mpg (US gallon)
LITERS_PER_CUBIC_INCH = 0.0164   # cubic inches -> liters

# This cell overwrites `weight` and `displacement` in place, so running it a
# second time would rescale already-converted values. `fuel_consumption` is
# created only here, so its presence tells us the conversion has already been
# applied to this frame and must not be repeated.
if 'fuel_consumption' not in df.columns:
    # lbs to tons (1000 kg)
    df['weight'] = df['weight'] * KG_PER_LB / 1000
    # mpg to l/100km
    df['fuel_consumption'] = L_PER_100KM_TIMES_MPG / df['mpg']
    # cubic inches to liter
    df['displacement'] = df['displacement'] * LITERS_PER_CUBIC_INCH

In [None]:
# All possible lines
#
# Scatter of fuel consumption against weight, overlaid with three hand-picked
# candidate lines y = intercept + coef * x.

x = df['weight']
y = df['fuel_consumption']

# (intercept, coef, color) for each candidate line.
# NOTE(review): the LinearRegression fit later in this notebook reports
# intercept ~ -0.903 and coef ~ 8.992 -- confirm that -0.1 is intended here.
candidate_lines = [
    (-0.1, 9, 'darkorchid'),  # regression line
    (0, 6, 'green'),          # too low
    (0, 12, 'orange'),        # too high
]

fig = go.Figure()
fig.add_trace(go.Scatter(name='', x=x, y=y, mode='markers'))

# Each line is evaluated at the observed weights, once per candidate.
x_line = x
for intercept, coef, color in candidate_lines:
    y_line = intercept + coef * x_line
    fig.add_trace(go.Scatter(name='', x=x_line, y=y_line, mode='lines',
                             line=dict(color=color, dash='dash')))

fig.update_layout(
    xaxis_title='weight (1000 kg)',
    yaxis_title='fuel consumption (l/100km)',
    autosize=False,
    width=700,
    height=500,
    legend=dict(visible=False),
)
fig.show()


In [None]:
# With error bars
#
# Scatter of fuel consumption against weight, one candidate line, and a thin
# vertical segment from every observation to the line (its error).

# Line to draw: y = intercept + coef * x.
# NOTE(review): the LinearRegression fit later in this notebook reports
# intercept ~ -0.903 and coef ~ 8.992 -- confirm that -0.1 is intended here.
intercept = -0.1
coef = 9
color = 'darkorchid'
# Other lines to try:
#   too low  -> intercept = 0, coef = 6,  color = 'green'
#   too high -> intercept = 0, coef = 12, color = 'orange'

# Set to False to draw the scatter and the line without the error segments.
show_error_bars = True

x = df['weight']
y = df['fuel_consumption']
x_line = x.copy()
y_line = intercept + coef * x_line

fig = go.Figure()
fig.add_trace(go.Scatter(name='', x=x, y=y, mode='markers'))

# Add regression line
fig.add_trace(go.Scatter(name='', x=x_line, y=y_line, mode='lines',
                         line=dict(color=color, dash='dash')))

# Add error bars: one vertical segment per observation, from the observed
# value to the value on the line at the same x. The loop names are chosen so
# they do not overwrite the `x` / `y` Series defined above.
if show_error_bars:
    shapes = [
        go.layout.Shape(type="line",
                        x0=x_obs,
                        y0=y_obs,
                        x1=x_obs,
                        y1=y_on_line,
                        line=dict(color='black', width=1),
                        opacity=0.5,
                        layer="above")
        for x_obs, y_obs, y_on_line in zip(x, y, y_line)
    ]
    fig.update_layout(shapes=shapes)

fig.update_layout(
    xaxis_title='weight (1000 kg)',
    yaxis_title='fuel consumption (l/100km)',
    autosize=False,
    width=700,
    height=500,
    legend=dict(visible=False),
)
fig.show()


In [None]:
# Best model, with intercept shown
#
# Same scatter and line as before, but the line is extended to x = 0 (and to
# x = 3 on the right) so the point where it meets the y-axis is visible.

# NOTE(review): the LinearRegression fit later in this notebook reports
# intercept ~ -0.903 and coef ~ 8.992 -- confirm that -0.1 is intended here.
intercept = -0.1
coef = 9
color = 'darkorchid'

# Extra x positions added at each end of the line.
x_line_start = 0.0
x_line_end = 3.0

x = df['weight']
y = df['fuel_consumption']
# ignore_index=True renumbers the combined Series, so no index labels have to
# be hard-coded to match the number of rows in the dataset.
x_line = pd.concat(
    [pd.Series([x_line_start]), x, pd.Series([x_line_end])],
    ignore_index=True,
)
y_line = intercept + coef * x_line

fig = go.Figure()
fig.add_trace(go.Scatter(name='', x=x, y=y, mode='markers'))

# Add regression line
fig.add_trace(go.Scatter(name='', x=x_line, y=y_line, mode='lines',
                         line=dict(color=color, dash='dash')))

fig.update_layout(
    xaxis_title='weight (1000 kg)',
    yaxis_title='fuel consumption (l/100km)',
    autosize=False,
    width=700,
    height=500,
    legend=dict(visible=False),
)
fig.show()


## Data Cleaning

In [None]:
from sklearn.impute import SimpleImputer

# Fill the missing `horsepower` entries with the column mean, then count the
# missing values left in every column.
imputer = SimpleImputer(strategy='mean')
horsepower_filled = imputer.fit_transform(df[['horsepower']])
df['horsepower'] = horsepower_filled
df.isna().sum()

mpg                 0
cylinders           0
displacement        0
horsepower          0
weight              0
acceleration        0
model_year          0
origin              0
name                0
fuel_consumption    0
dtype: int64

# Linear Regression with One Feature

In [None]:
# Selecting with a list of column names keeps X a DataFrame (one column);
# the target y is a single column, i.e. a Series.
feature_cols = ['weight']
X = df[feature_cols]
y = df['fuel_consumption']

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of y on X; `fit` returns the estimator itself, so
# the last line displays the fitted model.
reg = LinearRegression().fit(X, y)
reg

In [None]:
# Fitted coefficient(s), rounded to 3 decimals
reg.coef_.round(3)

array([8.992])

In [None]:
# Fitted intercept, rounded to 3 decimals
reg.intercept_.round(3)

-0.903

# Linear Regression with Multiple Features

In [None]:
# Three predictors this time; the target is unchanged.
feature_cols = ['weight', 'displacement', 'horsepower']
X = df[feature_cols]
y = df['fuel_consumption']

In [None]:
from sklearn.linear_model import LinearRegression

# Refit on the multi-feature X; `fit` returns the estimator itself, so the
# last line displays the fitted model.
reg = LinearRegression().fit(X, y)
reg

In [None]:
# Fitted coefficients, one per column of X (weight, displacement, horsepower)
reg.coef_

array([5.35754041, 0.22512794, 0.03154875])

In [None]:
# Fitted intercept of the multi-feature model
reg.intercept_

-0.01561755823265365