In [57]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Dasar

Implementasi dasar regresi linier menggunakan pustaka scikit-learn.

#### 1. Ordinary Least Squares

Source: https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py

In [77]:
# Load the diabetes dataset as pandas objects: a feature frame and a target series.
data_x, data_y = datasets.load_diabetes(as_frame=True, return_X_y=True)

In [78]:
# Peek at the last five feature rows.
data_x.tail(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
437,0.041708,0.05068,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.05068,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.05068,-0.015906,0.017282,-0.037344,-0.01384,-0.024993,-0.01108,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.02656,0.044528,-0.02593
441,-0.045472,-0.044642,-0.07303,-0.081414,0.08374,0.027809,0.173816,-0.039493,-0.00422,0.003064


In [79]:
# Peek at the last five target values.
data_y.tail(n=5)

437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, dtype: float64

In [80]:
# Use BMI as the single explanatory feature; the double brackets keep the
# selection a one-column DataFrame rather than a Series.
data_bmi = data_x[['bmi']]

# Number of trailing rows held out for testing (must be >= 1; no shuffling).
N_TEST_SAMPLES = 20

# Split the data into training/testing sets.
# `.iloc` makes it explicit that the split is by position, not by index label.
data_bmi_train = data_bmi.iloc[:-N_TEST_SAMPLES]
data_bmi_test = data_bmi.iloc[-N_TEST_SAMPLES:]

data_y_train = data_y.iloc[:-N_TEST_SAMPLES]
data_y_test = data_y.iloc[-N_TEST_SAMPLES:]

# Sanity checks: the split covers every row exactly once, and the held-out
# features and targets refer to the same rows.
assert len(data_bmi_train) + len(data_bmi_test) == len(data_bmi)
assert data_bmi_test.index.equals(data_y_test.index)

In [81]:
# Fit an ordinary least squares model on the training split.
reg = linear_model.LinearRegression()
reg.fit(X=data_bmi_train, y=data_y_train)

LinearRegression()

In [83]:
# Predict the target for the held-out BMI rows and wrap the result in a
# one-column frame (it gets a fresh 0..n-1 index, not the test rows' labels).
predictions = pd.DataFrame(reg.predict(data_bmi_test), columns=['predictions'])

In [None]:
# Display the predicted values for the held-out rows.
predictions

In [84]:
# Report the fitted coefficient and the error metrics on the test split.
mse = mean_squared_error(data_y_test, predictions)
mae = mean_absolute_error(data_y_test, predictions)

print('Coefficients: {}'.format(reg.coef_))
print('Mean Squared Error (MSE): {}'.format(mse))
print('Mean Absolute Error (MAE): {}'.format(mae))

Coefficients: [938.23786125]
Mean Squared Error (MSE): 2548.0723987259694
Mean Absolute Error (MAE): 41.227091289761454


In [96]:
# Visualise the held-out predictions against the actual targets.
# The test split keeps its original row labels while `predictions` carries a
# fresh 0..n-1 index, so everything is collected as plain arrays and matched
# by position instead of relying on index alignment.
df_results = pd.DataFrame({
    'Feature Test': data_bmi_test['bmi'].to_numpy(),
    'Target Actual': data_y_test.to_numpy(),
    # Select the column explicitly rather than assigning a whole DataFrame
    # to a single column.
    'Target Predicted': predictions['predictions'].to_numpy(),
})

# Create plotly figure: actual values as points, the fitted line on top.
fig = px.scatter(
    df_results,
    x='Feature Test',
    y='Target Actual',
    title='Linear regression on BMI: actual vs. predicted target',
)
fig.add_scatter(
    x=df_results['Feature Test'],
    y=df_results['Target Predicted'],
    mode='lines',
    name='Predicted',
)
fig.show()

## Case Studies

In [5]:
# Read the Melbourne housing CSV straight from its GitHub URL and preview it.
data = (
    'https://raw.githubusercontent.com/rudyhendrawn/data-course/main/'
    'datasets/melbourne_housing_extra_data.csv'
)
df = pd.read_csv(data)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


In [6]:
# Count the missing values in each column.
df.isnull().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price             4344
Method               0
SellerG              0
Date                 0
Distance             8
Postcode             8
Bedroom2          4413
Bathroom          4413
Car               4413
Landsize          4796
BuildingArea     11123
YearBuilt        10389
CouncilArea       4444
Lattitude         4292
Longtitude        4292
Regionname           8
Propertycount        8
dtype: int64

In [7]:
# Total number of rows, to put the missing-value counts in proportion.
df.shape[0]

19740