# Connect To G-Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Make the course folder on the mounted Drive the working directory, so the
# relative paths used later in this notebook (assets/..., diabetes.csv) are
# resolved against it.
# NOTE(review): presumably helper_function.py also lives in this folder — confirm.
dir_files = '/content/drive/MyDrive/00. Digital Skola/H1 2022/19. Regression'
os.chdir(dir_files)

# Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from helper_function import (
    make_linear, linreg, linreg_a, linreg_ab, linreg_loss
)

from sklearn.linear_model import LinearRegression
from IPython.display import Image

%matplotlib inline

## Helper Functions

In [None]:
def plot_data(X1, X2, y1, y2):
    """Scatter two datasets side by side in one wide figure.

    The left panel shows ``y1`` against ``X1`` and the right panel shows
    ``y2`` against ``X2``; both panels use small red markers.
    """
    plt.figure(figsize=(15, 5))
    panels = ((121, X1, y1), (122, X2, y2))
    for position, features, targets in panels:
        plt.subplot(position)
        plt.scatter(features, targets, s=10, c='r')

def plot_pred(X1, X2, y1, y2, model, span=(0, 5)):
    """Overlay a fitted model's prediction curve on two datasets.

    Parameters
    ----------
    X1, y1 : features and targets drawn in the left panel; the panel title
        reports ``model.score(X1, y1)`` as ``R2_train``.
    X2, y2 : features and targets drawn in the right panel; the panel title
        reports ``model.score(X2, y2)`` as ``R2_test``.
    model : fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    span : ``(start, stop)`` range of x values over which the prediction
        curve is evaluated (100 evenly spaced points).

    Notes
    -----
    The scores are computed from the arguments passed in, not from notebook
    globals, so the titles always describe the data actually plotted.
    """
    X_pred = np.linspace(span[0], span[1], 100).reshape(-1, 1)
    # The curve is the same in both panels, so predict once and reuse it.
    y_curve = model.predict(X_pred)

    plt.figure(figsize=(15, 5))

    plt.subplot(121)
    plt.scatter(X1, y1, s=10, c='r')
    plt.title(f"R2_train: {model.score(X1, y1):.3f}")
    plt.plot(X_pred, y_curve, 'k-')

    plt.subplot(122)
    plt.scatter(X2, y2, s=10, c='r')
    plt.plot(X_pred, y_curve, 'k-')
    plt.title(f"R2_test: {model.score(X2, y2):.3f}")

# Straight Line Equation

In [None]:
# Display the straight-line equation figure stored under assets/.
Image("assets/straight_line_eq.png")

# Can you find the best fit?

In [None]:
# Demo imported from helper_function; its implementation is not visible in this notebook.
linreg()

# Import Data

In [None]:
# Build the train/test split for the linear example with the course helper.
# NOTE(review): coef=[0.5, 5] presumably holds the line's coefficients, but the
# order (slope/intercept) is not verifiable here — confirm in helper_function.
X_train, X_test, y_train, y_test = make_linear(coef=[0.5, 5], noise=0.1, test_size=0.2)

# Scatter the training split (left panel) next to the test split (right panel).
plot_data(X_train, X_test, y_train, y_test)

Define Loss: karena kita butuh suatu angka yang merepresentasikan seberapa besar kesalahan prediksi kita.

In [None]:
# Loss demo imported from helper_function; its implementation is not visible in this notebook.
linreg_loss()

In [None]:
# Demo imported from helper_function.
# NOTE(review): the name suggests it varies a single coefficient "a" — confirm in helper_function.
linreg_a()

In [None]:
# Demo imported from helper_function.
# NOTE(review): the name suggests it varies two coefficients "a" and "b" — confirm in helper_function.
linreg_ab()

# Linear Regression without Bias

In [None]:
# Fit a least-squares line with no intercept term, then plot it over both splits.
lr = LinearRegression(fit_intercept=False).fit(X_train, y_train)
plot_pred(X_train, X_test, y_train, y_test, model=lr)

# Linear Regression with Bias

In [None]:
# Fit a least-squares line that also learns an intercept (the estimator's
# default), then plot it over both splits.
lr = LinearRegression(fit_intercept=True).fit(X_train, y_train)
plot_pred(X_train, X_test, y_train, y_test, model=lr)

# Underfit Example

In [None]:
# Display the underfit illustration from assets/, constrained to a width of 600.
Image("assets/avp_underfit.png", width=600)

# Overfit Example

In [None]:
# Display the overfit illustration from assets/, constrained to a width of 600.
Image("assets/avp_overfit.png", width=600)

# Regularization

In [None]:
from helper_function import reg_coef, reg_ridge_lasso, reg_elastic, plot_coef, make_sine
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Import Sine Data

In [None]:
# Build the train/test split for the sine example with the course helper.
# This rebinds X_train/X_test/y_train/y_test, replacing the linear dataset
# created earlier in the notebook.
X_train, X_test, y_train, y_test = make_sine(noise=0.2, test_size=0.2)
# Scatter the training split (left panel) next to the test split (right panel).
plot_data(X_train, X_test, y_train, y_test)

In [None]:
# Regularization demo imported from helper_function; its implementation is not visible in this notebook.
reg_coef()

In [None]:
# Demo imported from helper_function.
# NOTE(review): presumably contrasts Ridge and Lasso penalties — confirm in helper_function.
reg_ridge_lasso()

# Regression + L1 regularization = LASSO Regression

In [None]:
# Degree-10 polynomial expansion feeding an L1-penalised (Lasso) linear model
# with penalty strength alpha=0.01, fitted on the training split.
model = Pipeline(steps=[
    ("poly", PolynomialFeatures(degree=10)),
    ("lr", Lasso(alpha=0.01)),
]).fit(X_train, y_train)

plot_coef(X_train, X_test, y_train, y_test, model)

# Regression + L2 regularization = Ridge Regression

In [None]:
# Degree-10 polynomial expansion feeding an L2-penalised (Ridge) linear model
# with penalty strength alpha=0.01, fitted on the training split.
model = Pipeline(steps=[
    ("poly", PolynomialFeatures(degree=10)),
    ("lr", Ridge(alpha=0.01)),
]).fit(X_train, y_train)

plot_coef(X_train, X_test, y_train, y_test, model)

# Decision Tree

# Import Dataset

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load diabetes.csv (resolved relative to the current working directory) and
# preview the first rows.
df = pd.read_csv('diabetes.csv')
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

# Check NaN or Missing Value

In [None]:
# Per-column dtype and non-null count; compare the counts with the row total to spot missing values.
df.info()

In [None]:
#renaming data columns
rename_dict = {
    'Pregnancies':'pregnant',
    'Glucose':'glucose',
    'BloodPressure':'bp',
    'DiabetesPedigreeFunction': 'pedigree',
    'Age':'age',
    'Insulin':'insulin',
    'BMI':'bmi',
    'Outcome': 'label',
    'SkinThickness': 'skin_thickness'
}
df = df.rename(columns = rename_dict)
df.head()

# Data Modelling

### Feature Selection

In [None]:
# List the column names available after the rename.
df.columns

In [None]:
# Predictor columns fed to the decision tree.
# NOTE(review): 'skin_thickness' exists after the rename but is not listed here — confirm the omission is intentional.
feature_cols = ['pregnant', 'glucose', 'bp', 'insulin', 'bmi', 'pedigree', 'age']

In [None]:
# Feature matrix: only the selected predictor columns.
X = df[feature_cols]
# Target vector: the 'label' column (0 = non-diabetic, 1 = diabetic).
y = df.label # 0 or 1, non-diab or diab

### Splitting Data

In [None]:
# Distinct values present in the target column.
df.label.unique()

In [None]:
# Number of rows per class, to check how balanced the target is before splitting.
df.label.value_counts()

In [None]:
# 70/30 train/test split with a fixed seed; stratifying on y keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y,
)

In [None]:
# Report (features shape, target shape) for each split.
print('Shape of Data Training: ', (X_train.shape, y_train.shape))
print('Shape of Data Testing: ', (X_test.shape, y_test.shape))

In [None]:
# Class counts per split: training first, then testing.
print(y_train.value_counts())
print(y_test.value_counts())

### Decision Tree Model

In [None]:
#Create Decision Tree Classifier Object
clf = DecisionTreeClassifier()

#Train Decision Tree Classifier
clf.fit(X_train, y_train) #Supervised

#Predict the response for test dataset
y_pred = clf.predict(X_test)

### Training and Testing Score (Evaluation Score)

In [None]:
# Mean accuracy of the fitted tree on the training split.
clf.score(X_train, y_train)

In [None]:
# Mean accuracy on the test split; a value well below the training score signals overfitting.
clf.score(X_test, y_test) #overfit

In [None]:
# Accuracy of the stored test-split predictions (y_pred) against the true labels.
print('Accuracy: ', accuracy_score(y_test, y_pred))

### Visualizing Decision Tree

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus

In [None]:
# Serialise the fitted tree to Graphviz DOT text in an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text, save a PNG copy to disk, and display the rendering inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())

In [None]:
# Put the true test labels next to the predictions stored in y_pred so they
# can be compared row by row.
data_dict = {'Actual_Value': y_test, 'Predicted_Value': y_pred}
result_comparation = pd.DataFrame(data_dict)
result_comparation

In [None]:
# Class counts among the true test labels.
result_comparation.Actual_Value.value_counts()

In [None]:
# Class counts among the predicted labels, for comparison with the true counts.
result_comparation.Predicted_Value.value_counts()

### Data Evaluation/Optimizing Decision Tree Performance

In [None]:
#Create Decision Tree Classifier Object
clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, min_samples_split=5,  )

#Train Decision Tree Classifier
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred_train = clf.predict(X_train)

print('Accuracy: ', accuracy_score(y_train, y_pred_train))

In [None]:
# Predict on the held-out test split and report its accuracy.
y_pred_test = clf.predict(X_test)

print('Accuracy: ', accuracy_score(y_test, y_pred_test))

In [None]:
# Serialise the currently fitted tree to Graphviz DOT text in an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Parse the DOT text, save a PNG copy to disk, and display the rendering inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes_2.png')
Image(graph.create_png())