## [作業重點]
使用 Sklearn 中的線性迴歸模型，來訓練各種資料集，務必了解送進去模型訓練的**資料型態**為何，也請了解模型中各項參數的意義

## 作業
試著使用 sklearn datasets 的其他資料集 (wine, boston, ...)，來訓練自己的線性迴歸模型。

### HINT: 注意 label 的型態，先確定資料集的目標是分類還是迴歸，再使用正確的模型訓練！

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder


import warnings
warnings.filterwarnings(action='ignore')

In [69]:
def linear_regression(X, y, feature_name, *, random_state=None, plot=False):
    """Fit a linear regression on a random 75/25 split and report the test MSE.

    Args:
        X: Feature matrix handed to ``train_test_split`` and
            ``LinearRegression.fit``.
        y: Target values aligned with the rows of ``X``.
        feature_name: Label inserted into the printed report.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the original behaviour: a different split,
            and therefore a different MSE, on every call.
        plot: When true, draw the held-out points and the fitted line with
            matplotlib. Only meaningful when ``X`` has a single column.

    Returns:
        The mean squared error measured on the held-out 25% of the data.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state
    )
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the
    # value is unchanged, but the documented order keeps this call safe to
    # copy for metrics that are not symmetric.
    mse = mean_squared_error(y_test, y_pred)
    print("Feature({})'s MSE = {}".format(feature_name, mse))

    if plot:
        plt.scatter(X_test, y_test, color='black')
        plt.plot(X_test, y_pred, color='blue', linewidth=3)
        plt.show()

    return mse

In [19]:
def linear_regression_dataset(dataset):
    """Run ``linear_regression`` once per feature column of ``dataset``.

    ``dataset`` is indexed with the keys ``'target'``, ``'data'`` and
    ``'feature_names'``. Each column of ``dataset['data']`` is reshaped to a
    single-column 2-D array and regressed against the shared target, using
    the matching feature name as the report label.
    """
    targets = dataset['target']
    features = dataset['data']
    names = dataset['feature_names']

    for position, name in enumerate(names):
        # Column vector (n_samples, 1): the model is fitted on one feature at a time.
        single_feature = features[:, position].reshape(-1, 1)
        linear_regression(single_feature, targets, name)

In [70]:
# Iris dataset: fit one single-feature linear regression per column and print each MSE.
# NOTE(review): this same dataset is later fed to the classifier helper, so the
# target is presumably class labels — confirm that a regression MSE is intended here.
linear_regression_dataset(datasets.load_iris())

Feature(sepal length (cm))'s MSE = 0.35672896321658787
Feature(sepal width (cm))'s MSE = 0.5669946952321133
Feature(petal length (cm))'s MSE = 0.059669812281790996
Feature(petal width (cm))'s MSE = 0.06481420857819145


In [71]:
# Wine dataset: fit one single-feature linear regression per column and print each MSE.
# NOTE(review): this same dataset is later fed to the classifier helper, so the
# target is presumably class labels — confirm that a regression MSE is intended here.
linear_regression_dataset(datasets.load_wine())

Feature(alcohol)'s MSE = 0.40580752950740745
Feature(malic_acid)'s MSE = 0.529264073702479
Feature(ash)'s MSE = 0.5021576927089174
Feature(alcalinity_of_ash)'s MSE = 0.4645576491191431
Feature(magnesium)'s MSE = 0.48203038663690023
Feature(total_phenols)'s MSE = 0.25923517852916533
Feature(flavanoids)'s MSE = 0.13760129550484554
Feature(nonflavanoid_phenols)'s MSE = 0.4426617024028009
Feature(proanthocyanins)'s MSE = 0.49017565219713277
Feature(color_intensity)'s MSE = 0.4917901586372669
Feature(hue)'s MSE = 0.4358452410728211
Feature(od280/od315_of_diluted_wines)'s MSE = 0.22693549079150727
Feature(proline)'s MSE = 0.3472264558700922


In [72]:

# Boston dataset: fit one single-feature linear regression per column and print each MSE.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn (or a replacement dataset) to run.
linear_regression_dataset(datasets.load_boston())

Feature(CRIM)'s MSE = 90.68869652851386
Feature(ZN)'s MSE = 72.74917802571878
Feature(INDUS)'s MSE = 64.88754618231609
Feature(CHAS)'s MSE = 81.51108246484146
Feature(NOX)'s MSE = 69.06228375952307
Feature(RM)'s MSE = 33.99167992527972
Feature(AGE)'s MSE = 65.54004686014387
Feature(DIS)'s MSE = 89.10988942792073
Feature(RAD)'s MSE = 81.24940075471198
Feature(TAX)'s MSE = 71.09702394638059
Feature(PTRATIO)'s MSE = 58.050588513695345
Feature(B)'s MSE = 72.53994849585187
Feature(LSTAT)'s MSE = 44.80462196627315


In [73]:
# Breast-cancer dataset: fit one single-feature linear regression per column and print each MSE.
# NOTE(review): this same dataset is later fed to the classifier helper, so the
# target is presumably class labels — confirm that a regression MSE is intended here.
linear_regression_dataset(datasets.load_breast_cancer())


Feature(mean radius)'s MSE = 0.10356120155093489
Feature(mean texture)'s MSE = 0.19382821687889723
Feature(mean perimeter)'s MSE = 0.09220244879818384
Feature(mean area)'s MSE = 0.13740399719470928
Feature(mean smoothness)'s MSE = 0.1970520985524285
Feature(mean compactness)'s MSE = 0.14298529503922566
Feature(mean concavity)'s MSE = 0.11941304020947688
Feature(mean concave points)'s MSE = 0.10673520851826981
Feature(mean symmetry)'s MSE = 0.22630956376067116
Feature(mean fractal dimension)'s MSE = 0.24314087582595262
Feature(radius error)'s MSE = 0.1948579246841147
Feature(texture error)'s MSE = 0.23175575101408838
Feature(perimeter error)'s MSE = 0.14837317928622334
Feature(area error)'s MSE = 0.31253517068707554
Feature(smoothness error)'s MSE = 0.24020223686721592
Feature(compactness error)'s MSE = 0.20333793005921774
Feature(concavity error)'s MSE = 0.20543284601539888
Feature(concave points error)'s MSE = 0.17334491609036903
Feature(symmetry error)'s MSE = 0.22704546084942542
Fea

In [49]:
def logistic_regression_dataset(X, y, *, random_state=None):
    """Fit a logistic-regression classifier on a random 75/25 split and print its scores.

    Args:
        X: Feature matrix handed to ``train_test_split`` and
            ``LogisticRegression.fit``.
        y: Discrete class labels aligned with the rows of ``X``.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the original behaviour: a different split,
            and therefore different scores, on every call.

    Returns:
        None. The R2 score and the accuracy on the held-out 25% are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state
    )
    lr = linear_model.LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # r2_score is NOT symmetric: the ground truth goes first and the
    # predictions second. Swapping them normalises by the variance of the
    # predictions instead of the variance of the true labels.
    r2 = r2_score(y_test, y_pred)
    print("R2 Score = {}".format(r2))

    # Accuracy is the natural metric for a classifier; R2 above is computed
    # on the integer label codes and is kept only for comparison.
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy Score = {}".format(acc))

In [50]:
# Iris dataset: train the classifier helper on every feature at once.
dataset = datasets.load_iris()
logistic_regression_dataset(dataset.data, dataset.target)

R2 Score = 0.9600840336134454
Accuracy Score = 0.9736842105263158


In [57]:
# Wine dataset: train the classifier helper on every feature at once.
dataset = datasets.load_wine()
logistic_regression_dataset(dataset.data, dataset.target)

R2 Score = 0.9675324675324676
Accuracy Score = 0.9777777777777777


In [68]:
# Boston dataset pushed through the classifier helper.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn (or a replacement dataset) to run.
dataset = datasets.load_boston()
le = LabelEncoder()
# The target is continuous, so it is label-encoded into integer codes (one per
# distinct value) before being handed to the classifier. Presumably this is why
# the reported accuracy is so low — a regression model fits this target better.
target = le.fit_transform(dataset['target'])  # continuous target -> discrete integer codes
logistic_regression_dataset(dataset['data'],target)

R2 Score = 0.617061724269157
Accuracy Score = 0.03937007874015748


In [38]:
# breast_cancer dataset
# logistic_regression_dataset takes the feature matrix and the target as two
# separate arguments; passing the whole dataset object raises a TypeError.
dataset = datasets.load_breast_cancer()
logistic_regression_dataset(dataset['data'], dataset['target'])

R2 Score = 0.7516283108988276
Accuracy Score = 0.9440559440559441
