## [作業重點]
使用 Sklearn 中的線性迴歸模型，來訓練各種資料集，務必了解送進去模型訓練的**資料型態**為何，也請了解模型中各項參數的意義

## 作業
試著使用 sklearn datasets 的其他資料集 (wine, boston, ...)，來訓練自己的線性迴歸模型。

### HINT: 注意 label 的型態，確定資料集的目標是分類還是回歸，在使用正確的模型訓練！

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder


import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def linear_regression(X, y, feature_name):
    lr = linear_model.LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.25)
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    print("Feature({})'s MSE = {}".format(feature_name, mse))
    
#     plt.scatter(X_test, y_test,  color='black')
#     plt.plot(X_test, y_pred, color='blue', linewidth=3)
#     plt.show()

In [3]:
def linear_regression_dataset(dataset):
    y = dataset['target']   
    for i, col  in enumerate(dataset['feature_names']):
        X= dataset['data'][:,i].reshape(-1,1)  
        linear_regression(X, y,col )

In [4]:
# Iris dataset
linear_regression_dataset(datasets.load_iris())

Feature(sepal length (cm))'s MSE = 0.2611793766278086
Feature(sepal width (cm))'s MSE = 0.6163194927910955
Feature(petal length (cm))'s MSE = 0.08818600733391584
Feature(petal width (cm))'s MSE = 0.06808618702861817


In [5]:
# Wine dataset
linear_regression_dataset(datasets.load_wine())

Feature(alcohol)'s MSE = 0.5497510086216122
Feature(malic_acid)'s MSE = 0.48237752949076695
Feature(ash)'s MSE = 0.5501674926583755
Feature(alcalinity_of_ash)'s MSE = 0.33866815681938156
Feature(magnesium)'s MSE = 0.5741710226662907
Feature(total_phenols)'s MSE = 0.24440437785760408
Feature(flavanoids)'s MSE = 0.15538229492967143
Feature(nonflavanoid_phenols)'s MSE = 0.49553422405435826
Feature(proanthocyanins)'s MSE = 0.629452523852481
Feature(color_intensity)'s MSE = 0.6070329375477855
Feature(hue)'s MSE = 0.3131666299870898
Feature(od280/od315_of_diluted_wines)'s MSE = 0.2320735823586739
Feature(proline)'s MSE = 0.3514605757396281


In [8]:

# boston dataset
linear_regression_dataset(datasets.load_boston())

Feature(CRIM)'s MSE = 70.60949754815445
Feature(ZN)'s MSE = 81.33626444584468
Feature(INDUS)'s MSE = 66.76775514221767
Feature(CHAS)'s MSE = 87.91189346283555
Feature(NOX)'s MSE = 77.30027290762206
Feature(RM)'s MSE = 37.86355618892622
Feature(AGE)'s MSE = 69.48341128413549
Feature(DIS)'s MSE = 86.06919858065737
Feature(RAD)'s MSE = 73.37520393860412
Feature(TAX)'s MSE = 68.7580569562007
Feature(PTRATIO)'s MSE = 61.33118703989507
Feature(B)'s MSE = 60.036332158574815
Feature(LSTAT)'s MSE = 48.44781345482666


In [7]:
# breast_cancer dataset
linear_regression_dataset(datasets.load_breast_cancer())


Feature(mean radius)'s MSE = 0.10912157027224138
Feature(mean texture)'s MSE = 0.17804942713192565
Feature(mean perimeter)'s MSE = 0.11264839717990176
Feature(mean area)'s MSE = 0.10492252547367155
Feature(mean smoothness)'s MSE = 0.1863727685971353
Feature(mean compactness)'s MSE = 0.15204295112520885
Feature(mean concavity)'s MSE = 0.12074156092415761
Feature(mean concave points)'s MSE = 0.09775263914067372
Feature(mean symmetry)'s MSE = 0.20151460782313496
Feature(mean fractal dimension)'s MSE = 0.2352482781302648
Feature(radius error)'s MSE = 0.1651033672106072
Feature(texture error)'s MSE = 0.23467101455171338
Feature(perimeter error)'s MSE = 0.1644290442767685
Feature(area error)'s MSE = 0.16162869016344394
Feature(smoothness error)'s MSE = 0.23719181993300065
Feature(compactness error)'s MSE = 0.2164225809407905
Feature(concavity error)'s MSE = 0.2162657214369466
Feature(concave points error)'s MSE = 0.20491908122116304
Feature(symmetry error)'s MSE = 0.24098518866059906
Feature

In [49]:
def logistic_regression_dataset(X, y):
    lr = linear_model.LogisticRegression()
    X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.25)
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    r2 = r2_score(y_pred, y_test)
    print("R2 Score = {}".format(r2)) 
    
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy Score = {}".format(acc))  

In [50]:
# Iris dataset
dataset = datasets.load_iris()
logistic_regression_dataset(dataset['data'], dataset['target'])

R2 Score = 0.9600840336134454
Accuracy Score = 0.9736842105263158


In [57]:
# Wine dataset
dataset = datasets.load_wine()
logistic_regression_dataset(dataset['data'], dataset['target'])

R2 Score = 0.9675324675324676
Accuracy Score = 0.9777777777777777


In [68]:
# boston dataset
dataset = datasets.load_boston()
le = LabelEncoder()
target = le.fit_transform(dataset['target'])  # target type is continue, discrete in Label encoder
logistic_regression_dataset(dataset['data'],target)

R2 Score = 0.617061724269157
Accuracy Score = 0.03937007874015748


In [38]:
# breast_cancer dataset
logistic_regression_dataset(datasets.load_breast_cancer())

R2 Score = 0.7516283108988276
Accuracy Score = 0.9440559440559441
