## [作業重點]
使用 Sklearn 中的線性迴歸模型，來訓練各種資料集，務必了解送進去模型訓練的**資料型態**為何，也請了解模型中各項參數的意義

## 作業
試著使用 sklearn datasets 的其他資料集 (wine, boston, ...)，來訓練自己的線性迴歸模型。

### HINT: 注意 label 的型態，確定資料集的目標是分類還是回歸，在使用正確的模型訓練！

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
wine = datasets.load_wine()
boston = datasets.load_boston()
breast_cancer = datasets.load_breast_cancer()

In [3]:
def normalize_df(df):
    df = df.apply(lambda x: (x-x.min())/(x.max()-x.min()))
    return df

In [4]:
def remove_outlier(data, target, skip_col=[], n_std=3):
    data_copy = data.copy()
    data_copy['Target'] = target
    for i in range(len(data_copy.columns)-1):
        if i not in skip_col:
            std = data_copy.iloc[:, i].std()
            mean = data_copy.iloc[:, i].mean()
            data_copy = data_copy[-n_std * std + mean < data_copy.iloc[:, i]][data_copy.iloc[:, i] < n_std * std + mean]
        else:
            continue
    target = data_copy[['Target']]
    data_copy = data_copy.drop(columns=['Target'])
    return data_copy, target

In [5]:
wine_df = pd.DataFrame(wine.data)
print(wine_df.shape)
boston_df = pd.DataFrame(boston.data)
print(boston_df.shape)
breast_cancer_df = pd.DataFrame(breast_cancer.data)
print(breast_cancer_df.shape)

(178, 13)
(506, 13)
(569, 30)


wine_df.shape

wine_df['Target'] = wine.target
for i in range(len(wine_df.columns)-1):
    std = wine_df.iloc[:, i].std()
    mean = wine_df.iloc[:, i].mean()
    wine_df = wine_df[-4 * std + mean < wine_df.iloc[:, i]][wine_df.iloc[:, i] < 4 * std + mean]
target = wine_df[['Target']]
wine_df = wine_df.drop(columns=['Target'])
wine_df.shape

In [6]:
wine_normalize_df = normalize_df(wine_df)
print(wine_normalize_df.head())
boston_normalize_df = normalize_df(boston_df)
print(boston_normalize_df.head())
breast_cancer_normalize_df = normalize_df(breast_cancer_df)
print(breast_cancer_normalize_df.head())

         0         1         2         3         4         5         6   \
0  0.842105  0.191700  0.572193  0.257732  0.619565  0.627586  0.573840   
1  0.571053  0.205534  0.417112  0.030928  0.326087  0.575862  0.510549   
2  0.560526  0.320158  0.700535  0.412371  0.336957  0.627586  0.611814   
3  0.878947  0.239130  0.609626  0.319588  0.467391  0.989655  0.664557   
4  0.581579  0.365613  0.807487  0.536082  0.521739  0.627586  0.495781   

         7         8         9         10        11        12  
0  0.283019  0.593060  0.372014  0.455285  0.970696  0.561341  
1  0.245283  0.274448  0.264505  0.463415  0.780220  0.550642  
2  0.320755  0.757098  0.375427  0.447154  0.695971  0.646933  
3  0.207547  0.558360  0.556314  0.308943  0.798535  0.857347  
4  0.490566  0.444795  0.259386  0.455285  0.608059  0.325963  
         0     1         2    3         4         5         6         7   \
0  0.000000  0.18  0.067815  0.0  0.314815  0.577505  0.641607  0.269203   
1  0.000236  

wine_df.shape

In [7]:
wine_target = pd.DataFrame(wine.target)
wine_refine_df, wine_target = remove_outlier(wine_df, wine_target)
wine_refine_df = normalize_df(wine_refine_df)
print(wine_refine_df.shape)

boston_target = pd.DataFrame(boston.target)
boston_refine_df, boston_target = remove_outlier(boston_df, boston_target, skip_col=[3])
boston_refine_df = normalize_df(boston_refine_df)
print(boston_refine_df.shape)

breast_cancer_target = pd.DataFrame(breast_cancer.target)
breast_cancer_refine_df, breast_cancer_target = remove_outlier(breast_cancer_df, breast_cancer_target)
breast_cancer_refine_df = normalize_df(breast_cancer_refine_df)
print(breast_cancer_refine_df.shape)

(168, 13)
(443, 13)

  



(427, 30)


wine_df.shape

In [8]:
def evaluate_data_linear(df_data, target, estimator):
    x_train, x_test, y_train, y_test = train_test_split(df_data, target, test_size=0.1, random_state=4)
    estimator.fit(x_train, y_train)
    y_pred = estimator.predict(x_test)
    
    print(f'mean squared error: {mean_squared_error(y_test, y_pred)}')
    print(f'score: {cross_val_score(estimator, df_data, target, cv=10).mean()}')

In [9]:
def evaluate_data_logistic(df_data, target, estimator):
    x_train, x_test, y_train, y_test = train_test_split(df_data, target, test_size=0.1, random_state=4)
    estimator.fit(x_train, y_train)
    y_pred = estimator.predict(x_test)
    
    print(f'accuracy score: {accuracy_score(y_test, y_pred)}')
    print(f'score: {cross_val_score(estimator, df_data, target, cv=10).mean()}')

In [10]:
lr = linear_model.LinearRegression()
logreg = linear_model.LogisticRegression()

In [11]:
print('original wine data')
evaluate_data_logistic(wine_df, wine.target, logreg)
print('\nnormalized wine data')
evaluate_data_logistic(wine_normalize_df, wine.target, logreg)
print('\nrefined wine data')
evaluate_data_logistic(wine_refine_df, wine_target, logreg)

original wine data
accuracy score: 0.9444444444444444
score: 0.9564327485380117

normalized wine data
accuracy score: 1.0




score: 0.9777433780529755

refined wine data
accuracy score: 1.0
score: 0.987450980392157


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
print('original boston data')
evaluate_data_linear(boston_df, boston.target, lr)
print('\nnormalized boston data')
evaluate_data_linear(boston_normalize_df, boston.target, lr)
print('\nrefined boston data')
evaluate_data_linear(boston_refine_df, boston_target, lr)

original boston data
mean squared error: 17.038701324921963
score: 0.2025289900605657

normalized boston data
mean squared error: 17.03870132492201
score: 0.20252899006056416

refined boston data
mean squared error: 31.46426219806546
score: 0.3661109893655127


In [13]:
print('original breast_cancer data')
evaluate_data_logistic(breast_cancer_df, breast_cancer.target, logreg)
print('\nnormalized breast_cancer data')
evaluate_data_logistic(breast_cancer_normalize_df, breast_cancer.target, logreg)
print('\nrefined breast_cancer data')
evaluate_data_logistic(breast_cancer_refine_df, breast_cancer_target, logreg)

original breast_cancer data
accuracy score: 0.8771929824561403


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


score: 0.9508998790078644

normalized breast_cancer data
accuracy score: 1.0
score: 0.9649079595540575

refined breast_cancer data
accuracy score: 0.9534883720930233
score: 0.9556830766133091


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
