# Week 1 LAB

출처 : https://github.com/a-martyn/ISL-python

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn import datasets
from IPython.display import display, HTML

# 1. Importing Dataset

In [2]:
# Load data
weekly_df = pd.read_csv('./data/Weekly.csv')

# Check for missing data
assert weekly_df.isnull().sum().sum() == 0

# Pre-processing
weekly_df = pd.get_dummies(weekly_df).drop('Direction_Down', axis=1)
weekly_df.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction_Up
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,0
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,0
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,1
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,1
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,1


# 2. Making Scores

In [3]:
def confusion_table(confusion_mtx):
    """Renders a nice confusion table with labels"""
    confusion_df = pd.DataFrame({'y_pred=0': np.append(confusion_mtx[:, 0], confusion_mtx.sum(axis=0)[0]),
                                 'y_pred=1': np.append(confusion_mtx[:, 1], confusion_mtx.sum(axis=0)[1]),
                                 'Total': np.append(confusion_mtx.sum(axis=1), ''),
                                 '': ['y=0', 'y=1', 'Total']}).set_index('')
    return confusion_df


def positive_observations(y):
    # What percentage of observations are positive?
    proportion_1 = ((y == 1).sum() / len(y))
    pct_1        = np.around(proportion_1*100, decimals=3)
    display(HTML('<p><h4>{}%</h4>of observations are positive</p>'.format(pct_1)))


# Classifier stats
# -------------------------------------------------

def prior_error_rate(confusion_matrix):
    """The prior probability that a result is positive"""
    return 1 - (np.sum(confusion_mtx[1, :]) / np.sum(confusion_mtx))

def total_error_rate(confusion_matrix):
    """Derive total error rate from confusion matrix"""
    return 1 - np.trace(confusion_mtx) / np.sum(confusion_mtx)

def true_positive_rate(confusion_mtx):
    """or sensitivity: the proportion of actual POSITIVES that are correctly identified as such"""
    return confusion_mtx[1, 1] / np.sum(confusion_mtx[1, :])

def false_negative_rate(confusion_mtx):
    """the proportion of actual POSITIVES that are incorrectly identified as negative"""
    return confusion_mtx[1, 0] / np.sum(confusion_mtx[1, :])

def false_positive_rate(confusion_mtx):
    """the proportion of actual NEGATIVES that are incorrectly identified as positives"""
    return confusion_mtx[0, 1] / np.sum(confusion_mtx[0, :])

def true_negative_rate(confusion_mtx):
    """or specificity: the proportion of actual NEGATIVES that are correctly identified as such"""
    return confusion_mtx[0, 0] / np.sum(confusion_mtx[0, :])

def positive_predictive_value(confusion_mtx):
    """or precision: the proportion of predicted positives that are correctly predicted"""
    return confusion_mtx[1, 1] / np.sum(confusion_mtx[:, 1])

def negative_predictive_value(confusion_mtx):
    """the proportion of predicted negatives that are correctly predicted"""
    return confusion_mtx[0, 0] / np.sum(confusion_mtx[:, 0])

def classifier_stats(confusion_mtx):
    return pd.Series({'prior_error_rate': prior_error_rate(confusion_mtx),
                      'total_error_rate': total_error_rate(confusion_mtx),
                      'true_positive_rate (sensitivity)': true_positive_rate(confusion_mtx),
                      'false_negative_rate': false_negative_rate(confusion_mtx),
                      'false_positive_rate': false_positive_rate(confusion_mtx),
                      'true_negative_rate (specificity)': true_negative_rate(confusion_mtx),
                      'positive_predictive_value (precision)': positive_predictive_value(confusion_mtx),
                      'negative_predictive_value': negative_predictive_value(confusion_mtx)})


# Example) Logistic 

In [4]:
# Create hold-out sets
train_idx       = weekly_df.index[weekly_df['Year'] < 2009]
weekly_df_train = weekly_df.iloc[train_idx]
weekly_df_test  = weekly_df.drop(train_idx)
y_test          = weekly_df_test['Direction_Up']


# What percentage of observations are in the direction_up class?
positive_observations(weekly_df_test['Direction_Up'])

# Fit model
f_10d     = 'Direction_Up ~ Lag2'
model_10d = smf.logit(formula=f_10d, data=weekly_df_train).fit()

# Get confusion matrix using test data
threshold     = 0.5
y_pred_logit   = (model_10d.predict(weekly_df_test) > threshold).astype(int)
confusion_mtx = confusion_matrix(y_test, y_pred_logit)

# Render as nice table
display(confusion_table(confusion_mtx))

# Classifier stats
logit_stats = classifier_stats(confusion_mtx)
logit_stats

Optimization terminated successfully.
         Current function value: 0.685555
         Iterations 4


Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,9.0,34.0,43.0
y=1,5.0,56.0,61.0
Total,14.0,90.0,


prior_error_rate                         0.413462
total_error_rate                         0.375000
true_positive_rate (sensitivity)         0.918033
false_negative_rate                      0.081967
false_positive_rate                      0.790698
true_negative_rate (specificity)         0.209302
positive_predictive_value (precision)    0.622222
negative_predictive_value                0.642857
dtype: float64

In [5]:
weekly_df_test

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction_Up
985,2009,6.760,-1.698,0.926,0.418,-2.251,3.793110,-4.448,0
986,2009,-4.448,6.760,-1.698,0.926,0.418,5.043904,-4.518,0
987,2009,-4.518,-4.448,6.760,-1.698,0.926,5.948758,-2.137,0
988,2009,-2.137,-4.518,-4.448,6.760,-1.698,6.129763,-0.730,0
989,2009,-0.730,-2.137,-4.518,-4.448,6.760,5.602004,5.173,1
...,...,...,...,...,...,...,...,...,...
1084,2010,-0.861,0.043,-2.173,3.599,0.015,3.205160,2.969,1
1085,2010,2.969,-0.861,0.043,-2.173,3.599,4.242568,1.281,1
1086,2010,1.281,2.969,-0.861,0.043,-2.173,4.835082,0.283,1
1087,2010,0.283,1.281,2.969,-0.861,0.043,4.454044,1.034,1


In [6]:
weekly_df_train

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction_Up
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.270,0
1,1990,-0.270,0.816,1.572,-3.936,-0.229,0.148574,-2.576,0
2,1990,-2.576,-0.270,0.816,1.572,-3.936,0.159837,3.514,1
3,1990,3.514,-2.576,-0.270,0.816,1.572,0.161630,0.712,1
4,1990,0.712,3.514,-2.576,-0.270,0.816,0.153728,1.178,1
...,...,...,...,...,...,...,...,...,...
980,2008,12.026,-8.389,-6.198,-3.898,10.491,5.841565,-2.251,0
981,2008,-2.251,12.026,-8.389,-6.198,-3.898,6.093950,0.418,1
982,2008,0.418,-2.251,12.026,-8.389,-6.198,5.932454,0.926,1
983,2008,0.926,0.418,-2.251,12.026,-8.389,5.855972,-1.698,0


# 1. Do it by LDA

# Fit model
lda = LinearDiscriminantAnalysis()

model_10e = lda.fit(X_train, y_train)

In [7]:
import pandas as pd

In [8]:
train_idx  = weekly_df.index[weekly_df['Year'] < 2009]
train = weekly_df.iloc[train_idx]
test = weekly_df.drop(train_idx)

In [9]:
X_train = np.array(weekly_df_train['Lag2']).reshape(-1,1)
y_train = np.array(weekly_df_train['Direction_Up'])
X_test  = np.array(weekly_df_test['Lag2']).reshape(-1,1)
y_test  = np.array(weekly_df_test['Direction_Up'])

In [10]:
lda = LinearDiscriminantAnalysis()
model = lda.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

In [12]:
confusion_mat = confusion_matrix(y_test,y_pred)
display(confusion_table(confusion_mat))

Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,9.0,34.0,43.0
y=1,5.0,56.0,61.0
Total,14.0,90.0,


In [13]:
correct_rate = 1 - total_error_rate(confusion_mtx)
display(HTML('<p><h4>{}%</h4> of test predictions correct</p>'.format(np.round(correct_rate*100, decimals=3))))

In [14]:
lda_stats = classifier_stats(confusion_mtx)
lda_stats

prior_error_rate                         0.413462
total_error_rate                         0.375000
true_positive_rate (sensitivity)         0.918033
false_negative_rate                      0.081967
false_positive_rate                      0.790698
true_negative_rate (specificity)         0.209302
positive_predictive_value (precision)    0.622222
negative_predictive_value                0.642857
dtype: float64

# 2. by QDA

# Fit model
qda = QuadraticDiscriminantAnalysis()

model_10f = qda.fit(X_train, y_train)

In [15]:
qda = QuadraticDiscriminantAnalysis()
model = qda.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [17]:
confusion_mat = confusion_matrix(y_test,y_pred)
display(confusion_table(confusion_mat))

Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,0.0,43.0,43.0
y=1,0.0,61.0,61.0
Total,0.0,104.0,


In [18]:
correct_rate = 1 - total_error_rate(confusion_mtx)
display(HTML('<p><h4>{}%</h4> of test predictions correct</p>'.format(np.round(correct_rate*100, decimals=3))))

In [19]:
lda_stats = classifier_stats(confusion_mtx)
lda_stats

prior_error_rate                         0.413462
total_error_rate                         0.375000
true_positive_rate (sensitivity)         0.918033
false_negative_rate                      0.081967
false_positive_rate                      0.790698
true_negative_rate (specificity)         0.209302
positive_predictive_value (precision)    0.622222
negative_predictive_value                0.642857
dtype: float64

# 3. by KNN

# Fit Model

K = 1
model_10g = KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train)

y_pred_knn = model_10g.predict(X_test)

In [20]:
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)

In [22]:
confusion_mat = confusion_matrix(y_test,y_pred)
display(confusion_table(confusion_mat))

Unnamed: 0,y_pred=0,y_pred=1,Total
,,,
y=0,16.0,27.0,43.0
y=1,22.0,39.0,61.0
Total,38.0,66.0,


In [23]:
correct_rate = 1 - total_error_rate(confusion_mtx)
display(HTML('<p><h4>{}%</h4> of test predictions correct</p>'.format(np.round(correct_rate*100, decimals=3))))

In [24]:
lda_stats = classifier_stats(confusion_mtx)
lda_stats

prior_error_rate                         0.413462
total_error_rate                         0.375000
true_positive_rate (sensitivity)         0.918033
false_negative_rate                      0.081967
false_positive_rate                      0.790698
true_negative_rate (specificity)         0.209302
positive_predictive_value (precision)    0.622222
negative_predictive_value                0.642857
dtype: float64

In [29]:
np.array(weekly_df_train['Lag2']).reshape(-1,1)

array([[ 1.5720e+00],
       [ 8.1600e-01],
       [-2.7000e-01],
       [-2.5760e+00],
       [ 3.5140e+00],
       [ 7.1200e-01],
       [ 1.1780e+00],
       [-1.3720e+00],
       [ 8.0700e-01],
       [ 4.1000e-02],
       [ 1.2530e+00],
       [-2.6780e+00],
       [-1.7930e+00],
       [ 2.8200e+00],
       [ 4.0220e+00],
       [ 7.5000e-01],
       [-1.7000e-02],
       [ 2.4200e+00],
       [-1.2250e+00],
       [ 1.1710e+00],
       [-2.0610e+00],
       [ 7.2900e-01],
       [ 1.1200e-01],
       [ 2.4800e+00],
       [-1.5520e+00],
       [-2.2590e+00],
       [-2.4280e+00],
       [-2.7080e+00],
       [-2.2920e+00],
       [-4.9780e+00],
       [ 3.5470e+00],
       [ 2.6000e-01],
       [-2.0320e+00],
       [-1.7390e+00],
       [-1.6930e+00],
       [ 1.7810e+00],
       [-3.6820e+00],
       [ 4.1500e+00],
       [-2.4870e+00],
       [ 2.3430e+00],
       [ 6.0600e-01],
       [ 1.0770e+00],
       [-6.3700e-01],
       [ 2.2600e+00],
       [ 1.7160e+00],
       [-2