## Exercises

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_curve, auc
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

### Question 10

#### 10.a

In [20]:
# Read the Weekly data set. The first CSV column (Year) becomes a DatetimeIndex,
# which later cells rely on to slice rows by year string.
weekly = pd.read_csv('Data/Weekly.csv', index_col=0, parse_dates=True)
# Encode the response explicitly (Down -> 0, Up -> 1). factorize() numbers the
# labels in order of first appearance, so its coding would silently flip if the
# first row of the file happened to be an "Up" week.
weekly['Direction2'] = (weekly['Direction'] == 'Up').astype(int)
weekly.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction,Direction2
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-01-01,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down,0
1990-01-01,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down,0
1990-01-01,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up,1
1990-01-01,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up,1
1990-01-01,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up,1


In [21]:
# Summary statistics for every numeric column (the lags, Volume, Today and the 0/1 code).
summary_stats = weekly.describe()
summary_stats

Unnamed: 0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction2
count,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0,1089.0
mean,0.150585,0.151079,0.147205,0.145818,0.139893,1.574618,0.149899,0.555556
std,2.357013,2.357254,2.360502,2.360279,2.361285,1.686636,2.356927,0.497132
min,-18.195,-18.195,-18.195,-18.195,-18.195,0.087465,-18.195,0.0
25%,-1.154,-1.154,-1.158,-1.158,-1.166,0.332022,-1.154,0.0
50%,0.241,0.241,0.241,0.238,0.234,1.00268,0.241,1.0
75%,1.405,1.409,1.409,1.409,1.405,2.053727,1.405,1.0
max,12.026,12.026,12.026,12.026,12.026,9.328214,12.026,1.0


#### 10.b

In [22]:
# Predictors: the five lagged returns plus Volume; response: the 0/1 direction code.
feature_cols = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
X = weekly[feature_cols]
y = weekly['Direction2']

# NOTE(review): scikit-learn's LogisticRegression applies a penalty by default,
# which presumably explains the small differences from the statsmodels
# coefficients in the next cell — confirm against the installed version.
regr = skl_lm.LogisticRegression()
regr.fit(X, y)

for label, value in (
    ('classes:', regr.classes_),
    ('coefficient:', regr.coef_),
    ('intercept term:', regr.intercept_),
):
    print(label, value)

classes: [0 1]
coefficient: [[-0.04117292  0.05846974 -0.01599122 -0.02769998 -0.01440289 -0.02212844]]
intercept term: [ 0.26484745]


In [23]:
# Refit the same model with statsmodels to get standard errors and p-values.
# The design matrix with the intercept column gets its own name so that `X`
# keeps meaning "predictors only" if the previous cell is re-run.
X_const = sm.add_constant(X)
# sm.Logit is the array-based model class (formula.api is meant for the
# formula interface); `.values` replaces the deprecated Series.ravel().
est = sm.Logit(y.values, X_const).fit()
# tables[1] is the coefficient table of the summary.
est.summary().tables[1]

Optimization terminated successfully.
         Current function value: 0.682441
         Iterations 4


0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2669,0.086,3.106,0.002,0.098,0.435
Lag1,-0.0413,0.026,-1.563,0.118,-0.093,0.010
Lag2,0.0584,0.027,2.175,0.030,0.006,0.111
Lag3,-0.0161,0.027,-0.602,0.547,-0.068,0.036
Lag4,-0.0278,0.026,-1.050,0.294,-0.080,0.024
Lag5,-0.0145,0.026,-0.549,0.583,-0.066,0.037
Volume,-0.0227,0.037,-0.616,0.538,-0.095,0.050


#### 10.c

In [24]:
# Confusion matrix of the statsmodels fit on the full data set.
# NOTE(review): per the statsmodels docs pred_table()[i, j] counts observations
# with observed class i and predicted class j, so the transpose shown here has
# predictions in rows and observed classes in columns — confirm.
confusion = est.pred_table()
confusion.T

array([[  54.,   48.],
       [ 430.,  557.]])

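In this table the rows are the model's predictions and the columns are the observed directions. When the market actually went up, the model almost always predicted it (557 of 605 weeks), but it identified only 54 of the 484 weeks in which the market went down. Overall accuracy is (54 + 557) / 1089 ≈ 56.1%, barely better than the 55.6% obtained by always predicting "Up".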

#### 10.d

In [35]:
# Hold out 2009 onward for testing and fit on everything up to and including 2008.
# Year strings slice the DatetimeIndex by label, so both end points are inclusive.
weekly_train = weekly.loc[:'2008']
weekly_test = weekly.loc['2009':]
# Every row must land in exactly one of the two parts.
assert len(weekly_train) + len(weekly_test) == len(weekly)

# Predictors stay 2-D (a one-column DataFrame); the response is selected as a
# 1-D Series so estimators can consume it without reshaping.
X_train = weekly_train[['Lag2']]
y_train = weekly_train['Direction2']

X_test = weekly_test[['Lag2']]
y_test = weekly_test['Direction2']

In [43]:
# Logistic regression on Lag2 only, fitted on the training period.
# np.ravel flattens the response to 1-D whatever container it arrives in.
train_labels = np.ravel(y_train)
regr = skl_lm.LogisticRegression().fit(X_train, train_labels)
# Class predictions for the held-out weeks.
pred = regr.predict(X_test)

In [47]:
# Confusion matrix for the logistic model on the test period, transposed
# before display and labelled with the fitted class codes on both axes.
class_labels = regr.classes_
counts = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(counts.T, index=class_labels, columns=class_labels)
print(cm_df)

    0   1
0   9   5
1  34  56


In [48]:
# Per-class precision, recall and F1 on the test period, to three decimals.
report = classification_report(y_test, pred, digits=3)
print(report)

             precision    recall  f1-score   support

          0      0.643     0.209     0.316        43
          1      0.622     0.918     0.742        61

avg / total      0.631     0.625     0.566       104



#### 10.e

In [52]:
# Linear discriminant analysis on Lag2, same train/test split as above.
# The response is flattened to a 1-D array first; passing it unflattened
# triggers an error/warning from scikit-learn.
train_labels = np.ravel(y_train)
lda = LinearDiscriminantAnalysis().fit(X_train, train_labels)
pred = lda.predict(X_test)

# Transposed confusion matrix, labelled with the class codes.
# The counts are exactly the same as for logistic regression.
counts = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(counts.T, index=lda.classes_, columns=lda.classes_)
print(cm_df)

    0   1
0   9   5
1  34  56


#### 10.f

In [53]:
# Quadratic discriminant analysis on Lag2, same train/test split as above.
train_labels = np.ravel(y_train)
qda = QuadraticDiscriminantAnalysis().fit(X_train, train_labels)
pred = qda.predict(X_test)

# Transposed confusion matrix, labelled with the class codes.
counts = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(counts.T, index=qda.classes_, columns=qda.classes_)
print(cm_df)

    0   1
0   0   0
1  43  61


#### 10.g

In [55]:
# K-nearest neighbours with K = 1 on Lag2, same train/test split as above.
train_labels = np.ravel(y_train)
clf = neighbors.KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf.fit(X_train, train_labels)
pred = clf.predict(X_test)

# Transposed confusion matrix, labelled with the class codes.
counts = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(counts.T, index=clf.classes_, columns=clf.classes_)
print(cm_df)

    0   1
0  21  31
1  22  30


#### 10.h

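LDA and logistic regression perform best on the held-out data (2009 onward): both classify (9 + 56) / 104 = 62.5% of the test weeks correctly, compared with 61 / 104 ≈ 58.7% for QDA (which predicts "Up" for every week) and (21 + 30) / 104 ≈ 49.0% for KNN with K = 1.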