In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

## Load Data

In [2]:
#Read files into dataframe by using pandas
def open_file(ticker):
    input_dir = r'./'
    try:
        #use pandas to create dataframe
        df = pd.read_csv(os.path.join(input_dir, ticker + '.csv'))
        return df
    
    except Exception as e:
        print(e)
        print('failed to read stock data for ticker: ', ticker)

#Create a Netflix volatility dataframe
NFLX_df = open_file("NFLX_weekly_return_volatility")

In [3]:
#create new dataframes for each of year 2018 and 2019
NFLX_df_train = NFLX_df[NFLX_df['Year'] < 2018]
NFLX_df_train.reset_index(inplace=True)

NFLX_df_test = NFLX_df[NFLX_df['Year'] >= 2018]
NFLX_df_test.reset_index(inplace=True)

In [4]:
NFLX_df_train.head(5)

Unnamed: 0,index,Year,Week_Number,Open_week,Close_week,mean_return,volatility,Labels
0,0,2015,0,49.151428,49.848572,0.0,0.0,red
1,1,2015,1,49.258572,47.041428,-1.122,2.744452,red
2,2,2015,2,47.09,48.191429,0.5128,2.684702,green
3,3,2015,3,48.57143,62.494286,6.8805,7.051198,red
4,4,2015,4,62.57143,63.114285,0.2116,1.867313,green


In [5]:
NFLX_df_test.head(5)

Unnamed: 0,index,Year,Week_Number,Open_week,Close_week,mean_return,volatility,Labels
0,157,2018,0,196.100006,209.990006,2.282,1.842584,green
1,158,2018,1,210.020004,221.229996,1.0562,1.38874,green
2,159,2018,2,224.240006,220.460007,-0.08075,1.29064,red
3,160,2018,3,222.0,274.600006,4.528,3.181522,red
4,161,2018,4,274.200012,267.429993,-0.4982,2.737151,red


In [6]:
NFLX_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        157 non-null    int64  
 1   Year         157 non-null    int64  
 2   Week_Number  157 non-null    int64  
 3   Open_week    157 non-null    float64
 4   Close_week   157 non-null    float64
 5   mean_return  157 non-null    float64
 6   volatility   157 non-null    float64
 7   Labels       157 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 9.9+ KB


In [7]:
NFLX_df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        106 non-null    int64  
 1   Year         106 non-null    int64  
 2   Week_Number  106 non-null    int64  
 3   Open_week    106 non-null    float64
 4   Close_week   106 non-null    float64
 5   mean_return  106 non-null    float64
 6   volatility   106 non-null    float64
 7   Labels       106 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 6.8+ KB


In [8]:
NFLX_df_train.shape

(157, 8)

In [9]:
NFLX_df_test.shape

(106, 8)

In [10]:
#feature scaling
X_train = NFLX_df_train[['mean_return', 'volatility']].values
Y_train = NFLX_df_train['Labels'].values
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

X_test = NFLX_df_test[['mean_return', 'volatility']].values
Y_test = NFLX_df_test['Labels'].values
X_test = scaler.transform(X_test)

## KNN

In [11]:
#initiate model
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, Y_train)

#accuracy
accuracy_knn = knn_classifier.score(X_test, Y_test)
print('Accuracy by KNN(k=3):', accuracy_knn)

Accuracy by KNN(k=3): 0.9811320754716981


In [12]:
#predict labels and confusion matrix
predicted_knn = knn_classifier.predict(X_test)
confusion_matrix_knn = confusion_matrix(Y_test, predicted_knn)
TPR_knn = confusion_matrix_knn[0][0]/float(np.sum(confusion_matrix_knn[0, :]))
TNR_knn = confusion_matrix_knn[1][1]/float(np.sum(confusion_matrix_knn[1, :]))

print('Confusion Matrix for KNN(k=3):\n', confusion_matrix_knn)
print('True positive rate:', TPR_knn)
print('True negative rate:', TNR_knn)

Confusion Matrix for KNN(k=3):
 [[47  1]
 [ 1 57]]
True positive rate: 0.9791666666666666
True negative rate: 0.9827586206896551


## Logistic Regression

In [13]:
#initiate model
log_reg_classifier = LogisticRegression()
log_reg_classifier.fit(X_train, Y_train)
print('The equation for logistic regression: y =', log_reg_classifier.coef_[0][0], '+', log_reg_classifier.coef_[0][1], '*x1')

#accuracy
accuracy_LR = log_reg_classifier.score(X_test, Y_test)
print('Accuracy by Logistic Regression:', accuracy_LR)

The equation for logistic regression: y = -2.176569371267914 + 1.903141392998164 *x1
Accuracy by Logistic Regression: 0.8490566037735849


In [14]:
#predict labels and confusion matrix
predicted_LR = knn_classifier.predict(X_test)
confusion_matrix_LR = confusion_matrix(Y_test, predicted_LR)
TPR_LR = confusion_matrix_LR[0][0]/float(np.sum(confusion_matrix_LR[0, :]))
TNR_LR = confusion_matrix_LR[1][1]/float(np.sum(confusion_matrix_LR[1, :]))

print('Confusion Matrix for Logistic Regression:\n', confusion_matrix_LR)
print('True positive rate:', TPR_LR)
print('True negative rate:', TNR_LR)

Confusion Matrix for Logistic Regression:
 [[47  1]
 [ 1 57]]
True positive rate: 0.9791666666666666
True negative rate: 0.9827586206896551


## Naive Bayesian

In [15]:
NB_classifier = GaussianNB().fit(X_train, Y_train)

accuracy_NB = NB_classifier.score(X_test, Y_test)
print('Accuracy by Naive Bayesian:', accuracy_NB)

Accuracy by Naive Bayesian: 0.8490566037735849


In [16]:
#predict labels and confusion matrix
predicted_NB = knn_classifier.predict(X_test)
confusion_matrix_NB = confusion_matrix(Y_test, predicted_NB)
TPR_NB = confusion_matrix_NB[0][0]/float(np.sum(confusion_matrix_NB[0, :]))
TNR_NB = confusion_matrix_NB[1][1]/float(np.sum(confusion_matrix_NB[1, :]))

print('Confusion Matrix for Naive Bayesian:\n', confusion_matrix_NB)
print('True positive rate:', TPR_NB)
print('True negative rate:', TNR_NB)

Confusion Matrix for Naive Bayesian:
 [[47  1]
 [ 1 57]]
True positive rate: 0.9791666666666666
True negative rate: 0.9827586206896551


## Decision Tree

In [17]:
tree_classifier = tree.DecisionTreeClassifier(criterion = 'entropy')
tree_classifier = tree_classifier.fit(X_train, Y_train)

accuracy_tree = tree_classifier.score(X_test, Y_test)
print('Accuracy by Decision Tree:', accuracy_tree)

Accuracy by Decision Tree: 1.0


In [18]:
#predict labels and confusion matrix
predicted_tree = knn_classifier.predict(X_test)
confusion_matrix_tree = confusion_matrix(Y_test, predicted_tree)
TPR_tree = confusion_matrix_tree[0][0]/float(np.sum(confusion_matrix_tree[0, :]))
TNR_tree = confusion_matrix_tree[1][1]/float(np.sum(confusion_matrix_tree[1, :]))

print('Confusion Matrix for Decision Tree:\n', confusion_matrix_tree)
print('True positive rate:', TPR_tree)
print('True negative rate:', TNR_tree)

Confusion Matrix for Decision Tree:
 [[47  1]
 [ 1 57]]
True positive rate: 0.9791666666666666
True negative rate: 0.9827586206896551


## Random Forest

In [19]:
RF_classifier = RandomForestClassifier(n_estimators=6, max_depth=5, criterion='entropy')
RF_classifier.fit(X_train, Y_train)

accuracy_RF = RF_classifier.score(X_test, Y_test)
print('Accuracy by Random Forest:', accuracy_RF)

Accuracy by Random Forest: 1.0


In [20]:
#predict labels and confusion matrix
predicted_RF = knn_classifier.predict(X_test)
confusion_matrix_RF = confusion_matrix(Y_test, predicted_RF)
TPR_RF = confusion_matrix_RF[0][0]/float(np.sum(confusion_matrix_RF[0, :]))
TNR_RF = confusion_matrix_RF[1][1]/float(np.sum(confusion_matrix_RF[1, :]))

print('Confusion Matrix for Random Forest:\n', confusion_matrix_RF)
print('True positive rate:', TPR_RF)
print('True negative rate:', TNR_RF)

Confusion Matrix for Random Forest:
 [[47  1]
 [ 1 57]]
True positive rate: 0.9791666666666666
True negative rate: 0.9827586206896551
