In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

# class to encode multiple columns
class MultiColumnLabelEncoder:
    def __init__(self, columns = None):
        # array of column names to encode
        self.columns = columns
    def fit(self, X, y = None):
        # not relevant here
        return self
    def transform(self, X):
        # transforms columns of X specified in self.columns using LabelEncoder()
        # if no columns specified, transforms all columns in X
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = preprocessing.LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.iteritems():
                output[colname] = preprocessing.LabelEncoder().fit_transform(col)
        return output
    def fit_transform(self, X, y = None):
        return self.fit(X, y).transform(X)

# define column names
names = ["Date", "Open", "High", "Low", "Close", "Volume (BTC)", "Volume (Currency)", "Weighted Price"]

# specify filename
file = "bitcoin.csv"

# load training data
df = pd.read_csv(file, header = 0, names = names[0:8])
df.head()
bitcoin_df = pd.DataFrame(df)

# preprocess weighted price
classifications = []
def preprocess(threshold):
    for price in bitcoin_df["Weighted Price"]:
        classifications.append(str(round(price / threshold)))

preprocess(100)
classifications = pd.DataFrame(classifications)
bitcoin_df["Classifications"] = classifications
transformed = MultiColumnLabelEncoder(columns = ["Date"]).fit_transform(bitcoin_df)

# array for making predictions
predictions = []

# create design matrix X and vector y
X = np.array(transformed.ix[ : ,  :5])
y = np.array(transformed["Classifications"])

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

# create odd list of K for KNN
myList = list(range(1, 50))

# subset just the odd ones
neighbors = list(filter(lambda x: x % 2 != 0, myList))

# empty list that will hold cross validation scores
cv_scores = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors = k)
    scores = cross_val_score(knn, X_train, y_train, cv = 10, scoring = "accuracy")
    cv_scores.append(scores.mean())

# change to misclassification error
MSE = [1 - x for x in cv_scores]

# discover what is the best value of k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)

def train(X_train, y_train):
    # do nothing
    return

def predict(X_train, y_train, x_test, k):
    # create list for distances and targets
    distances = []
    targets = []
    for i in range(len(X_train)):
        # compute the euclidean distance
        distance = np.sqrt(np.sum(np.square(x_test - X_train[i,  : ])))
        # add it to list of distances
        distances.append([distance, i])
    # sort the list
    distances = sorted(distances)
    # make a list of the k neighbors' targets
    for i in range(k):
        index = distances[i][1]
        targets.append(y_train[index])
    # return most common target
    return Counter(targets).most_common(1)[0][0]

def kNearestNeighbor(X_train, y_train, X_test, predictions, k):
    # train on the input data
    train(X_train, y_train)
    # loop over all observations
    for i in range(len(X_test)):
        predictions.append(predict(X_train, y_train, X_test[i,  : ], k))

# make predictions using the optimal value of K discovered above
predictions = []
try:
    optimalK = optimal_k
    kNearestNeighbor(X_train, y_train, X_test, predictions, optimalK)
    predictions = np.asarray(predictions)
    # evaluate accuracy
    accuracy = accuracy_score(y_test, predictions) * 100
    print ("\nThe accuracy of the classifier is %d%%" % accuracy)
    confusionMatrix = confusion_matrix(y_test, predictions)
    print (confusionMatrix)

except ValueError:
    # the value of k should be changed
    print ("Can't have more neighbors than training samples!")

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


The optimal number of neighbors is 1

The accuracy of the classifier is 90%
[[12  2  0 ...,  0  0  0]
 [ 0  5  0 ...,  0  0  0]
 [ 0  0 10 ...,  0  0  0]
 ..., 
 [ 0  0  0 ..., 35  1  0]
 [ 0  0  0 ...,  0 22  0]
 [ 2  0  0 ...,  0  2 10]]


In [71]:
from id3 import Id3Estimator
from id3 import export_graphviz
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import scipy.stats as st
from os import system

# class to encode multiple columns
class MultiColumnLabelEncoder:
    def __init__(self, columns = None):
        # array of column names to encode
        self.columns = columns
    def fit(self, X, y = None):
        # not relevant here
        return self
    def transform(self, X):
        # transforms columns of X specified in self.columns using LabelEncoder()
        # if no columns specified, transforms all columns in X
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = preprocessing.LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.iteritems():
                output[colname] = preprocessing.LabelEncoder().fit_transform(col)
        return output
    def fit_transform(self, X, y = None):
        return self.fit(X, y).transform(X)

# define column names
names = ["Date", "Open", "High", "Low", "Close", "Volume (BTC)", "Volume (Currency)", "Weighted Price"]

# specify filename
file = "bitcoin.csv"

# load data
bitcoin_data = pd.read_csv(file, header = 0, names = names[0:8])

# wrap in dataframe
bitcoin_df = pd.DataFrame(bitcoin_data)

# preprocess weighted price
classifications = []
def preprocess(threshold):
    for price in bitcoin_df["Weighted Price"]:
        classifications.append(str(round(price / threshold)))

preprocess(100)
classifications = pd.DataFrame(classifications)
bitcoin_df["Classifications"] = classifications

# create transformed data frame
transformed = MultiColumnLabelEncoder(columns = ["Date"]).fit_transform(bitcoin_df)

# slice the transformed dataframe into an array
data = np.array(transformed.ix[ : , :5])
print (str(transformed))
print (str(np.array(transformed.ix[:,1:5])))

# identify the target
target = np.array(transformed["Classifications"])

estimator = Id3Estimator()
estimator.fit(data, target)
export_graphviz(estimator.tree_, "bitcoin_tree.dot", names)
# dot -Tpng bitcoin_tree.dot -o bitcoin_tree.png

confusionMatrix = confusion_matrix(target, estimator.predict(data))
i = 0
j = 0
correct = 0
total = 0
while (i < len(confusionMatrix)):
    while (j < len(confusionMatrix[i])):
        if (i == j):
            correct += confusionMatrix[i][j]
        total += confusionMatrix[i][j]
        j += 1
    i += 1
accuracy = (correct / total) * 100
print ("\nThe accuracy of the classifier is %d%%" % accuracy)
print (confusionMatrix)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


      Date        Open        High         Low       Close  Volume (BTC)  \
0     1333  5590.00000  5731.30000  5534.00000  5718.50000   3373.595949   
1     1332  5606.50000  5630.00000  5132.30000  5579.40000   6192.789612   
2     1331  5738.70000  5759.90000  5535.10000  5606.50000   3462.553479   
3     1330  5673.20000  5776.60000  5550.00000  5738.80000   3134.620657   
4     1329  5800.00000  5840.40000  5462.10000  5680.00000   4536.386215   
5     1328  5626.70000  5800.00000  5556.10000  5800.00000   3019.946476   
6     1327  5429.80000  5854.40000  5380.10000  5640.00000   9222.144339   
7     1326  4824.20000  5430.00000  4812.20000  5429.70000   7950.666610   
8     1325  4745.40000  4878.00000  4710.20000  4822.60000   3619.274645   
9     1324  4774.90000  4919.00000  4712.40000  4748.50000   5679.179929   
10    1323  4594.40000  4875.00000  4555.00000  4772.10000   5241.182325   
11    1322  4423.20000  4615.10000  4407.10000  4592.00000   4105.517197   
12    1321  


The accuracy of the classifier is 97%
[[35  0  0 ...,  0  0  1]
 [15  0  1 ...,  0  0  0]
 [ 0  0 35 ...,  0  0  0]
 ..., 
 [ 0  0  0 ..., 91  5  0]
 [ 0  0  0 ...,  0 57  0]
 [ 1  0  0 ...,  0  0 27]]
