# Import libraries

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

from binance.client import Client

# Connect to API

In [2]:
# Read the API credentials from the environment so real keys are never saved
# inside the notebook. The original placeholder strings are kept as fallbacks,
# so the cell behaves exactly as before when the variables are not set.
client = Client(os.environ.get('BINANCE_API_KEY', 'key'),
                os.environ.get('BINANCE_API_SECRET', 'secret-key'))

# Get data

In [3]:
# Download hourly ETHBTC klines starting 100 days ago.
ethbtc_klines = client.get_historical_klines(
    'ETHBTC',
    Client.KLINE_INTERVAL_1HOUR,
    "100 days ago UTC",
)

# Prepare Dataset

In [4]:
# Column labels for the 12 positional fields of every kline row, in the order
# they are handed to the DataFrame constructor.
kline_columns = [
    'OpenTime',
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
    'CloseTime',
    'QuoteAssetVolume',
    'NumberOfTrades',
    'TakerBuyBaseAssetVolume',
    'TakerBuyQuoteAssetVolume',
    'Ignore',
]

In [5]:
# Wrap the raw kline rows in a DataFrame with named columns.
ethbtc_klines_df = pd.DataFrame(data=ethbtc_klines, columns=kline_columns)

In [6]:
# Inspect the inferred dtypes to see which columns still need conversion.
ethbtc_klines_df.dtypes

OpenTime                     int64
Open                        object
High                        object
Low                         object
Close                       object
Volume                      object
CloseTime                    int64
QuoteAssetVolume            object
NumberOfTrades               int64
TakerBuyBaseAssetVolume     object
TakerBuyQuoteAssetVolume    object
Ignore                      object
dtype: object

In [7]:
# Turn the millisecond epoch integers of both timestamp columns into datetimes.
ethbtc_klines_df = ethbtc_klines_df.assign(
    OpenTime=lambda df: pd.to_datetime(df['OpenTime'], unit='ms'),
    CloseTime=lambda df: pd.to_datetime(df['CloseTime'], unit='ms'),
)

In [8]:
# Columns that are cast to float64 before modelling.
float_columns = [
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
    'QuoteAssetVolume',
    'NumberOfTrades',
    'TakerBuyBaseAssetVolume',
    'TakerBuyQuoteAssetVolume',
]

In [9]:
# Cast every column listed in float_columns to float64 (the price/volume
# fields are object dtype at this point), leaving the other columns untouched.
ethbtc_klines_df = ethbtc_klines_df.astype(dict.fromkeys(float_columns, 'float64'))

In [10]:
# Preview the first rows to verify the timestamp and float conversions.
ethbtc_klines_df.head()

Unnamed: 0,OpenTime,Open,High,Low,Close,Volume,CloseTime,QuoteAssetVolume,NumberOfTrades,TakerBuyBaseAssetVolume,TakerBuyQuoteAssetVolume,Ignore
0,2018-05-16 19:00:00,0.084118,0.084399,0.084028,0.084323,2882.2,2018-05-16 19:59:59.999,242.604719,5255.0,1270.387,106.970321,0
1,2018-05-16 20:00:00,0.084322,0.084404,0.083901,0.083901,2903.853,2018-05-16 20:59:59.999,244.393919,4688.0,1567.074,131.895779,0
2,2018-05-16 21:00:00,0.084037,0.084178,0.08371,0.084084,2302.676,2018-05-16 21:59:59.999,193.222431,4139.0,1163.971,97.700989,0
3,2018-05-16 22:00:00,0.084038,0.084598,0.084036,0.084486,3043.545,2018-05-16 22:59:59.999,256.873609,4436.0,1534.161,129.476276,0
4,2018-05-16 23:00:00,0.084447,0.084731,0.08442,0.084699,3120.865,2018-05-16 23:59:59.999,264.015679,4113.0,1398.417,118.331313,0


In [11]:
# Keep only the numeric feature columns; the timestamps and the 'Ignore'
# field are not fed to the model.
prepared_dataset = ethbtc_klines_df.drop(columns=['OpenTime', 'CloseTime', 'Ignore'])

In [12]:
# Preview the feature matrix after dropping the non-feature columns.
prepared_dataset.head()

Unnamed: 0,Open,High,Low,Close,Volume,QuoteAssetVolume,NumberOfTrades,TakerBuyBaseAssetVolume,TakerBuyQuoteAssetVolume
0,0.084118,0.084399,0.084028,0.084323,2882.2,242.604719,5255.0,1270.387,106.970321
1,0.084322,0.084404,0.083901,0.083901,2903.853,244.393919,4688.0,1567.074,131.895779
2,0.084037,0.084178,0.08371,0.084084,2302.676,193.222431,4139.0,1163.971,97.700989
3,0.084038,0.084598,0.084036,0.084486,3043.545,256.873609,4436.0,1534.161,129.476276
4,0.084447,0.084731,0.08442,0.084699,3120.865,264.015679,4113.0,1398.417,118.331313


# Prepare Labels

In [13]:
# Label is True when the NEXT kline closes above the current one. The final
# row compares against NaN and therefore evaluates to False.
prepared_labels = prepared_dataset['Close'].shift(-1) > prepared_dataset['Close']

In [14]:
# Remove the final kline: it has no following candle, so it cannot be labelled.
# Slicing against the full frame's length (instead of an in-place drop) keeps
# this cell idempotent - re-running it never removes an additional row.
prepared_dataset = prepared_dataset.iloc[:len(ethbtc_klines_df) - 1]

In [15]:
# Keep exactly the labels whose rows are still present in the feature frame.
# Unlike an in-place drop, this is idempotent and guarantees that features and
# labels stay aligned even if cells are re-run.
prepared_labels = prepared_labels.loc[prepared_dataset.index]

In [16]:
# Summarise the label distribution (count, most frequent class and its count).
prepared_labels.describe()

count      2391
unique        2
top       False
freq       1264
Name: Close, dtype: object

In [17]:
# Features and labels must have the same number of rows.
print(prepared_dataset.shape, prepared_labels.shape)

(2391, 9) (2391,)


# Split Data

In [18]:
# NOTE: despite its name, this is the row index where the TRAINING split ends
# (the first 80% of rows); everything from this index on is the test split.
test_size = int(len(prepared_dataset) * 0.8)

In [19]:
# Show the split boundary (number of rows that go into the training split).
print(test_size)

1912


In [20]:
# Chronological split: rows before the boundary train the model, the rest
# are held out for testing (no shuffling).
X_train = prepared_dataset.iloc[:test_size]
X_test = prepared_dataset.iloc[test_size:]
y_train = prepared_labels.iloc[:test_size]
y_test = prepared_labels.iloc[test_size:]

In [21]:
# Confirm the sizes of the train/test features and labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1912, 9) (479, 9) (1912,) (479,)


# Scale Train and Test Data

In [22]:
# Learn the min/max of each feature from the training rows only, then apply
# that same scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train Logistic Regression Classifier

In [23]:
# Fit a logistic regression with default hyper-parameters on the scaled
# training data; the fitted estimator is the cell's displayed output.
clf_logr = LogisticRegression()
clf_logr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
# Score the classifier on the held-out test split.
score = clf_logr.score(X=X_test, y=y_test)
print(score)

0.5031315240083507


In [25]:
# Class balance of the test labels, used later as the majority-class baseline.
y_test.describe()

count       479
unique        2
top       False
freq        262
Name: Close, dtype: object

# Grab most recent data from Binance

In [26]:
# Fetch the hourly ETHBTC kline(s) starting one hour ago.
# NOTE(review): the newest kline returned may still be open (incomplete) - confirm.
ethbtc_klines_new_data = client.get_historical_klines(
    'ETHBTC',
    Client.KLINE_INTERVAL_1HOUR,
    "1 HOUR ago UTC",
)

In [27]:
# Same column layout as the historical frame.
ethbtc_klines_new_data_df = pd.DataFrame(data=ethbtc_klines_new_data, columns=kline_columns)

In [28]:
# Repeat the training-time preparation on the fresh data: cast the numeric
# columns, drop the non-feature columns, and scale with the already-fitted
# scaler (transform only - the scaler is not refitted).
ethbtc_klines_new_data_df = ethbtc_klines_new_data_df.astype(dict.fromkeys(float_columns, 'float64'))
prepared_dataset_real_time = ethbtc_klines_new_data_df.drop(columns=['OpenTime', 'CloseTime', 'Ignore'])
eth_real_time_data = scaler.transform(prepared_dataset_real_time)

# Make Prediction in Real Time

In [29]:
# Class probabilities for each fresh row, one column per class.
# NOTE(review): column order should follow clf_logr.classes_ (expected
# [False, True] for these boolean labels) - confirm before interpreting.
predictions = clf_logr.predict_proba(eth_real_time_data)

# Get Probability of Loss/Gain

In [30]:
# Each row holds the predicted probabilities for one fresh kline.
print(predictions)

[[0.46529981 0.53470019]]


In [36]:
# Majority-class baseline: the share of the most common label in the test
# split. The summary is computed once and reused.
label_summary = y_test.describe()
most_frequent = label_summary['freq'] / label_summary['count']
print(label_summary['top'], "is most frequent at", format(most_frequent * 100, '.2f'), "%")

False is most frequent at 54.70 %


In [38]:
# Difference between the classifier's test score and the majority-class share;
# a negative value means the model does worse than always predicting the
# most frequent label.
score_vs_frequent = score - most_frequent

In [39]:
# Display the gap to the majority-class baseline.
score_vs_frequent

-0.04384133611691021