# Sklearn

## sklearn.tree

документация: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree

примеры: http://scikit-learn.org/stable/auto_examples/index.html#decision-trees

In [1]:
from matplotlib.colors import ListedColormap
from sklearn import cross_validation, datasets, metrics, tree ,ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np




In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Разделяющая поверхность

In [None]:
def get_meshgrid(data, step=.05, border=.5,):
    """Build a 2-D coordinate grid spanning the first two columns of ``data``.

    For column 0 (x) and column 1 (y) an axis is generated with ``np.arange``
    from the column minimum minus ``border`` up to, but not including, the
    column maximum plus ``border``, advancing by ``step``.

    Returns the result of ``np.meshgrid`` applied to the x axis and the y axis.
    """
    axes = []
    for column in (0, 1):
        values = data[:, column]
        axes.append(np.arange(values.min() - border, values.max() + border, step))
    x_axis, y_axis = axes
    return np.meshgrid(x_axis, y_axis)

In [None]:
def plot_decision_surface(estimator, train_data, train_labels, test_data, test_labels,
                          colors=None, light_colors=None):
    """Fit ``estimator`` and draw its decision surface with train and test points.

    The estimator is fitted on the train data, its predictions over a mesh
    built from the train data are drawn as a coloured background, and the
    train points (left panel) and test points (right panel) are scattered on
    top. Each panel title reports the accuracy on the points it shows.

    Parameters:
        estimator: object exposing ``fit(X, y)`` and ``predict(X)``.
        train_data, test_data: arrays indexed as ``data[:, 0]`` / ``data[:, 1]``.
        train_labels, test_labels: labels used for point colours and accuracy.
        colors: colormap for the scattered points; a three-colour
            ``ListedColormap`` is created when omitted.
        light_colors: colormap for the background surface; a lighter
            three-colour ``ListedColormap`` is created when omitted.
    """
    # The defaults used to be ``colors = colors`` / ``light_colors = light_colors``,
    # which are evaluated when the ``def`` statement runs and raise NameError
    # unless same-named globals already exist. Resolving them at call time keeps
    # the function definable on its own; explicit arguments still take precedence.
    if colors is None:
        colors = ListedColormap(['red', 'blue', 'yellow'])
    if light_colors is None:
        light_colors = ListedColormap(['lightcoral', 'lightblue', 'lightyellow'])

    # fit model
    estimator.fit(train_data, train_labels)

    # set figure size
    pyplot.figure(figsize=(16, 6))

    # The surface is computed once, on a mesh derived from the train data,
    # and reused as the background of both panels.
    xx, yy = get_meshgrid(train_data)
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    mesh_predictions = np.array(estimator.predict(mesh_points)).reshape(xx.shape)

    panels = (
        ('Train data', train_data, train_labels),
        ('Test data', test_data, test_labels),
    )
    for position, (caption, points, labels) in enumerate(panels, 1):
        pyplot.subplot(1, 2, position)
        pyplot.pcolormesh(xx, yy, mesh_predictions, cmap=light_colors)
        pyplot.scatter(points[:, 0], points[:, 1], c=labels, s=100, cmap=colors)
        accuracy = metrics.accuracy_score(labels, estimator.predict(points))
        pyplot.title('{}, accuracy={:.2f}'.format(caption, accuracy))

In [10]:
# Load the XBTUSD price bars from CSV and discard the '<TICKER>' and '<PER>' columns.
# Original author's note: beginning in 20171215 8:56:00
datas = pd.read_csv('XBTUSD_20170101_20180226.csv').drop(['<TICKER>', '<PER>'], axis=1)
# Disabled experiments kept from the original cell:
#pyplot.plot(np.linspace(0,100,106000),datas['<OPEN>'])
#datas = datas.drop(range(0,5000), axis =0)
datas.head()


Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOLUME>
0,20170101,30000,968.29,968.29,968.29,968.29,0.0
1,20170101,30100,968.29,968.76,968.49,968.70,12993.0
2,20170101,30200,968.70,968.70,967.20,968.43,73800.0
3,20170101,30300,968.43,968.00,967.21,967.21,3500.0
4,20170101,30400,967.21,967.21,966.74,966.97,15969.0
5,20170101,30500,966.97,966.97,966.97,966.97,300.0
6,20170101,30600,966.97,967.00,967.00,967.00,13231.0
7,20170101,30700,967.00,966.89,966.89,966.89,500.0
8,20170101,30800,966.89,966.89,966.89,966.89,0.0
9,20170101,30900,966.89,966.89,966.89,966.89,0.0


In [76]:
# Percentages are measured relative to a per-period mean price.
def mean_period(datas, pediod):
    """Return the mean '<OPEN>' value of each consecutive block of rows.

    Parameters:
        datas: DataFrame with an '<OPEN>' column.
        pediod: number of rows per block (the spelling is kept so existing
            keyword callers keep working).

    Returns:
        A list with one mean per full block of ``pediod`` rows, followed by
        the mean of the remaining rows. When the frame has no more rows than
        one block, the list holds the single mean of all rows.
    """
    # The body used to read a global named ``period`` and silently ignore the
    # argument; bind the argument to that name so the function is self-contained.
    period = pediod
    opens = datas['<OPEN>']
    total = datas.shape[0]

    averages = []
    # Start of the trailing block; stays 0 when the loop never runs, which
    # previously left the loop variable unbound.
    tail_start = 0
    for i in np.arange(period, total, period):
        # .iloc makes the positional meaning of the slice explicit.
        averages.append(opens.iloc[i - period:i].mean())
        tail_start = i
    averages.append(opens.iloc[tail_start:total].mean())
    return averages
    
def make_serias(datas,period,bound,averages):
    """Convert the '<OPEN>' prices into a sequence of fixed-size up/down moves.

    A reference price starts at the first open. For every row the step size is
    ``bound`` percent of the mean of the block the row belongs to
    (``averages[row // period]``). When the open rises above the reference by
    more than one step, the reference is raised by one step and ``-1`` is
    recorded; when it falls below by more than one step, the reference is
    lowered by one step and ``1`` is recorded. Rows inside the band record
    nothing, so the result is usually shorter than the input.

    Parameters:
        datas: DataFrame with an '<OPEN>' column.
        period: number of rows covered by each entry of ``averages``.
        bound: step size as a percentage of the block mean.
        averages: per-block mean prices, indexable by block number.

    Returns:
        A list of ``-1`` / ``1`` markers; empty when ``datas`` has no rows.
    """
    # One positional array instead of a label lookup per row.
    prices = datas['<OPEN>'].values
    result_serias = []
    if len(prices) == 0:
        return result_serias

    current_price = prices[0]
    for i, price in enumerate(prices):
        # Floor division: plain ``/`` yields a float under true division,
        # which cannot be used as a list index.
        step = averages[i // period] * bound / 100.0
        if current_price + step < price:
            current_price = current_price + step
            result_serias.append(-1)
        elif current_price - step > price:
            current_price = current_price - step
            result_serias.append(1)
    return result_serias



def prob_a_z(serias):
    """Count negative and positive entries of ``serias`` and their ratio.

    Returns:
        A dict with
        'success'         - number of entries below zero,
        'failure'         - number of entries above zero,
        'prob of success' - 'success' divided by the sum of both counts, or
                            NaN when there is no non-zero entry (this used to
                            raise ZeroDivisionError).
    """
    a = 0
    z = 0
    for value in serias:
        if value < 0:
            a += 1
        if value > 0:
            z += 1

    decided = a + z
    probability = float(a) / decided if decided else float('nan')
    return {'success' : a,
            'failure' : z,
            'prob of success': probability}

def make_past(serias,len_of_past):
    """Turn a sequence into (history window, next value) training pairs.

    For every position ``end`` from ``len_of_past`` to the end of ``serias``
    the ``len_of_past`` preceding items form one feature row and the item at
    ``end`` is its target.

    Returns:
        ``[features, targets]`` as two numpy arrays.
    """
    positions = range(len_of_past, len(serias))
    windows = [serias[end - len_of_past:end] for end in positions]
    targets = [serias[end] for end in positions]
    return [np.array(windows), np.array(targets)]
        

In [77]:
# Experiment settings.
bound = 5.0        # step size, in percent of the per-period mean price
period = 60        # rows per averaging block
rounding = 1.0     # NOTE(review): not read anywhere in the visible notebook
len_of_past = 10   # number of previous moves used as features

# Price series -> up/down moves -> (history window, next move) samples.
serias = make_serias(datas, period, bound, averages=mean_period(datas, period))
prob_a_z(serias)
data = make_past(serias, len_of_past)
X, y = data

In [None]:
# Prepare the data for training: hold out 30% of the samples as the test set.
(train_data, test_data,
 train_labels, test_labels) = cross_validation.train_test_split(
    X, y, test_size=0.3, random_state=0)

# Решающее дерево

In [100]:
# Train the decision tree with class weights balanced across the labels.
tree_class = tree.DecisionTreeClassifier(class_weight='balanced')
# Kept as a separate last statement so the notebook displays the fitted estimator.
tree_class.fit(X=train_data, y=train_labels)



DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [101]:
# Accuracy, precision and recall of the fitted tree on the held-out samples.
# The attribute access below is not displayed because it is not the last
# statement of the cell.
tree_class.feature_importances_
# Predict once and reuse the result for all three metrics.
tree_predictions = tree_class.predict(test_data)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(metrics.accuracy_score(test_labels, tree_predictions))
print(metrics.precision_score(test_labels, tree_predictions))
print(metrics.recall_score(test_labels, tree_predictions))


0.542372881356
0.489130434783
0.569620253165


In [93]:
# Show the first 20 true labels next to the first 20 predicted labels.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(test_labels[0:20])
print(tree_class.predict(test_data)[0:20])


[ 1 -1 -1  1 -1 -1  1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1 -1 -1]
[-1 -1  1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1  1  1  1 -1 -1  1]


In [None]:
len_of_past = 100

# Regression samples: the target is the open price at row i; the features are
# open prices from a window that starts 5000 + len_of_past rows earlier
# (len_of_past - 3 values) plus the mean volume over the first
# len_of_past - 10 rows of that same window.
X = []
y = []

i = 5000 + len_of_past
while i < datas.shape[0]:
    window_start = i - len_of_past - 5000
    past_prices = list(datas['<OPEN>'][window_start:i - 5000 - 3])
    mean_volume = np.array(datas['<VOLUME>'][window_start:i - 5000 - 10]).mean()
    X.append(past_prices + [mean_volume])
    y.append(datas['<OPEN>'][i])
    i += 1

X_all = np.array(X)
y_all = np.array(y)

X


In [851]:
# Scratch check: wrapping a one-element list in list() yields an equal list.
list([5])

[5]

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:
# Random 70/30 split of the regression samples; train_data and test_data are
# rebound here to the regression features.
train_data, test_data, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=0)

In [None]:
# Trained model: a random forest regressor fitted on the train split.
# NOTE(review): `scores` and `depth` are not read anywhere in the visible notebook.
scores = []
depth = np.linspace(1,10,10)

clf = ensemble.RandomForestRegressor(max_depth=20, random_state=2, n_estimators=200)
clf.fit(train_data, y_train)
predictions = clf.predict(test_data)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(clf.feature_importances_)

# Plot predictions against true values for the first 100 test samples, or for
# all of them when the test set is smaller (the fixed 100-point x axis used to
# fail in that case).
shown = min(100, len(predictions))
steps = np.linspace(1, shown, shown)
pyplot.plot(steps, predictions[0:shown])
pyplot.plot(steps, y_test[0:shown])
pyplot.show()

# Mean absolute error expressed as a fraction of the mean target value.
print(metrics.mean_absolute_error(y_test, predictions) / y_test.mean())
print(test_data)
print(len(X_all))


In [None]:

# Evaluate the forest on the samples from index `tail_start` onwards and
# measure how often a predicted rise coincides with a real rise.
# NOTE(review): clf was fitted on a random 70% of X_all, which is built from
# this same X, so most of these rows were seen during training - confirm
# whether a chronological split was intended.
tail_start = 500000


def _direction(previous, current):
    """Encode a step as 2 (rise), 1 (flat) or 0 (fall); None if not comparable."""
    if previous < current:
        return 2
    if previous == current:
        return 1
    if previous > current:
        return 0
    # All three comparisons are false only for NaN values.
    return None


predictions = clf.predict(X[tail_start:len(X)])
print(predictions)
pyplot.figure(figsize=(30,30))
# Plot at most 100 points; fewer when the tail is shorter than that.
shown = min(100, len(predictions))
steps = np.linspace(1, shown, shown)
pyplot.plot(steps, predictions[0:shown], color = 'red')
pyplot.plot(steps, y[tail_start:tail_start + shown])

# Direction classes of consecutive predictions and of consecutive true values.
# A pair is skipped in BOTH lists when either side is not comparable, so the
# two lists always stay aligned index by index.
predict_class = []
y_test_1 = []
for i in range(1, len(predictions)):
    predicted_move = _direction(predictions[i - 1], predictions[i])
    actual_move = _direction(y[i - 1 + tail_start], y[i + tail_start])
    if predicted_move is None or actual_move is None:
        continue
    predict_class.append(predicted_move)
    y_test_1.append(actual_move)

# Among predicted rises: tp1 counts real rises, fp1 counts real falls
# (flat real moves are counted in neither).
tp1 = 0
fp1 = 0
for predicted_move, actual_move in zip(predict_class, y_test_1):
    if predicted_move == 2:
        if actual_move == 2:
            tp1 += 1
        if actual_move == 0:
            fp1 += 1

# Share of predicted rises that were real rises; NaN instead of a
# ZeroDivisionError when nothing was counted.
up_calls = fp1 + tp1
print(float(tp1) / up_calls if up_calls else float('nan'))
        
    




In [None]:
# NOTE(review): neither `p` nor `e` is assigned anywhere in the visible
# notebook - presumably left over from cells that were deleted; confirm before
# running. Python 2 print statement: writes both values separated by a space.
print p,e