# Prediction with classifier

In [5]:
%matplotlib inline
import pandas as pd
from datetime import datetime
import numpy as np
import seaborn as sns
#import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import xgboost
from model.models import TradeModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.grid_search import GridSearchCV
from pandas_ml import ConfusionMatrix
from service.files_service import _get_files



Given a df with trade features(ATR,Keltner,Donchian), forecast if the next period will close with a pip higher than 10.

In [6]:
all_files = _get_files(folder = 'data/all_data/resampled_D/',extension='.csv',as_dict=True,filter_on='_D_')

In [7]:
def make_prediction(tm,date):
    df = tm.df.copy()
    
    predictors =df.columns.tolist()
    df['target']=((df.Close-df.Close.shift(-1))*10000>=10)*1
    
    clf = xgboost.XGBClassifier()
    
    df_train = df[df.index<date]
    df_test = df[df.index==date]
      
    X_train,y_train = df_train[predictors].values,df_train.target.values
    X_test, y_test  = df_test[predictors].values,df_test.target.values
    
    clf = cv_optimize(clf,{},X_train,y_train)
    
    df_test['prediction'] = clf.predict(X_test)
    
    score =  clf.score(X_test,y_test)
    #print confusion_matrix(y_test,df_test['prediction'])
    #print classification_report(y_test,df_test['prediction'], target_names=['Not pip','pip'])
    #df_test[['prediction','target']].plot(kind='bar')
    
    return y_test[0],df_test['prediction'][0]

def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=-1)
    gs.fit(Xtrain, ytrain)
    
    return gs.best_estimator_    

In [8]:
def compute_confusion_matrix(tm):
    real_result = []
    predicted_result=[]
    test_sample=100
    for dt in (tm.df.tail(test_sample).index.values):
        real,predicted = make_prediction(tm, dt)
        real_result.append(real)
        predicted_result.append(predicted)
    return ConfusionMatrix(real_result, predicted_result)

In [9]:
confusion_matrixes = {}
for k,v in all_files.iteritems():
    tm =TradeModel('data/all_data/resampled_D/'+v, name=k, datetime_col='ds')
    confusion_matrixes[k]=compute_confusion_matrix(tm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
confusion_matrixes['resampled_D_BRENTCMDUSD']

Predicted   0   1  __all__
Actual                    
0          42  11       53
1          15  32       47
__all__    57  43      100

In [14]:
import pickle
pickle.dump(confusion_matrixes, open('data/confusion_matrix1.p', "wb"))


In [135]:
cmo = pickle.load(open('data/confusion_matrix1.p', "rb"))

In [136]:
df_CM_all = pd.DataFrame()
for k,v in cmo.iteritems():
    print v.stats_class.loc['PPV: Pos Pred Value (Precision)'][1]
    
    df_cm = cmo[k].to_dataframe()
    df_cm.index=[k +'_Actual_Negative',k +'_Actual_Positive']
    df_cm.columns=['Predicted_negative','Predicted_Positive']
    df_CM_all = pd.concat([df_CM_all,df_cm],axis=0)


0.666666666667
0.775
0.617647058824
0.745098039216
0.744186046512
0.641025641026
0.861111111111
0.655172413793


In [106]:
df_CM_all['Pip_10_was_right']

Unnamed: 0,Predicted_negative,Predicted_Positive
resampled_D_LIGHTCMDUSD_Actual_Negative,23,22
resampled_D_LIGHTCMDUSD_Actual_Positive,11,44
resampled_D_EURRUB_Actual_Negative,41,9
resampled_D_EURRUB_Actual_Positive,19,31
resampled_D_USDNOK_Actual_Negative,48,13
resampled_D_USDNOK_Actual_Positive,18,21
resampled_D_USDMXN_Actual_Negative,34,13
resampled_D_USDMXN_Actual_Positive,15,38
resampled_D_BRENTCMDUSD_Actual_Negative,42,11
resampled_D_BRENTCMDUSD_Actual_Positive,15,32


Unnamed: 0,Predicted_negative,Predicted_Positive
Actual_Negative,23,22
Actual_Positive,11,44
