# Data Challenge : Prediction of stock market Volume
By <i>Fabrice ZAPFACK</i> & <i>Basile CALDERAN</i>

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [5]:
trainOutPut=pd.read_csv('data/challenge_output_data_training_file_prediction_of'
                      '_transaction_volumes_in_financial_markets.csv', sep=';')

## Import Data

In [3]:
trainDF = pd.read_csv('data/training_input.csv')

In [4]:
testDF = pd.read_csv('data/testing_input.csv')

In [6]:
trainDF['date']=pd.to_datetime(trainDF['date'],unit='D')

In [7]:
testDF['date']=pd.to_datetime(testDF['date'],unit='D')

In [None]:
trainDF['year'] = trainDF['date'].dt.year
trainDF['month'] = trainDF['date'].dt.month
trainDF['day'] = trainDF['date'].dt.day
trainDF['weekday'] = trainDF['date'].dt.weekday
trainDF['week'] = trainDF['date'].dt.week
trainDF['n_days'] = trainDF['date'].apply(lambda date: (date - pd.to_datetime("1970-01-01")).days)
        
trainDF = trainDF.join(pd.get_dummies(trainDF['year'], prefix='y'))
trainDF = trainDF.join(pd.get_dummies(trainDF['month'], prefix='m'))
trainDF = trainDF.join(pd.get_dummies(trainDF['day'], prefix='d'))
trainDF = trainDF.join(pd.get_dummies(trainDF['weekday'], prefix='wd'))
trainDF = trainDF.join(pd.get_dummies(trainDF['week'], prefix='w'))


In [None]:
testDF['year'] = testDF['date'].dt.year
testDF['month'] = testDF['date'].dt.month
testDF['day'] = testDF['date'].dt.day
testDF['weekday'] = testDF['date'].dt.weekday
testDF['week'] = testDF['date'].dt.week
testDF['n_days'] = testDF['date'].apply(lambda date: (date - pd.to_datetime("1970-01-01")).days)
        
testDF = testDF.join(pd.get_dummies(testDF['year'], prefix='y'))
testDF = testDF.join(pd.get_dummies(testDF['month'], prefix='m'))
testDF = testDF.join(pd.get_dummies(testDF['day'], prefix='d'))
testDF = testDF.join(pd.get_dummies(testDF['weekday'], prefix='wd'))
testDF = testDF.join(pd.get_dummies(testDF['week'], prefix='w'))

In [None]:
trainDF=trainDF.drop(["year",'month','day','weekday','week'],axis=1)

In [None]:
testDF=testDF.drop(["year",'month','day','weekday','week'],axis=1)

In [None]:
trainDF2=trainDF.drop(["ID",'date','product_id'],axis=1)


In [None]:
#testDF=testDF.sort_values(by=["ID"],axis=0)

In [12]:
idsFrame=testDF['ID']

In [19]:
idsFrame.head()

0    618557
1    618558
2    618559
3    618560
4    618561
Name: ID, dtype: int64

In [14]:
testDF2=testDF.drop(["ID",'date','product_id'],axis=1)

In [15]:
trainDF2[trainDF2<0]=np.nan

In [16]:
testDF2[testDF2<0]=np.nan

In [17]:
for i in range(trainDF2.shape[0]):
    if np.isnan(trainDF2.iloc[i,:]).sum()/53>=0.8:
        trainDF2.loc[i,:]=trainDF2.mean()
trainDF2.shape

(613220, 164)

In [18]:
for i in range(testDF2.shape[0]):
    if np.isnan(testDF2.iloc[i,:]).sum()/53>=0.8:
        testDF2.loc[i,:]=testDF2.mean()
testDF2.shape

(614098, 164)

In [20]:
InterpolTrain=trainDF2.interpolate(method='linear',limit_direction='both',limit=10000, axis=1)

In [None]:
#Interpol=pd.concat([trainOutPut["ID"],InterpolTrain],axis=1)

In [21]:
InterpolTest=testDF2.interpolate(method='linear',limit_direction='both',limit=100000, axis=1)

In [None]:
#FinalInterpol=pd.merge(Interpol,trainOutPut,on=["ID"])

# Attention : ici pour submit

In [22]:
featuresTrain = InterpolTrain.drop(['09:30:00'], axis=1)
#featuresTrain = InterpolTrain
featuresTest =  InterpolTest.drop(['09:30:00'], axis=1)
X_train = featuresTrain.values
y_train = trainOutPut['TARGET'].values
X_test = featuresTest.values

In [23]:
print X_train.shape
print y_train.shape
print X_test.shape


(613220, 163)
(613220,)
(614098, 163)


In [24]:
import xgboost as xgb
param ={}  
param['objective'] = 'reg:linear'
param['eta'] = 0.09
param['max_depth'] = 100
param['silent'] = 1
param['nthread'] = 4
xg_train = xgb.DMatrix( X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

def evalerror(preds,xg_train):
    y_test = xg_train.get_label()
    return 'error',  np.mean(np.abs((preds-y_test ) / y_test)) * 100


watchlist = [ (xg_train,'train') ]
num_round = 10
bst = xgb.train(param, xg_train, num_round, watchlist ,feval=evalerror);
#bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction
y_pred = bst.predict( xg_test );
#print np.mean(np.abs((y_pred-y_test ) / y_test)) * 100
#nround = 50
#bst = xgb(param=param, data = X_train, label = y_train, nrounds=nround)
#y_pred = bst.predict( xg_test );

[0]	train-error:90.964347
[1]	train-error:82.795686
[2]	train-error:75.411004
[3]	train-error:68.726921
[4]	train-error:62.676269
[5]	train-error:57.189232
[6]	train-error:52.209294
[7]	train-error:47.685689
[8]	train-error:43.573239
[9]	train-error:39.831725


In [27]:
outFrame=pd.DataFrame(y_pred, columns=['TARGET'])
outFrame=pd.concat([idsFrame, outFrame], axis=1)
outFrame.to_csv('out.csv',index=False)

# Attention : ici pour test

In [None]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0)
print X_train.shape
print y_train.shape

In [None]:
import xgboost as xgb
param ={}  
param['objective'] = 'reg:linear'
param['eta'] = 0.09
param['max_depth'] = 100
param['silent'] = 1
param['nthread'] = 4
xg_train = xgb.DMatrix( X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

def evalerror(preds,xg_train):
    y_test = xg_train.get_label()
    return 'error',  np.mean(np.abs((preds-y_test ) / y_test)) * 100


watchlist = [ (xg_train,'train') ]
num_round = 10
bst = xgb.train(param, xg_train, num_round, watchlist ,feval=evalerror);
#bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction
y_pred = bst.predict( xg_test );
#print np.mean(np.abs((y_pred-y_test ) / y_test)) * 100
#nround = 50
#bst = xgb(param=param, data = X_train, label = y_train, nrounds=nround)
#y_pred = bst.predict( xg_test );

In [None]:
print np.mean(np.abs((y_pred-y_test ) / y_test)) * 100