# Candlestick formation - batched
In this notebook we take our raw data and do the following:

1. Import and review the data - check for missing values, etc.
2. Test a sample candlestick formation using our ```candle_stick.py``` script
3. Create our candlestick data using the same script and save it in our ```model_data``` directory under ```image_data```

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import gc

from mpl_finance import candlestick_ochl
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from time import sleep
import multiprocessing as mp

In [3]:
# Locate the raw-data CSV to load (the initial download).
# os.listdir() returns names in arbitrary order and also lists non-data
# entries, so keep only the CSV files and sort them; "the first file" is then
# the same on every run and machine.
raw_dir = os.path.join(os.curdir, 'raw_data')
csv_names = sorted(
    name for name in os.listdir(raw_dir) if name.lower().endswith('.csv')
)
if not csv_names:
    # Fail with a clear message instead of a bare IndexError on an empty folder.
    raise FileNotFoundError(f'no .csv files found in {raw_dir}')
file = os.path.join(raw_dir, csv_names[0])

file

'./raw_data//GBP_USD_H1_2016-01-01_2018-01-01.csv'

In [14]:
# Read the raw CSV into a DataFrame.
# Column names are supplied here through `names=`, in file order.
headers = ['date', 'complete', 'open', 'high', 'low', 'close', 'volume']

# 'date' is read as text and 'complete' as a flag; every remaining column
# (open, high, low, close, volume) is numeric.
my_dtypes = {'date': 'str', 'complete': 'bool'}
my_dtypes.update({numeric_col: 'float' for numeric_col in headers[2:]})

# Only the leading 'date' column is converted to datetime while reading.
my_parse_dates = headers[:1]

df = pd.read_csv(
    file,
    names=headers,
    dtype=my_dtypes,
    parse_dates=my_parse_dates,
)

In [15]:
# Keep only the OHLCV columns, in a fixed order; this also removes 'complete'.
# A plain column selection is used instead of an in-place drop so the cell can
# be re-run safely: dropping 'complete' a second time would raise KeyError
# because the column is already gone.
df = df[['date', 'open', 'high', 'low', 'close', 'volume']]

In [16]:
# Preview the first rows to confirm the columns loaded in the expected order.
df.head()

Unnamed: 0,date,open,high,low,close,volume
0,2016-01-03 22:00:00,1.47352,1.47464,1.47351,1.47354,1068.0
1,2016-01-03 23:00:00,1.47343,1.47382,1.47266,1.47356,938.0
2,2016-01-04 00:00:00,1.47364,1.47412,1.4724,1.47278,570.0
3,2016-01-04 01:00:00,1.47284,1.47301,1.46936,1.47177,1133.0
4,2016-01-04 02:00:00,1.47183,1.47234,1.47041,1.47184,865.0


In [17]:
# Preview the last rows to see where the loaded data ends.
df.tail()

Unnamed: 0,date,open,high,low,close,volume
12403,2017-12-29 17:00:00,1.35278,1.35362,1.35224,1.3524,3945.0
12404,2017-12-29 18:00:00,1.35242,1.35254,1.35181,1.35248,2445.0
12405,2017-12-29 19:00:00,1.3524,1.35248,1.35158,1.35178,2180.0
12406,2017-12-29 20:00:00,1.35178,1.3518,1.35008,1.3501,2257.0
12407,2017-12-29 21:00:00,1.35012,1.35112,1.34951,1.35045,2593.0


In [18]:
# Summarise dtypes and non-null counts per column - our missing-values check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12408 entries, 0 to 12407
Data columns (total 6 columns):
date      12408 non-null datetime64[ns]
open      12408 non-null float64
high      12408 non-null float64
low       12408 non-null float64
close     12408 non-null float64
volume    12408 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 581.7 KB


In [19]:
# Make sure 'date' is a datetime column.
# NOTE(review): read_csv was already asked to parse 'date' via parse_dates, so
# this conversion should be a no-op safeguard - confirm before removing it.
df['date'] = pd.to_datetime(df['date'])

In [21]:
# Halve the data set for this demo. Larger projects will contain many more
# pictures, so they will use more batch splits, handled in a separate script.
batch_size = len(df) // 2
batch_size

6204

In [22]:
# Cut the frame at row `batch_size` into two independent copies.
# In production the number of splits will be worked out automatically.
df1, df2 = (
    df.iloc[:batch_size].copy(),
    df.iloc[batch_size:].copy(),
)

In [25]:
# Last row of the first batch - should sit immediately before df2's first row.
df1.tail(1)

Unnamed: 0,date,open,high,low,close,volume
6203,2016-12-30 10:00:00,1.23124,1.23162,1.22874,1.23042,2194.0


In [26]:
# First row of the second batch - should follow directly after df1's last row.
df2.head(1)

Unnamed: 0,date,open,high,low,close,volume
6204,2016-12-30 11:00:00,1.23037,1.23101,1.22864,1.23026,1055.0


In [27]:
# Last row of the second batch - should match the last row of the full frame.
df2.tail(1)

Unnamed: 0,date,open,high,low,close,volume
12407,2017-12-29 21:00:00,1.35012,1.35112,1.34951,1.35045,2593.0


# Testing our Plotting - using ```omegacandlestick.py```