# Read Offline from CSV

## Overview
The data were manually downloaded from:

 - **WSJ.com**
  
 - **Investing.com**

  
### Offline Data

| No | Symbol | Index Name | Source | URL Address |
| :--- | :----- | :----- | :---- | :----- |
| 1 | GCM3 | Gold Futures - Jun 23 | Investing | https://www.investing.com/commodities/gold-historical-data |
| 2 | DXY | US Dollar Index  | Investing | https://www.investing.com/indices/usdollar-historical-data |
| 3 | HGK3 | Copper Futures - May 23 | Investing | https://www.investing.com/commodities/copper-historical-data |
| 4 | COMP | NASDAQ Composite Index | WSJ | https://www.wsj.com/market-data/quotes/index/COMP/historical-prices |
| 5 | DJIA | Dow Jones Industrial Average | WSJ | https://www.wsj.com/market-data/quotes/index/DJIA/historical-prices |
| 6 | SPX | S&P 500 index | WSJ | https://www.wsj.com/market-data/quotes/index/SPX/historical-prices |

---
### Data fields

WSJ: `['Date', 'Open', 'High', 'Low', 'Close']`

Investing: `['Date', 'Price', 'Open', 'High', 'Low', 'Vol', 'Change']`

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
import os
import glob

import pickle

## Custom functions

In [2]:
def monthly_average(df):
    '''
    Average daily records into one row per calendar month.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index can be converted to a ``pd.DatetimeIndex``.
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Mean of every column per (year, month), indexed by ``Date`` set to
        the first day of the month, in ascending order. Months that have
        no rows in ``df`` do not appear in the result.
    '''
    dates = pd.DatetimeIndex(df.index)

    # assign() returns a new frame, so the caller's frame does not gain
    # helper 'year'/'month' columns as a side effect of calling this function.
    keyed = df.assign(year=dates.year, month=dates.month)

    dfg = keyed.groupby(['year', 'month'], as_index=False).mean()

    # Rebuild a proper date from the group keys; day is pinned to 1.
    dfg['Date'] = pd.to_datetime(dict(year=dfg.year, month=dfg.month, day=1))

    return dfg.drop(columns=['month', 'year']).set_index('Date', drop=True)

## Set Directories

In [3]:
# The parent of the current working directory is used as the project root
ROOT_DIR = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Raw-data folder: <root>/data/raw/
# Built with os.path.join so the separators match the running OS; the
# trailing '' component keeps the final separator on the string.
path = os.path.join(ROOT_DIR, 'data', 'raw', '')
print(path)

D:\gitProjects\WTI_Crude_Oil_Price_Prediction_Using_ML\data\raw\


In [4]:
# Use glob to get all the csv files in the raw-data folder.
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# load order (and therefore the merged column order) reproducible.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
# display the csv file list
display(csv_files)

['D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\COMP.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\DJIA.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\DXY.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\GCM3.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\HGK3.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\SPX.csv']

## Load all CSV files

In [5]:
def get_file_name(file_name):
    '''
    Return the base name of a file path without its extension.

    Both backslash and forward-slash separated paths are accepted, so the
    result does not depend on the OS the notebook runs on. Only the final
    extension is removed.

    Parameters
    ----------
    file_name : str
        Full or relative path of a file, e.g. ``'D:\\data\\raw\\COMP.csv'``.

    Returns
    -------
    str
        The file name without directory and extension, e.g. ``'COMP'``.
    '''
    # Normalise Windows separators so os.path.basename splits correctly on
    # every platform (os.path on Windows also accepts '/').
    base = os.path.basename(file_name.replace("\\", "/"))
    name, _ext = os.path.splitext(base)

    return name

In [6]:
# Final list holding one DataFrame per CSV file. Each frame has a single
# column named after the file and is indexed by Date.
read_data = []

# loop over the list of csv files
for f in csv_files:
    name = get_file_name(f)

    # read the csv file; thousands=',' lets values such as "12,123.47" parse
    df = pd.read_csv(f, sep=',', thousands=',')

    # The two sources are told apart by their header. The branches are
    # mutually exclusive, and an unknown layout fails loudly instead of
    # silently reusing the settings left over from the previous file.
    if ' Close' in df.columns:
        # this header carries leading spaces in its column names; rename them
        df.columns = ['Date', 'Open', 'High', 'Low', 'Close']
        value = 'Close'
        format_date = '%m/%d/%y'
    elif 'Price' in df.columns:
        df.columns = ['Date', 'Price', 'Open', 'High', 'Low', 'Vol', 'Change']
        value = 'Price'
        format_date = '%m/%d/%Y'
    else:
        raise ValueError(
            'Unrecognised CSV layout in ' + str(f) + ': ' + str(list(df.columns))
        )

    # select columns; copy() so the assignments below write to an independent
    # frame rather than a slice of the original (avoids SettingWithCopyWarning)
    df = df[['Date', value]].copy()

    # change data type to numeric and date
    df[value] = pd.to_numeric(df[value])
    df['Date'] = pd.to_datetime(df['Date'], format=format_date, errors="raise")

    # name the value column after the file and set Date as index
    df = df.rename(columns={value: name}, errors="raise")
    df = df.set_index('Date', drop=True)

    # print the success report
    print('Sucess:', name)
    display(df.head())

    read_data.append(df)
    


Sucess: COMP


Unnamed: 0_level_0,COMP
Date,Unnamed: 1_level_1
2023-04-14,12123.47
2023-04-13,12166.27
2023-04-12,11929.34
2023-04-11,12031.88
2023-04-10,12084.36


Sucess: DJIA


Unnamed: 0_level_0,DJIA
Date,Unnamed: 1_level_1
2023-04-14,33886.47
2023-04-13,34029.69
2023-04-12,33646.5
2023-04-11,33684.79
2023-04-10,33586.52


Sucess: DXY


Unnamed: 0_level_0,DXY
Date,Unnamed: 1_level_1
2023-04-01,101.55
2023-03-01,102.51
2023-02-01,104.95
2023-01-01,102.1
2022-12-01,103.52


Sucess: GCM3


Unnamed: 0_level_0,GCM3
Date,Unnamed: 1_level_1
2023-04-01,2002.2
2023-03-01,1969.0
2023-02-01,1853.2
2023-01-01,1962.2
2022-12-01,1842.2


Sucess: HGK3


Unnamed: 0_level_0,HGK3
Date,Unnamed: 1_level_1
2023-04-01,4.1065
2023-03-01,4.0945
2023-02-01,4.074
2023-01-01,4.226
2022-12-01,3.8105


Sucess: SPX


Unnamed: 0_level_0,SPX
Date,Unnamed: 1_level_1
2023-04-14,4137.64
2023-04-13,4146.22
2023-04-12,4091.95
2023-04-11,4108.94
2023-04-10,4109.11


## Data

### Print the features and sizes

In [7]:
# One summary line per loaded series: position, column name and row count.
for idx, frame in enumerate(read_data):
    label = frame.columns[0]
    n_rows = len(frame)
    print(''.join([str(idx), ': ', label, ' (1*', str(n_rows), ')']))

0: COMP (1*5858)
1: DJIA (1*5858)
2: DXY (1*280)
3: GCM3 (1*280)
4: HGK3 (1*280)
5: SPX (1*5858)


### Merge all data frames (ordered merge on Date)

In [8]:
# Reduce every series to monthly averages first ...
monthly_frames = []
for daily in read_data:
    monthly = monthly_average(daily)
    monthly.index = pd.to_datetime(monthly.index, format='%Y%m%d', errors='coerce')
    monthly_frames.append(monthly)

# ... then fold them into one table, one merge_ordered on Date at a time.
dfg = monthly_frames[0]
for monthly in monthly_frames[1:]:
    dfg = pd.merge_ordered(dfg, monthly, on='Date')

## Check the final dataframe

In [9]:
# Show the merged table with values rounded to two decimals (display only;
# dfg itself is left unrounded).
display(dfg.round(decimals=2))

Unnamed: 0,Date,COMP,DJIA,DXY,GCM3,HGK3,SPX
0,2000-01-01,4013.49,11281.26,105.13,286.2,0.85,1425.59
1,2000-02-01,4410.87,10541.93,105.92,295.4,0.80,1388.87
2,2000-03-01,4802.99,10483.39,105.44,281.4,0.81,1442.21
3,2000-04-01,3863.64,10944.31,110.14,276.0,0.80,1461.36
4,2000-05-01,3528.42,10580.27,108.74,274.8,0.82,1418.48
...,...,...,...,...,...,...,...
275,2022-12-01,10839.00,33482.26,103.52,1842.2,3.81,3912.38
276,2023-01-01,11013.99,33656.00,102.10,1962.2,4.23,3960.66
277,2023-02-01,11785.07,33648.26,104.95,1853.2,4.07,4079.68
278,2023-03-01,11637.09,32483.48,102.51,1969.0,4.09,3968.56


## Store the data

In [10]:
# Output folder: <root>/data/interim/
# Built with os.path.join so the separators match the running OS; the
# trailing '' component keeps the final separator on the string.
save_dir = os.path.join(ROOT_DIR, 'data', 'interim', '')

# Create the folder if it is missing so to_csv does not fail on a fresh clone
os.makedirs(save_dir, exist_ok=True)

# Set a file name
file_save = os.path.join(save_dir, 'offline_data.csv')

# NOTE(review): to_csv is called with its defaults, so the frame's index is
# written as the first, unnamed column. Left unchanged in case downstream
# readers rely on that layout; confirm before passing index=False.
dfg.to_csv(file_save)

print('Save:\n', file_save)

Save:
 D:\gitProjects\WTI_Crude_Oil_Price_Prediction_Using_ML\data\interim\offline_data.csv
