# Read Offline from CSV

## Overview
Data manually downloaded from

 - **WSJ.com**
  
 - **Investing.com**

  
### Offline Data

| No | Symbol | Index Name | Source | URL Address |
| :--- | :----- | :----- | :---- | :----- |
| 1 | GC | gold_futures | investing | [Gold Futures Historical Data](https://www.investing.com/commodities/gold-historical-data) |
| 2 | DXY | usd_index | investing | [US Dollar Index Historical Data](https://www.investing.com/indices/usdollar-historical-data) |
| 3 | HG | copper_futures | investing | [Copper Futures Historical Data](https://www.investing.com/commodities/copper-historical-data) |
| 4 | COMP | nasdaq_comp | WSJ | [NASDAQ Composite Index](https://www.wsj.com/market-data/quotes/index/COMP/historical-prices) |
| 5 | DJIA | dow_jones | WSJ | [Dow Jones Industrial Average](https://www.wsj.com/market-data/quotes/index/DJIA/historical-prices) |
| 6 | SPX | sp500 | WSJ | [S&P 500 Index](https://www.wsj.com/market-data/quotes/index/SPX/historical-prices) |

### Data fields

WSJ: `['Date', 'Open', 'High', 'Low', 'Close']`

Investing: `['Date', 'Price', 'Open', 'High', 'Low', 'Vol', 'Change']`

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
import os
import glob

import pickle

## Custom functions

In [2]:
def monthly_average(df):
    '''
    Average daily records per calendar month.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds dates (anything pd.DatetimeIndex accepts).
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month) present in the input, holding the mean of
        every column, indexed by 'Date' set to the first day of that month.
    '''

    dates = pd.DatetimeIndex(df.index)

    # assign() returns a new frame, so the helper columns never leak into
    # the caller's DataFrame.
    keyed = df.assign(month=dates.month, year=dates.year)

    dfg = keyed.groupby(['year', 'month'], as_index=False).mean()

    # Represent each month by its first day
    dfg['Date'] = pd.to_datetime(dict(year=dfg['year'], month=dfg['month'], day=1))

    dfg = dfg.drop(columns=['month', 'year']).set_index('Date', drop=True)

    return dfg

## Set Directories

In [3]:
# Root directory of the project: the parent of the current working directory
ROOT_DIR = os.path.normpath(os.getcwd() + os.sep + os.pardir)

# Folder with the manually downloaded CSV files.
# os.path.join uses the platform's separator instead of hard-coded backslashes;
# the empty last component keeps the trailing separator.
path = os.path.join(ROOT_DIR, 'data', 'raw', '')
print(path)

D:\gitProjects\WTI_Crude_Oil_Price_Prediction_Using_ML\data\raw\


In [4]:
# Use glob to get all the csv files in the folder.
# glob returns files in an unspecified order, so sort the list to keep the
# loading order (and therefore the column order of the merged table) stable.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
# Display the csv file list
display(csv_files)

['D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\copper_futures.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\dow_jones.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\gold_futures.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\nasdaq_comp.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\sp500.csv',
 'D:\\gitProjects\\WTI_Crude_Oil_Price_Prediction_Using_ML\\data\\raw\\usd_index.csv']

## Load all CSV files

In [5]:
def get_file_name(file_name):
    '''
    Return the bare name of a file given its full address.

    Directories are stripped (both "\\" and "/" separators are accepted) and
    everything from the first dot onwards is dropped, e.g.
    "D:\\data\\raw\\sp500.csv" -> "sp500".
    '''
    # Normalise Windows separators so the same split works for either path style
    name_ext = file_name.replace("\\", "/").split("/")[-1]
    name = name_ext.split(".")[0]

    return name

In [6]:
# Final list that stores one single-column DataFrame per CSV file
# (indexed by Date, column named after the file).
read_data = []

# loop over the list of csv files
for f in csv_files:

    series_name = get_file_name(f)

    # read the csv file; thousands=',' parses values such as "37,689.54"
    df_raw = pd.read_csv(f, sep=',', thousands=',')

    # Detect the file layout from its header. Using elif/else guarantees that
    # 'value' and 'format_date' are always set for THIS file and never carried
    # over from a previous iteration.
    if ' Close' in df_raw.columns:
        # layout with a ' Close' header (note the leading space); two-digit year
        df_raw.columns = ['Date', 'Open', 'High', 'Low', 'Close']
        value = 'Close'
        format_date = '%m/%d/%y'
    elif 'Price' in df_raw.columns:
        # layout with a 'Price' header; four-digit year
        df_raw.columns = ['Date', 'Price', 'Open','High','Low','Vol','Change']
        value = 'Price'
        format_date = '%m/%d/%Y'
    else:
        raise ValueError(
            f"Unrecognised column layout in {f!r}: {list(df_raw.columns)}"
        )

    # Build the result frame directly (parsed date + numeric value) instead of
    # assigning into a column slice, which triggers SettingWithCopyWarning.
    df = pd.DataFrame({
        'Date': pd.to_datetime(df_raw['Date'], format=format_date, errors="raise"),
        series_name: pd.to_numeric(df_raw[value]),
    })

    #set Date as index
    df = df.set_index('Date', drop=True)

    # print the success report
    print('Sucess:', series_name)
    display(df.head())

    read_data.append(df)
    


Sucess: copper_futures


Unnamed: 0_level_0,copper_futures
Date,Unnamed: 1_level_1
2019-10-28,2.683
2019-10-25,2.6755
2019-10-24,2.668
2019-10-23,2.6715
2019-10-22,2.633


Sucess: dow_jones


Unnamed: 0_level_0,dow_jones
Date,Unnamed: 1_level_1
2023-12-29,37689.54
2023-12-28,37710.1
2023-12-27,37656.52
2023-12-26,37545.33
2023-12-22,37385.97


Sucess: gold_futures


Unnamed: 0_level_0,gold_futures
Date,Unnamed: 1_level_1
2019-08-13,1514.1
2019-08-12,1517.2
2019-08-09,1508.5
2019-08-08,1509.5
2019-08-07,1519.6


Sucess: nasdaq_comp


Unnamed: 0_level_0,nasdaq_comp
Date,Unnamed: 1_level_1
2023-12-29,15011.35
2023-12-28,15095.14
2023-12-27,15099.18
2023-12-26,15074.57
2023-12-22,14992.97


Sucess: sp500


Unnamed: 0_level_0,sp500
Date,Unnamed: 1_level_1
2023-12-29,4769.83
2023-12-28,4783.35
2023-12-27,4781.58
2023-12-26,4774.75
2023-12-22,4754.63


Sucess: usd_index


Unnamed: 0_level_0,usd_index
Date,Unnamed: 1_level_1
2019-05-10,97.33
2019-05-09,97.37
2019-05-08,97.62
2019-05-07,97.63
2019-05-06,97.52


## Data

### Print the features and sizes

In [7]:
# One line per loaded series: position, column name and number of rows
for position, frame in enumerate(read_data):
    column_name = frame.columns[0]
    n_rows = len(frame)
    print(f"{position}: {column_name} (1*{n_rows})")

0: copper_futures (1*5000)
1: dow_jones (1*6037)
2: gold_futures (1*5000)
3: nasdaq_comp (1*6037)
4: sp500 (1*6037)
5: usd_index (1*5000)


### Merge all data frames (ordered merge on Date)

In [8]:
# Start from the monthly average of the first series, then merge the others in.
# monthly_average already returns a datetime 'Date' index, so no re-parsing of
# the index is needed before merging.
df = read_data[0]
dfg = monthly_average(df)

for df_temp in read_data[1:]:
    dfg_temp = monthly_average(df_temp)
    # Ordered merge on Date (outer join by default): months missing from a
    # series show up as NaN in that series' column.
    dfg = pd.merge_ordered(dfg, dfg_temp, on = 'Date')

## Check the final dataframe

In [9]:
# Show the merged monthly table rounded to 2 decimals; the rounded frame is
# only displayed, dfg itself is not reassigned.
display(dfg.round(2))

Unnamed: 0,Date,copper_futures,dow_jones,gold_futures,nasdaq_comp,sp500,usd_index
0,2000-01-01,0.86,11281.26,285.86,4013.49,1425.59,101.71
1,2000-02-01,0.83,10541.93,302.42,4410.87,1388.87,104.43
2,2000-03-01,0.81,10483.39,287.15,4802.99,1442.21,105.54
3,2000-04-01,0.78,10944.31,281.97,3863.64,1461.36,106.76
4,2000-05-01,0.83,10580.27,276.03,3528.42,1418.48,110.79
...,...,...,...,...,...,...,...
283,2023-08-01,,34880.66,,13743.21,4457.36,
284,2023-09-01,,34318.89,,13585.84,4409.09,
285,2023-10-01,,33319.49,,13199.43,4269.40,
286,2023-11-01,,34704.50,,13913.16,4460.06,


## Store the data

In [10]:
# Folder for the processed output: <ROOT_DIR>/data/interim/
# (the empty last component keeps the trailing separator)
save_dir = os.path.join(ROOT_DIR, 'data', 'interim', '')

# Create the folder if it does not exist yet, otherwise to_csv would fail
os.makedirs(save_dir, exist_ok=True)

# Set a file name
file_save = os.path.join(save_dir, 'offline_data.csv')
dfg.to_csv(file_save)

print('Save:\n', file_save)

Save:
 D:\gitProjects\WTI_Crude_Oil_Price_Prediction_Using_ML\data\interim\offline_data.csv
