# Read Offline Data from CSV

## Overview
Data manually downloaded from:

 - **WSJ.com**
  
 - **Investing.com**

  
### Offline Data

| No | Symbol | Index Name | Source | URL Address |
| :--- | :----- | :----- | :---- | :----- |
| 1 | COMEX_Gold | Gold: Future closing price | Investing | https://www.investing.com/commodities/gold-historical-data |
| 2 | DXY | US Dollar Index  | Investing | https://www.investing.com/indices/usdollar-historical-data |
| 3 | LME_Copper | Copper: Future closing price | Investing | https://www.investing.com/commodities/copper-historical-data |
| 4 | COMP | NASDAQ Composite index | WSJ | https://www.wsj.com/market-data/quotes/index/COMP/historical-prices |
| 5 | DJIA | Dow Jones Industrial Average | WSJ | https://www.wsj.com/market-data/quotes/index/DJIA/historical-prices |
| 6 | SPX | S&P 500 index | WSJ | https://www.wsj.com/market-data/quotes/index/SPX/historical-prices |


---
### Data fields

WSJ: `['Date', 'Open', 'High', 'Low', 'Close']`

Investing: `['Date', 'Price', 'Open', 'High', 'Low', 'Vol', 'Change']`

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
import os
import glob

import pickle

## Custom functions

In [2]:
def monthly_average(df):
    '''
    Average daily records into one row per calendar month.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index can be converted to a ``DatetimeIndex``; every
        column is averaged within each (year, month) group.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``Date`` (the first day of each month), sorted
        chronologically, holding the monthly mean of each input column.

    Notes
    -----
    The input frame is left untouched: the helper ``year``/``month`` columns
    are added to a copy, not to ``df`` itself.
    '''

    dates = pd.DatetimeIndex(df.index)

    # `assign` returns a new frame, so the caller's data is not mutated.
    keyed = df.assign(year=dates.year, month=dates.month)

    # groupby sorts the keys, so the result is in ascending (year, month)
    # order regardless of the order of the input rows.
    dfg = keyed.groupby(['year', 'month'], as_index=False).mean()

    # Label each month by its first day.
    dfg['Date'] = pd.to_datetime(dict(year=dfg['year'], month=dfg['month'], day=1))

    dfg = dfg.drop(columns=['month', 'year'])
    dfg = dfg.set_index('Date', drop=True)

    return dfg

## Set Directories

In [3]:
# Project root: the parent of the current working directory.
ROOT_DIR = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Folder with the manually downloaded CSV files: <ROOT_DIR>/data/raw/
# Built with os.path.join so it works with any OS path separator; the final
# empty component keeps the trailing separator the original string had.
path = os.path.join(ROOT_DIR, 'data', 'raw', '')
print(path)

D:\gitProjects\springboard_capstone_1\Springboard_Capstone_01\data\raw\


In [4]:
# Collect every CSV file in the raw-data folder.
# glob returns files in an OS-dependent order; sort them so that positions in
# the list (and in anything built from it) are the same on every run.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Display the CSV file list
display(csv_files)

['D:\\gitProjects\\springboard_capstone_1\\Springboard_Capstone_01\\data\\raw\\comp.csv',
 'D:\\gitProjects\\springboard_capstone_1\\Springboard_Capstone_01\\data\\raw\\copper_futures.csv',
 'D:\\gitProjects\\springboard_capstone_1\\Springboard_Capstone_01\\data\\raw\\djia.csv',
 'D:\\gitProjects\\springboard_capstone_1\\Springboard_Capstone_01\\data\\raw\\dxy.csv',
 'D:\\gitProjects\\springboard_capstone_1\\Springboard_Capstone_01\\data\\raw\\gold_futures.csv',
 'D:\\gitProjects\\springboard_capstone_1\\Springboard_Capstone_01\\data\\raw\\spx.csv']

## Load all CSV files

In [5]:
def get_file_name(file_name):
    '''
    Return the bare file name (no directory, no extension) of a path.

    Both Windows (``\\``) and POSIX (``/``) separators are accepted, so the
    result does not depend on the operating system that produced the path.
    As before, the name is cut at the first dot of the file name
    (``'comp.csv'`` -> ``'comp'``).

    Parameters
    ----------
    file_name : str
        Full or relative path to a file.

    Returns
    -------
    str
        The file name without its directory and extension.
    '''
    # Normalise separators first; otherwise a POSIX path is never split and a
    # dot inside a directory name would truncate the result.
    name_ext = file_name.replace("\\", "/").split("/")[-1]
    name = name_ext.split(".")[0]

    return name

In [6]:
# Final list holding one DataFrame per CSV file. Each frame is indexed by
# Date and has a single numeric column named after the file.
read_data = []

# loop over the list of csv files
for f in csv_files:

    # read the csv file ("," is also the thousands separator in the values)
    raw = pd.read_csv(f, sep=',', thousands=',')

    # Detect the source from its header and normalise the column names.
    if ' Close' in raw.columns:
        # WSJ layout: headers carry a leading space, hence ' Close'
        raw.columns = ['Date', 'Open', 'High', 'Low', 'Close']
        value = 'Close'
        format_date = '%m/%d/%Y'
    elif 'Price' in raw.columns:
        # Investing layout
        raw.columns = ['Date', 'Price', 'Open', 'High', 'Low', 'Vol', 'Change']
        value = 'Price'
        format_date = '%m/%d/%Y'
    else:
        # Fail loudly instead of reusing `value`/`format_date` left over from
        # the previous file (or hitting a NameError on the first one).
        raise ValueError(
            'Unrecognised CSV layout in {!r}: columns {}'.format(f, list(raw.columns))
        )

    name = get_file_name(f)

    # Convert to numeric / datetime; both conversions raise on bad input.
    values = pd.to_numeric(raw[value])
    dates = pd.to_datetime(raw['Date'], format=format_date, errors="raise")

    # Build a fresh frame rather than assigning into a column slice, which
    # avoids pandas' SettingWithCopy warning.
    df = pd.DataFrame({'Date': dates, name: values}).set_index('Date', drop=True)

    # print the success report
    print('Sucess:', name)
    display(df.head())

    read_data.append(df)
    


Sucess: comp


Unnamed: 0_level_0,comp
Date,Unnamed: 1_level_1
2022-11-03,10342.94
2022-11-02,10524.8
2022-11-01,10890.85
2022-10-31,10988.15
2022-10-28,11102.45


Sucess: copper_futures


Unnamed: 0_level_0,copper_futures
Date,Unnamed: 1_level_1
2022-11-01,3.4722
2022-10-01,3.375
2022-09-01,3.4125
2022-08-01,3.519
2022-07-01,3.575


Sucess: djia


Unnamed: 0_level_0,djia
Date,Unnamed: 1_level_1
2022-11-03,32001.25
2022-11-02,32147.76
2022-11-01,32653.2
2022-10-31,32732.95
2022-10-28,32861.8


Sucess: dxy


Unnamed: 0_level_0,dxy
Date,Unnamed: 1_level_1
2022-11-01,112.59
2022-10-01,111.54
2022-09-01,112.12
2022-08-01,108.85
2022-07-01,106.0


Sucess: gold_futures


Unnamed: 0_level_0,gold_futures
Date,Unnamed: 1_level_1
2022-11-01,1642.65
2022-10-01,1640.7
2022-09-01,1672.0
2022-08-01,1721.4
2022-07-01,1771.5


Sucess: spx


Unnamed: 0_level_0,spx
Date,Unnamed: 1_level_1
2022-11-03,3719.89
2022-11-02,3759.69
2022-11-01,3856.1
2022-10-31,3871.98
2022-10-28,3901.06


## Data

### Print the features and sizes

In [7]:
# One line per loaded frame: position, column name and number of rows.
for position, frame in enumerate(read_data):
    print(f"{position}: {frame.columns[0]} (1*{len(frame)})")

0: comp (1*5749)
1: copper_futures (1*275)
2: djia (1*5748)
3: dxy (1*275)
4: gold_futures (1*275)
5: spx (1*5748)


### Merge all data frames (ordered merge on `Date`)

In [8]:
# Reduce every series to monthly averages and combine them into one table.
# monthly_average already returns a datetime index named 'Date', so no further
# date conversion is needed before merging.
dfg = monthly_average(read_data[0])

for df_temp in read_data[1:]:
    dfg_temp = monthly_average(df_temp)
    # Ordered merge on 'Date', keeping months present in either side.
    dfg = pd.merge_ordered(dfg, dfg_temp, on='Date', how='outer')

## Check the final dataframe

In [9]:
# Show the merged monthly table rounded to two decimals.
# `round` returns a new frame, so `dfg` itself keeps full precision.
display(dfg.round(decimals=2))

Unnamed: 0,Date,comp,copper_futures,djia,dxy,gold_futures,spx
0,2000-01-01,4013.49,0.85,11281.26,105.13,286.20,1425.59
1,2000-02-01,4410.87,0.80,10541.93,105.92,295.40,1388.87
2,2000-03-01,4802.99,0.81,10483.39,105.44,281.40,1442.21
3,2000-04-01,3863.64,0.80,10944.31,110.14,276.00,1461.36
4,2000-05-01,3528.42,0.82,10580.27,108.74,274.80,1418.48
...,...,...,...,...,...,...,...
270,2022-07-01,11622.63,3.58,31535.32,106.00,1771.50,3911.73
271,2022-08-01,12570.26,3.52,33009.56,108.85,1721.40,4158.56
272,2022-09-01,11413.21,3.41,30649.56,112.12,1672.00,3850.52
273,2022-10-01,10801.49,3.38,30570.68,111.54,1640.70,3726.05


## Store the data

In [10]:
# Folder for intermediate outputs: <ROOT_DIR>/data/interim/
# Built with os.path.join so it works with any OS path separator; the final
# empty component keeps the trailing separator.
save_dir = os.path.join(ROOT_DIR, 'data', 'interim', '')

# Make sure the folder exists; to_csv does not create missing directories.
os.makedirs(save_dir, exist_ok=True)

# Set a file name
file_save = save_dir + 'offline_data_raw.csv'
dfg.to_csv(file_save)

print('Save:\n', file_save)

Save:
 D:\gitProjects\springboard_capstone_1\Springboard_Capstone_01\data\interim\offline_data_raw.csv
