#### 지표 UPDATE 
   - 주요 주식시장 지수 : spx, dji, ixic, kospi, kosdaq
   - 반도체지수, VIX 지수: sox, vix
   - 미국 채권 지수, 한국 채권 지수
   - 원화 환율
   - 선물 : 미국 3대 지수, wti, 달러지수 선물

In [1]:
# from bs4 import BeautifulSoup as bs
import requests
import datetime, time
import pickle

import pandas as pd
import numpy as np

import os
import shutil

import matplotlib.pyplot as plt

In [2]:
# !pip install cfscrape  # module used to work around 403 Forbidden / Cloudflare errors
import cfscrape
scraper = cfscrape.create_scraper()
# From here on, use `scraper` instead of `requests` wherever a 403 error occurs.

In [3]:
# Request headers passed as `headers=headers` to the HTTP calls in this file.
# NOTE(review): the backslash continuation sits inside the string literal, so the
# leading spaces of the second line become part of the User-Agent value - confirm
# the target sites accept it.
headers = {'User-Agent': 'Mozilla/6.0 (Macintosh; Intel Mac OS X 10_11_5) \
           AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

In [4]:
def correct_date_format(df):
    """Parse the string values of ``df['date']`` into ``datetime.datetime``.

    The scraped pages render dates differently depending on the site and
    locale, so each known layout is tried in turn and the first one that
    parses the whole column is applied.

    Args:
        df: DataFrame with a ``'date'`` column of date strings. It is
            modified in place.

    Returns:
        The same ``df`` object, with ``'date'`` converted.

    Raises:
        ValueError, TypeError: if none of the known layouts parses the
            column; the error from the last layout tried is re-raised and
            ``df`` is left untouched.
    """
    date_formats = (
        "%b %d, %Y",
        "%m/%d/%Y",
        "%Y- %m- %d",
        "%Y년 %m월 %d일",
    )

    last_error = None
    for date_format in date_formats:
        try:
            parsed = df['date'].apply(
                lambda x, fmt=date_format: datetime.datetime.strptime(x, fmt)
            )
        except (ValueError, TypeError) as err:
            # Only parse failures mean "try the next layout"; anything else
            # (KeyboardInterrupt, missing column, ...) must propagate.
            last_error = err
            continue
        df['date'] = parsed
        return df

    raise last_error

In [5]:
def create_directory(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Failures are reported on stdout instead of being raised, so callers
    continue even when the directory could not be created.

    Args:
        directory: Path of the directory to create.
    """
    try:
        # exist_ok avoids the race between a separate exists() check and
        # the creation itself.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print("Error: Failed to create the directory.")

In [6]:
def make_pickle(df, pkl_name):
    """Save ``df`` under ``data/common_pkl/`` as a pickle and as a CSV copy.

    The CSV file name is ``pkl_name`` with its extension replaced by
    ``.csv`` (e.g. ``spx.pkl`` -> ``spx.csv``). Only the extension is
    swapped, so a "pkl" elsewhere in the name is preserved and a name
    without a ``.pkl`` suffix can never make the CSV overwrite the pickle.

    Args:
        df: DataFrame to persist.
        pkl_name: File name of the pickle, e.g. ``'spx.pkl'``.
    """
    pkl_directory = 'data/common_pkl/'
    try:
        os.makedirs(pkl_directory, exist_ok=True)
    except OSError:
        print("Error: Failed to create the directory.")

    csv_name = os.path.splitext(pkl_name)[0] + '.csv'

    # Saved as e.g. data/common_pkl/spx.pkl plus data/common_pkl/spx.csv
    df.to_pickle(pkl_directory + pkl_name)
    df.to_csv(pkl_directory + csv_name)

In [7]:
def read_pickle(pkl_name):
    """Load and return the DataFrame stored at ``data/common_pkl/<pkl_name>``."""
    return pd.read_pickle('data/common_pkl/' + pkl_name)

In [8]:
def _read_first_table(html, table_attrs):
    """Return the first table in ``html`` matching any of ``table_attrs``, tried in order."""
    from io import StringIO

    last_error = None
    for attrs in table_attrs:
        try:
            # Wrap the markup in StringIO: recent pandas deprecates passing
            # literal HTML strings to read_html.
            return pd.read_html(StringIO(html), attrs=attrs, flavor=["lxml", "bs4"])[0]
        except ValueError as err:  # read_html found no table for this selector
            last_error = err
    raise last_error


def get_data(url, column, max_attempts=5, delay=1):
    """Download a historical-data table and return it as a tidy DataFrame.

    The page is fetched through the module-level ``scraper`` using the
    module-level ``headers``. The table is looked up first by
    ``id="curr_table"`` and then by ``data-test="historical-data-table"``.
    A failed attempt (request error or no matching table) is retried up to
    ``max_attempts`` times; every attempt is followed by a ``delay`` second
    pause.

    Args:
        url: Page URL containing the table.
        column: Column names to assign to the scraped table, in table order;
            the first one must be ``'date'`` for the date conversion.
        max_attempts: Maximum number of fetch/parse attempts.
        delay: Seconds to sleep after each attempt.

    Returns:
        DataFrame with ``column`` as its columns, ``'date'`` parsed by
        ``correct_date_format``, sorted ascending by the first column and
        re-indexed 0..n-1.

    Raises:
        ValueError: if no attempt produced a table (the last underlying
            error is chained), or if ``column`` does not match the table.
    """
    table_attrs = (
        {"id": "curr_table"},
        {"data-test": "historical-data-table"},
    )

    last_error = None
    for _ in range(max_attempts):
        try:
            res = scraper.get(url, headers=headers)
            df = _read_first_table(res.text, table_attrs)
            break
        except Exception as err:  # retry boundary: remember the cause and try again
            last_error = err
        finally:
            time.sleep(delay)
    else:
        raise ValueError('The url request is delaying') from last_error

    df.columns = column
    correct_date_format(df)  # converts df['date'] in place
    df.sort_values(by=[df.columns[0]], inplace=True)
    df.index = np.arange(0, len(df))  # reset to an ascending 0..n-1 index

    return df

In [9]:
def concat_df(df_o, df):
    """Append ``df`` to ``df_o`` and return the de-duplicated, sorted result.

    When both frames contain the same ``'date'``, the row that comes last
    (i.e. the one from ``df``) is kept. The result is sorted ascending by
    its first column and re-indexed 0..n-1. Neither input is modified.
    """
    merged = (
        pd.concat([df_o, df], ignore_index=True)
        .drop_duplicates(subset=['date'], keep='last')
    )
    merged = merged.sort_values(by=[merged.columns[0]])
    merged.index = np.arange(0, len(merged))  # reset to an ascending 0..n-1 index
    return merged

In [10]:
def update_pickle(df, pkl_name):
    """Merge ``df`` into the stored pickle ``pkl_name`` and save the result.

    The stored frame is loaded with ``read_pickle``, combined with ``df``
    by ``concat_df``, and written back with ``make_pickle``.
    """
    make_pickle(concat_df(read_pickle(pkl_name), df), pkl_name)

In [11]:
# S&P 500 (us-spx-500 page): fetch the historical-data table and merge it into spx.pkl.
# `spx` lists the column names applied positionally to the scraped table.
spx_url = 'https://kr.investing.com/indices/us-spx-500-historical-data'
spx = ['date', 'spx', 'open', 'high', 'low', 'volume', 'spx_cr']
pkl_name = 'spx.pkl'
df = get_data(spx_url,spx)

update_pickle(df, pkl_name)

In [12]:
# Dow Jones (us-30 page): fetch the historical-data table and merge it into dji.pkl.
# `dji` lists the column names applied positionally to the scraped table.
dji_url = 'https://kr.investing.com/indices/us-30-historical-data'
dji = ['date', 'dji', 'open', 'high', 'low', 'volume', 'dji_cr']
pkl_name = 'dji.pkl'
df = get_data(dji_url,dji)

update_pickle(df, pkl_name)

In [13]:
# Nasdaq Composite: fetch the historical-data table and merge it into nas.pkl.
# `ixic` lists the column names applied positionally to the scraped table.
nas_url = 'https://kr.investing.com/indices/nasdaq-composite-historical-data'
ixic = ['date', 'ixic', 'open', 'high', 'low', 'volume', 'ixic_cr']
pkl_name = 'nas.pkl'
df = get_data(nas_url,ixic)

update_pickle(df, pkl_name)

In [14]:
# KOSPI: fetch the historical-data table and merge it into kospi.pkl.
# `kospi` lists the column names applied positionally to the scraped table.
kospi_url = 'https://kr.investing.com/indices/kospi-historical-data'
kospi = ['date', 'kospi', 'open', 'high', 'low', 'volume', 'kospi_cr']
pkl_name = 'kospi.pkl'
df = get_data(kospi_url,kospi)

update_pickle(df, pkl_name)

In [15]:
# KOSDAQ: fetch the historical-data table and merge it into kosdaq.pkl.
# `kosdaq` lists the column names applied positionally to the scraped table.
kosdaq_url = 'https://kr.investing.com/indices/kosdaq-historical-data'
kosdaq = ['date', 'kosdaq', 'open', 'high', 'low', 'volume', 'kosdaq_cr']
pkl_name = 'kosdaq.pkl'
df = get_data(kosdaq_url,kosdaq)

update_pickle(df, pkl_name)

In [16]:
# PHLX Semiconductor index: fetch the historical-data table and merge it into sox.pkl.
# `sox` lists the column names applied positionally to the scraped table.
sox_url = 'https://kr.investing.com/indices/phlx-semiconductor-historical-data'
sox = ['date', 'sox', 'open', 'high', 'low', 'volume', 'sox_cr']
pkl_name = 'sox.pkl'
df = get_data(sox_url,sox)

update_pickle(df, pkl_name)

In [17]:
# S&P 500 volatility index (VIX): fetch the historical-data table and merge it into vix.pkl.
# `vix` lists the column names applied positionally to the scraped table.
vix_url = 'https://kr.investing.com/indices/volatility-s-p-500-historical-data'
vix = ['date', 'vix', 'open', 'high', 'low', 'volume', 'vix_cr']
pkl_name = 'vix.pkl'
df = get_data(vix_url,vix)

update_pickle(df, pkl_name)

In [18]:
# U.S. 10-year bond yield: fetch the historical-data table and merge it into us_10yr_bond.pkl.
# The bond column lists have six names (no volume column), unlike the index cells.
us_bond_10yr_url = 'https://kr.investing.com/rates-bonds/u.s.-10-year-bond-yield-historical-data'
us_10yr = ['date', 'bond_usa_10', 'open', 'high', 'low', 'bond_usa_10_cr']
pkl_name = 'us_10yr_bond.pkl'
df = get_data(us_bond_10yr_url,us_10yr)

update_pickle(df, pkl_name)

In [19]:
# U.S. 2-year bond yield: fetch the historical-data table and merge it into us_2yr_bond.pkl.
# Six column names (no volume column).
us_bond_2yr_url = 'https://kr.investing.com/rates-bonds/u.s.-2-year-bond-yield-historical-data'
us_2yr = ['date', 'bond_usa_2', 'open', 'high', 'low', 'bond_usa_2_cr']
pkl_name = 'us_2yr_bond.pkl'
df = get_data(us_bond_2yr_url,us_2yr)

update_pickle(df, pkl_name)

In [20]:
# U.S. 3-month bond yield: fetch the historical-data table and merge it into us_3mon_bond.pkl.
# Six column names (no volume column).
us_bond_3mon_url = 'https://kr.investing.com/rates-bonds/u.s.-3-month-bond-yield-historical-data'
us_3mon = ['date', 'bond_usa_3m', 'open', 'high', 'low', 'bond_usa_3m_cr']
pkl_name = 'us_3mon_bond.pkl'
df = get_data(us_bond_3mon_url,us_3mon)

update_pickle(df, pkl_name)

In [21]:
# South Korea 10-year bond yield: fetch the historical-data table and merge it into kor_10yr_bond.pkl.
# Six column names (no volume column).
kor_bond_10yr_url = 'https://kr.investing.com/rates-bonds/south-korea-10-year-bond-yield-historical-data'
kor_10yr = ['date', 'bond_kor_10', 'open', 'high', 'low', 'bond_kor_10_cr']
pkl_name = 'kor_10yr_bond.pkl'
df = get_data(kor_bond_10yr_url,kor_10yr)

update_pickle(df, pkl_name)

In [22]:
# South Korea 2-year bond yield: fetch the historical-data table and merge it into kor_2yr_bond.pkl.
# Six column names (no volume column).
kor_bond_2yr_url = 'https://kr.investing.com/rates-bonds/south-korea-2-year-bond-yield-historical-data'
kor_2yr = ['date', 'bond_kor_2', 'open', 'high', 'low','bond_kor_2_cr']
pkl_name = 'kor_2yr_bond.pkl'
df = get_data(kor_bond_2yr_url,kor_2yr)

update_pickle(df, pkl_name)

In [23]:
# USD/KRW exchange rate: fetch the historical-data table and merge it into krw_rate.pkl.
# `krw_rate` lists the column names applied positionally to the scraped table.
krw_rate_url = 'https://kr.investing.com/currencies/usd-krw-historical-data'
krw_rate = ['date', 'krw', 'open', 'high', 'low', 'vol', 'krw_cr']
pkl_name = 'krw_rate.pkl'
df = get_data(krw_rate_url,krw_rate)

update_pickle(df, pkl_name)

In [24]:
# Nasdaq futures: fetch the historical-data table and merge it into ixic_future.pkl.
# NOTE(review): the URL is the nq-100 futures page while the columns are named
# ixic_f - confirm the naming is intended.
nas_futures_url = 'https://kr.investing.com/indices/nq-100-futures-historical-data'
ixic_future = ['date', 'ixic_f', 'open', 'high', 'low', 'volume', 'ixic_f_cr']
pkl_name = 'ixic_future.pkl'
df = get_data(nas_futures_url,ixic_future)

update_pickle(df, pkl_name)

In [25]:
# S&P 500 futures: fetch the historical-data table and merge it into snp_future.pkl.
# `snp_future` lists the column names applied positionally to the scraped table.
snp_futures_url = 'https://kr.investing.com/indices/us-spx-500-futures-historical-data'
snp_future = ['date', 'spx_f', 'open', 'high', 'low', 'volume', 'spx_f_cr']
pkl_name = 'snp_future.pkl'
df = get_data(snp_futures_url,snp_future)

update_pickle(df, pkl_name)

In [26]:
# Dow Jones (us-30) futures: fetch the historical-data table and merge it into dji_future.pkl.
# `dji_future` lists the column names applied positionally to the scraped table.
dow_futures_url = 'https://kr.investing.com/indices/us-30-futures-historical-data'
dji_future = ['date', 'dji_f', 'open', 'high', 'low', 'volume', 'dji_f_cr']
pkl_name = 'dji_future.pkl'
df = get_data(dow_futures_url,dji_future)

update_pickle(df, pkl_name)

In [27]:
# Crude oil (stored as wti): fetch the historical-data table and merge it into wti_future.pkl.
# `wti_future` lists the column names applied positionally to the scraped table.
wti_futures_url = 'https://kr.investing.com/commodities/crude-oil-historical-data'
wti_future = ['date', 'wti', 'open', 'high', 'low', 'volume', 'wti_cr']
pkl_name = 'wti_future.pkl'
df = get_data(wti_futures_url,wti_future)

update_pickle(df, pkl_name)

In [28]:
# U.S. dollar index: fetch the historical-data table and merge it into dxy_future.pkl.
# `dxy_future` lists the column names applied positionally to the scraped table.
dollar_index_url = 'https://kr.investing.com/currencies/us-dollar-index-historical-data'
dxy_future = ['date', 'dxy', 'open', 'high', 'low', 'volume', 'dxy_cr']
pkl_name = 'dxy_future.pkl'
df = get_data(dollar_index_url,dxy_future)

update_pickle(df, pkl_name)

In [29]:
# def make_datetime_series_df(df, start_date, end_date):
#     # make dataframe with no empty date index with input df.
#     # start_date =  '2021-01-01' str type.
#     # df.index = datetime series
#     # df.columns = ['a', 'b',,,,]
#     date_range_ts = pd.date_range(start=start_date, end=end_date)
#     df_date = pd.DataFrame(columns = df.columns)
#     df_date.insert(0, 'date', date_range_ts)
#     df_date.replace(np.nan, '', inplace=True) # Nan을 빈 칸으로 대체
#     df_date.set_index('date', inplace=True)
#     df_date.update(df)
#     return df_date

#### get and append cpi

In [30]:
# CPI economic-calendar page, the column names applied to its first table,
# and the pickle the results are merged into.
# NOTE(review): 'none' presumably labels an unused trailing column - confirm against the page.
cpi_url = 'https://www.investing.com/economic-calendar/cpi-733'
cpi_column = ['date', 'time', 'cpi', 'cpi_anticipated', 'cpi_previous', 'none']
pkl_name = 'cpi.pkl'

In [31]:
# Scrape the first table on the CPI page and merge it into the CPI pickle.
res = scraper.get(cpi_url, headers=headers)
df = pd.read_html(res.text, flavor=["lxml", "bs4"])[0]
df.columns = cpi_column
# NOTE(review): 'time' is parsed here but dropped by the column selection
# below - confirm whether the parse is still needed.
df['time'] = df['time'].apply(lambda x : datetime.datetime.strptime(x, "%H:%M").time())
# Only the first 12 characters are parsed as "%b %d, %Y"; anything after is ignored.
df['date'] = df['date'].apply(lambda x : datetime.datetime.strptime(x[:12], "%b %d, %Y"))

# Select and sort in one expression so the result is a fresh frame instead of
# sorting a column subset in place (which can trigger SettingWithCopyWarning).
df = df[['date', 'cpi', 'cpi_anticipated', 'cpi_previous']].sort_values(by=['date'])

update_pickle(df, pkl_name)

#### get and append fear and greed
##### 2020년 9월 21일부터 2021년 1월 21일까지의 데이터는 이상 데이터이므로 나중에 수정해야 함.

In [32]:
import pytz, json

In [33]:
def convert_timestamp_to_date(x):
    """Convert a millisecond Unix timestamp to its UTC calendar date.

    Args:
        x: Timestamp in milliseconds since the Unix epoch.

    Returns:
        ``datetime.date`` of that instant in UTC (no local-timezone shift).
    """
    # The standard-library UTC tzinfo gives the same result as pytz.utc here.
    return datetime.datetime.fromtimestamp(x / 1000, tz=datetime.timezone.utc).date()

In [34]:
# Used to add a rating column to the historical data (fear_greed_old_to_20200918.pkl).
# It was needed only once and is not used afterwards.

def convert_to_rating(x):
    """Map a fear-and-greed index value to its rating label.

    Bands (upper bound exclusive except the last):
    ``< 25`` extreme fear, ``< 45`` fear, ``< 55`` neutral, ``< 75`` greed,
    ``<= 100`` extreme greed.

    Args:
        x: Index value.

    Returns:
        The rating label as a string.

    Raises:
        ValueError: if ``x`` is greater than 100 or is NaN (previously this
            surfaced as an UnboundLocalError).
    """
    bands = (
        (25, 'extreme fear'),
        (45, 'fear'),
        (55, 'neutral'),
        (75, 'greed'),
    )
    for upper_bound, rating in bands:
        if x < upper_bound:
            return rating
    if x <= 100:
        return 'extreme greed'

    raise ValueError("fear and greed index out of range: {!r}".format(x))

In [35]:
# Compute the start of the collection window (30 days before today).
today = datetime.date.today()
today_p = today.strftime('%Y%m%d')  # today's date as YYYYMMDD, taken before `today` is shifted
diff_days = datetime.timedelta(days=30)
today = today - diff_days  # NOTE: from here on `today` holds the date 30 days ago
start_date = today.strftime('%Y-%m-%d')  # collect data starting 30 days back

In [36]:
# Base endpoint for the fear-and-greed graph data (the start date is appended
# as a path segment when the request is made) and the pickle to merge into.
url = "https://production.dataviz.cnn.io/index/fearandgreed/graphdata"
pkl_name = 'fear_greed.pkl'
# start_date = '2020-07-15'

In [37]:
# Download the fear-and-greed history starting at start_date and shape it
# into a (date, fg_index, rating) frame.
r = requests.get("{}/{}".format(url, start_date), headers=headers)
r.raise_for_status()  # surface HTTP errors here instead of as a JSON decoding failure
data = r.json()

fg_data = data['fear_and_greed_historical']['data']
df = pd.DataFrame(fg_data)

# NOTE(review): assumes each record has exactly three fields in this order - confirm.
df.columns = ['date', 'fg_index', 'rating']
df['date'] = df['date'].apply(convert_timestamp_to_date)  # ms timestamp -> UTC date
df['fg_index'] = df['fg_index'].apply(round)

df.sort_values(by=[df.columns[0]], inplace=True)
# De-duplicate before renumbering so the final index has no gaps.
df.drop_duplicates(subset=['date'], inplace=True)
df.index = np.arange(0, len(df))  # reset to an ascending 0..n-1 index

In [38]:
# Merge the freshly downloaded rows into the pickle named by pkl_name.
update_pickle(df, pkl_name)

  df_o.drop_duplicates(subset=['date'], keep='last', inplace=True)
  df_o.sort_values(by=[df_o.columns[0]], inplace=True)


#### get and append gold price

In [39]:
# Daily gold price history page and the pickle the results are merged into.
gold_url = 'https://www.usagold.com/daily-gold-price-history/'
pkl_name = 'gold.pkl'

In [40]:
# Download the gold price page and parse its HTML tables; only the first table is used.
res = requests.get(gold_url, headers=headers)
df = pd.read_html(res.text, flavor=["lxml", "bs4"])
df = df[0].drop(0) # delete empty first row
df.columns = ['date', 'gold']
# Dates on this page are parsed as day, abbreviated month name, year ("%d %b %Y").
df['date'] = df['date'].apply(lambda x : datetime.datetime.strptime(x, "%d %b %Y"))
df.sort_values(by=['date'], inplace=True)
df.drop_duplicates(subset=['date'], inplace=True)  # keeps the first row for each date

In [41]:
# Merge the parsed gold prices into the pickle named by pkl_name.
update_pickle(df, pkl_name)

### fed 금리 get append

In [42]:
# Interest-rate-decision calendar page, the column names applied to its first
# table, and the pickle the results are merged into.
# NOTE(review): 'none' presumably labels an unused trailing column - confirm against the page.
interest_url = 'https://www.investing.com/economic-calendar/interest-rate-decision-168/'
interest_column = ['date', 'time', 'fed_rate', 'fed_rate_fore', 'fed_rate_prev', 'none']
pkl_name = 'fed_rate.pkl'

In [43]:
# Scrape the first table on the interest-rate-decision page.
res = scraper.get(interest_url, headers=headers)
df = pd.read_html(res.text, flavor=["lxml", "bs4"])[0]
df.columns = interest_column

# Replace missing values with empty strings before the string parsing below.
df.replace(np.nan, '', inplace=True)

# Only the first 12 characters are parsed as "%b %d, %Y"; anything after is ignored.
df['date'] = df['date'].apply(lambda x : datetime.datetime.strptime(x[:12], "%b %d, %Y"))
# NOTE(review): an empty 'time' value (from the NaN replacement above) would make
# strptime raise ValueError - confirm the column is always populated.
df['time'] = df['time'].apply(lambda x : datetime.datetime.strptime(x, "%H:%M").time())
df.sort_values(by=['date'], inplace=True)
df.drop_duplicates(subset=['date'], inplace=True)  # keeps the first row for each date

In [44]:
# Merge the parsed rate decisions into the pickle named by pkl_name.
update_pickle(df, pkl_name)

### 한국은행 금리 get append

In [45]:
# Bank of Korea base-rate list page and the pickle the results are merged into.
kor_url = 'https://www.bok.or.kr/portal/singl/baseRate/list.do?dataSeCd=01&menuNo=200643'
pkl_name = 'bok_rate.pkl'

In [46]:
# Download the page and take the first table whose class attribute is "fixed".
res = requests.get(kor_url, headers=headers)
df = pd.read_html(res.text, attrs = {'class': 'fixed'}, flavor=["lxml", "bs4"])[0]

In [47]:
# The table has separate year and month/day columns; join them into one string
# and parse it with "%Y%m월 %d일" to build a proper 'date' column.
df.columns=  ['release_yr', 'release_date', 'bok_rate']
df_date_temp = df['release_yr'].astype('str')+df['release_date']
df['date'] = df_date_temp.apply(lambda x : datetime.datetime.strptime(x, "%Y%m월 %d일"))
df.sort_values(by=['date'], inplace=True)
df.drop_duplicates(subset=['date'], inplace=True)  # keeps the first row for each date

df = df[['date', 'bok_rate']] # leave only valid columns

# Merge the parsed base rates into the pickle named by pkl_name.
update_pickle(df, pkl_name)