# 크롤링

In [1]:
# 라이브러리 불러오기
import yfinance as yf
import pandas as pd
import cloudscraper

In [2]:
def convert(value):
    """Convert an abbreviated number string such as ``"1.5K"`` to a float.

    Strings ending in ``K``, ``M`` or ``B`` are scaled by 1e3, 1e6 or 1e9
    respectively; thousands separators (``,``) and surrounding whitespace are
    ignored while parsing.

    Args:
        value: A cell value, typically from a scraped "Vol." column.

    Returns:
        The scaled ``float`` for suffixed strings. Anything else (non-string
        values, or strings without a recognised suffix) is returned unchanged.

    Raises:
        ValueError: If the text before the suffix is not a valid number.
    """
    if not isinstance(value, str):  # only strings carry a suffix
        return value

    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    text = value.strip().replace(",", "")
    # Slice instead of index so an empty string simply finds no suffix.
    multiplier = multipliers.get(text[-1:])
    if multiplier is None:
        return value
    return float(text[:-1]) * multiplier

In [3]:
def update_data(ticker_name):
    """Refresh ``<ticker_name>.csv`` with daily price history.

    For ``"^VNINDEX.VN"`` the first table of the investing.com historical-data
    page is scraped and merged into the existing CSV (rows already stored win
    over freshly scraped rows with the same date). Every other ticker is
    downloaded through ``yfinance`` and the CSV is overwritten.

    Args:
        ticker_name: Ticker symbol; also used as the CSV file stem.

    Returns:
        None. The result is written to ``<ticker_name>.csv``.
    """
    csv_path = ticker_name + ".csv"

    if ticker_name != "^VNINDEX.VN":
        data = yf.download(ticker_name)
        data = data.sort_index()
        # "Adj Close" is not always present in the downloaded frame; dropping
        # it must not fail when it is missing.
        data = data.drop(columns=["Adj Close"], errors="ignore")
        data.to_csv(csv_path, encoding="utf-8-sig")
        return None

    from io import BytesIO  # local import: only this branch needs it

    scraper = cloudscraper.create_scraper()
    html = scraper.get(
        "https://www.investing.com/indices/vn-historical-data"
    ).content

    # read_html raises ValueError when the page has no table, so an empty
    # result has to be detected through the exception, not the list length.
    # The bytes are wrapped in a buffer because passing literal HTML is
    # deprecated.
    try:
        dfs = pd.read_html(BytesIO(html))
    except ValueError:
        dfs = []

    if not dfs:
        print("데이터를 찾을 수 없습니다.")
        return None

    df = dfs[0]
    df = df.drop(columns=["Change %"])
    df = df.fillna(0)
    df = df.rename(columns={"Vol.": "Volume", "Price": "Close"})
    df["Volume"] = df["Volume"].apply(convert)
    df["Date"] = pd.to_datetime(df["Date"], format="%m/%d/%Y")
    df.set_index("Date", inplace=True)

    try:
        data = pd.read_csv(csv_path, index_col="Date")
    except FileNotFoundError:
        # First run: nothing stored yet, start from the scraped rows.
        merged_data = df
    else:
        data.index = pd.to_datetime(data.index)
        data = data.fillna(0)
        merged_data = pd.concat([data, df])

    # Scraped prices may be strings with thousands separators.
    for col in ["Open", "High", "Close", "Low"]:
        merged_data[col] = pd.to_numeric(
            merged_data[col].astype(str).str.replace(",", ""), errors="coerce"
        )

    # Remove duplicate dates, keeping the first occurrence (the stored row).
    # NOTE(review): a row saved mid-session is therefore never refreshed by a
    # later scrape - confirm this is intended.
    merged_data = merged_data.loc[~merged_data.index.duplicated(keep="first")]

    # Sort by date and store dates as plain strings.
    merged_data = merged_data.sort_index()
    merged_data.index = merged_data.index.astype(str)
    merged_data = merged_data.fillna(0)

    merged_data.to_csv(csv_path, index=True)
    return None


In [4]:
def output_data(ticker_name):
    """Load ``<ticker_name>.csv`` and return it indexed by its ``Date`` column.

    If the last row has ``Open == 0`` it is dropped (presumably a placeholder
    row for a session without prices yet - confirm against the data source).

    Args:
        ticker_name: Ticker symbol / CSV file stem.

    Returns:
        pandas.DataFrame indexed by ``Date``.
    """
    data = pd.read_csv(ticker_name + ".csv").set_index("Date")
    # A header-only file has no last row; iloc[-1] would raise IndexError.
    if not data.empty and data["Open"].iloc[-1] == 0:
        data = data.iloc[:-1]
    return data

In [5]:
# Refresh the stored CSV for ticker ^KS11, then show its contents.
update_data("^KS11")
output_data('^KS11')

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1996-12-11,705.989990,709.479980,704.429993,704.679993,28000
1996-12-12,705.109985,706.010010,688.739990,689.380005,25900
1996-12-13,690.440002,695.719971,677.640015,689.070007,26500
1996-12-16,686.969971,686.969971,667.710022,673.919983,22800
1996-12-17,675.349976,680.090027,660.390015,663.349976,31600
...,...,...,...,...,...
2024-01-03,2643.540039,2643.719971,2607.310059,2607.310059,463100
2024-01-04,2592.439941,2602.639893,2580.090088,2587.020020,770200
2024-01-05,2586.889893,2592.290039,2572.600098,2578.080078,520500
2024-01-08,2584.229980,2591.679932,2566.340088,2567.820068,320100


In [6]:
# Refresh the stored CSV for ticker 055550.KS, then show its contents.
update_data("055550.KS")
output_data('055550.KS')

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-04,18688.708984,20234.212891,18308.277344,19259.357422,197145699
2000-01-05,19021.585938,19592.234375,18070.507812,18664.931641,186396333
2000-01-06,18807.593750,19497.126953,16406.119141,17119.427734,184079807
2000-01-07,18546.046875,18878.923828,17618.744141,18070.507812,256406581
2000-01-10,18783.816406,19211.802734,18546.046875,18926.478516,988218404
...,...,...,...,...,...
2024-01-03,38800.000000,39200.000000,37900.000000,37900.000000,931521
2024-01-04,37950.000000,38000.000000,37050.000000,37300.000000,887191
2024-01-05,37100.000000,37300.000000,36650.000000,36850.000000,671689
2024-01-08,37100.000000,37350.000000,36750.000000,36950.000000,356649


In [7]:
# ^VNINDEX.VN takes the scraping branch of update_data (investing.com)
# instead of the yfinance download used for the other tickers.
update_data("^VNINDEX.VN")
output_data('^VNINDEX.VN')

Unnamed: 0_level_0,Close,Open,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-07-31,101.55,101.55,101.55,101.55,10.0
2000-08-02,103.38,103.38,103.38,103.38,0.0
2000-08-04,105.20,105.20,105.20,105.20,0.0
2000-08-07,106.92,106.92,106.92,106.92,10.0
2000-08-09,108.64,108.64,108.64,108.64,20.0
...,...,...,...,...,...
2024-01-03,1144.17,1131.72,1144.17,1128.32,719930.0
2024-01-04,1147.43,1145.49,1148.93,1144.32,248380.0
2024-01-05,1154.68,1150.72,1155.84,1149.08,803270.0
2024-01-08,1160.19,1154.68,1162.56,1154.68,905410.0


In [8]:
# Refresh the stored CSV for ticker VCB.VN, then show its contents.
update_data("VCB.VN")
output_data('VCB.VN')

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-06-30,22898.183594,22898.183594,22898.183594,22898.183594,770543
2009-07-01,24043.093750,24043.093750,22707.365234,23089.001953,16372619
2009-07-02,22707.365234,22898.183594,21944.093750,22134.912109,3971498
2009-07-03,21562.457031,21753.275391,21371.638672,21371.638672,2357526
2009-07-06,21371.638672,22325.730469,21371.638672,22325.730469,4118416
...,...,...,...,...,...
2024-01-03,83500.000000,84500.000000,82800.000000,84500.000000,1521100
2024-01-04,84500.000000,86200.000000,84000.000000,85900.000000,2734900
2024-01-05,85900.000000,86200.000000,85700.000000,86200.000000,1328400
2024-01-08,86300.000000,86800.000000,86300.000000,86800.000000,1652800


### 환율(USD_VND)

#usd_vnd1 = pd.read_csv('USD_VND Historical Data.csv')
#usd_vnd2 = pd.read_csv('USD_VND Historical Data (1).csv')
#usd_vnd = pd.concat([usd_vnd1,usd_vnd2])
#usd_vnd['Date'] = pd.to_datetime(usd_vnd['Date'], format='%m/%d/%Y').dt.date
#usd_vnd = usd_vnd.sort_values('Date')
#usd_vnd = usd_vnd.set_index('Date')
#usd_vnd.to_csv('usd_vnd.csv', index=True, encoding='utf-8-sig')

# NOTE(review): get_data_investing_exchange is not defined in the visible part
# of this notebook - confirm it is defined before this cell runs. Presumably
# it replaces the manual CSV merge commented out above.
get_data_investing_exchange('https://www.investing.com/currencies/usd-vnd-historical-data', 'usd_vnd')

### 환율(USD_KRW)

#usd_krw1 = pd.read_csv('USD_KRW Historical Data.csv')
#usd_krw2 = pd.read_csv('USD_KRW Historical Data (1).csv')
#usd_krw = pd.concat([usd_krw1,usd_krw2])
#usd_krw['Date'] = pd.to_datetime(usd_krw['Date'], format='%m/%d/%Y').dt.date
#usd_krw = usd_krw.sort_values('Date')
#usd_krw = usd_krw.set_index('Date')
#usd_krw.to_csv('usd_krw.csv', index=True, encoding='utf-8-sig')

# NOTE(review): get_data_investing_exchange is not defined in the visible part
# of this notebook - confirm it is defined before this cell runs. Presumably
# it replaces the manual CSV merge commented out above.
get_data_investing_exchange('https://www.investing.com/currencies/usd-krw-historical-data', 'usd_krw')

## 출력

In [9]:
def country_output(ticker_name):
    """Summarise the most recent statistics stored in ``<ticker_name>.csv``.

    Rows containing missing values are ignored. All "today"/"previous" figures
    are taken from the last row of the file.

    Args:
        ticker_name: Ticker symbol / CSV file stem.

    Returns:
        dict with these keys:
            previous_close, today_open, today_volume: last row's Close, Open
                and Volume, formatted with thousands separators and 2 decimals.
            avg_volume: mean Volume of the last 50 rows, 0 decimals.
            range_days: "Low - High" of the last row.
            range_52: "min Low - max High" over the 52 weeks up to the last row.
            MA_50, MA_200: rolling mean of Close over 50 / 200 rows, rounded to
                one decimal (NaN when fewer rows are available).
    """
    data = pd.read_csv(ticker_name + ".csv").dropna()
    data.index = pd.to_datetime(data["Date"])

    last_row = data.iloc[-1]
    recent_data = data[-50:]

    # DataFrame.last() is deprecated, and "52W" is a Sunday-anchored offset
    # there, so an explicit mask gives a true 52-week window.
    cutoff = data.index[-1] - pd.Timedelta(weeks=52)
    last_year_data = data.loc[data.index > cutoff]

    closes = data["Close"]

    output = {}
    output["previous_close"] = "{:,.2f}".format(last_row["Close"])
    output["today_open"] = "{:,.2f}".format(last_row["Open"])
    output["today_volume"] = "{:,.2f}".format(last_row["Volume"])
    output["avg_volume"] = "{:,.0f}".format(recent_data["Volume"].mean())
    output["range_days"] = "{:,.2f} - {:,.2f}".format(
        last_row["Low"], last_row["High"]
    )
    output["range_52"] = "{:,.2f} - {:,.2f}".format(
        last_year_data["Low"].min(), last_year_data["High"].max()
    )
    output["MA_50"] = round(closes.rolling(window=50).mean().iloc[-1], 1)
    output["MA_200"] = round(closes.rolling(window=200).mean().iloc[-1], 1)

    return output

In [10]:
def group_output(ticker_name):
    """Summarise a stock: CSV price statistics plus live yfinance fields.

    The price statistics are exactly those of ``country_output``; on top of
    them the following keys are filled from ``yf.Ticker(ticker_name).info``:

        Beta: ``info["beta"]``.
        enterprise: ``info["enterpriseValue"]`` abbreviated with a T / B / M
            suffix (trillion / billion / million), 2 decimals.
        Buy, Sell: ``info["bid"]`` and ``info["ask"]``.

    Fields missing from ``info`` are reported as ``None`` instead of raising.

    Args:
        ticker_name: Ticker symbol / CSV file stem.

    Returns:
        dict with the ``country_output`` keys followed by the keys above.
    """
    # Same CSV-derived statistics as for an index; reuse instead of duplicating.
    output = country_output(ticker_name)

    # Read the info mapping once; it is used for several fields.
    info = yf.Ticker(ticker_name).info

    output["Beta"] = info.get("beta")

    num = info.get("enterpriseValue")
    if num is None:
        output["enterprise"] = None
    elif num >= 10**12:  # trillions
        output["enterprise"] = str(round(num / 10**12, 2)) + "T"
    elif num >= 10**9:  # billions
        output["enterprise"] = str(round(num / 10**9, 2)) + "B"
    elif num >= 10**6:  # millions (divisor now matches the "M" suffix)
        output["enterprise"] = str(round(num / 10**6, 2)) + "M"
    else:
        output["enterprise"] = str(num)

    output["Buy"] = info.get("bid")
    output["Sell"] = info.get("ask")

    return output

In [11]:
# Summary statistics computed from the stored ^KS11 CSV.
country_output("^KS11")

  last_year_data = data.last("52W")


{'previous_close': '2,561.24',
 'today_open': '2,598.31',
 'today_volume': '791,214.00',
 'avg_volume': '447,042',
 'range_days': '2,556.00 - 2,599.37',
 'range_52': '2,273.97 - 2,675.80',
 'MA_50': 2501.6,
 'MA_200': 2524.9}

In [12]:
# Summary statistics computed from the stored ^VNINDEX.VN CSV.
country_output("^VNINDEX.VN")

  last_year_data = data.last("52W")


{'previous_close': '1,158.59',
 'today_open': '1,160.19',
 'today_volume': '870,510.00',
 'avg_volume': '774,036',
 'range_days': '1,155.46 - 1,162.83',
 'range_52': '1,013.37 - 1,255.11',
 'MA_50': 1130.7,
 'MA_200': 1118.6}

In [13]:
# CSV statistics plus yfinance info fields (beta, enterprise value, bid/ask)
# for 055550.KS.
group_output("055550.KS")

  last_year_data = data.last("52W")


{'previous_close': '37,250.00',
 'today_open': '37,450.00',
 'today_volume': '388,703.00',
 'avg_volume': '791,931',
 'range_days': '36,800.00 - 37,500.00',
 'range_52': '32,400.00 - 44,900.00',
 'MA_50': 36813.0,
 'MA_200': 35523.2,
 'Beta': 0.621,
 'enterprise': '65.57T',
 'Buy': 37300.0,
 'Sell': 37350.0}

In [14]:
# CSV statistics plus yfinance info fields (beta, enterprise value, bid/ask)
# for VCB.VN.
group_output("VCB.VN")

  last_year_data = data.last("52W")


{'previous_close': '87,800.00',
 'today_open': '86,800.00',
 'today_volume': '2,074,500.00',
 'avg_volume': '1,549,419',
 'range_days': '86,800.00 - 87,900.00',
 'range_52': '72,057.58 - 93,400.00',
 'MA_50': 85244.0,
 'MA_200': 84131.3,
 'Beta': 0.769,
 'enterprise': '166.96T',
 'Buy': 87800.0,
 'Sell': 87900.0}

In [15]:
def output_news():
    """Load ``market.xlsx`` and return its rows indexed by the ``Date`` column.

    Returns:
        pandas.DataFrame read from the workbook, with ``Date`` as the index.
    """
    sheet = pd.read_excel('market.xlsx')
    return sheet.set_index('Date')

In [16]:
# Display the contents of market.xlsx indexed by date.
output_news()

Unnamed: 0_level_0,Korean,English,Vietnamese
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-03-10,"코스닥 지수 사상 최고치 기록(2,834.40P)","Record high of KOSDAQ index (2,834.40P)","Chỉ số KOSDAQ ghi nhận mức cao nhất (2,834.40P)"
2001-08-23,IMF 관리체제 탈피,Escape from IMF management system,Thoát khỏi hệ thống quản lý của IMF
2001-09-11,미국 911 테러,US 911 Terror,Khủng bố 911 tại Mỹ
2002-10-14,ETF 시장 개설,Opening of the ETF market,Mở thị trường ETF
2003-03-20,미국.이라크 전쟁,US-Iraq War,Chiến tranh Mỹ-Iraq
...,...,...,...
2023-09-23,항저우 아시안 게임 개최,Hangzhou Asian Games held,Tổ chức Đại hội thể thao châu Á tại Hàng Châu
2023-10-07,"하마스, 이스라엘 전쟁","Hamas, war with Israel","Hamas, chiến tranh với Israel"
2023-11-05,주식 공매도 금지 실시,Implementation of short selling ban on stocks,Thực thi cấm bán cổ phiếu khống
2023-11-17,국가 행정망 전산 마비 사태,National administrative network computer failure,Sự cố hỏng hệ thống máy tính mạng quản lý quốc...


In [17]:
def output_learn():
    """Load ``learn.xlsx`` and return its rows indexed by ``Indicators``.

    Returns:
        pandas.DataFrame read from the workbook, with the ``Indicators``
        column as the index.
    """
    sheet = pd.read_excel("learn.xlsx")
    return sheet.set_index('Indicators')

# Display the contents of learn.xlsx indexed by indicator name.
output_learn()

Unnamed: 0_level_0,Korea,English,Vietnam
Indicators,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bollinger Bands,주가가 상대적으로 높은지 낮은지를 주가의 변동성을 측정하는 기법,Technique of measuring whether the stock price...,Kỹ thuật đo biến động giá cổ phiếu để xác định...
Price Channel,"특정 기간 동안의 최고가와 최저가를 연결한 선으로, 주가의 상승세와 하락세를 판단하...",Line connecting the highest and lowest prices ...,Sử dụng đường nối giữa giá cao nhất và thấp nh...
Linear Fit,주가의 추세를 선형 방정식으로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
Quadratic Fit,주가의 추세를 2차 방정식으로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
Cubic Fit,주가의 추세를 3차 방정식으로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
Quartic Fit,주가의 추세를 4차 방정식으로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
Quintic Fit,주가의 추세를 5차 방정식으로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
Logarithmic Fit,주가의 추세를 로그 함수로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
Exponential Fit,주가의 추세를 지수 함수로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
PowerLaw Fit,주가의 추세를 거듭제곱 함수로 표현하는 방법,Method of representing the trend of stock pric...,Phương pháp biểu diễn xu hướng giá cổ phiếu bằ...
