### 데이터 병합
* 코인 데이터 : 07_bitcoin.csv
* 코로나 데이터
    * 전체 : 01_corona.csv
    * 주말 제거 : 02_corona_without_weekend.csv
* 주식 데이터 : 09_kospi.csv
* 뉴스 데이터
    * 전체 : news_all_data.csv
    * 5개 컬럼 : news_data_five_columns.csv
        * 날짜, 제목, 키워드, 특성추출(가중치순 상위 50개), 본문
    * 2개 컬럼 : news_title.csv
        * 날짜, 제목
    * 날짜별 뉴스 개수 : 06_news_cnt.csv
        * 날짜, 뉴스 개수
* 결과 데이터
    * 코로나(전체), 주식, 코인, 뉴스 개수를 병합한 csv : 11_data_merge.csv

In [2]:
import pandas as pd
from datetime import datetime

In [4]:
# Load the five input CSVs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
corona, corona_without_weekend, bitcoin, kospi, news_cnt = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./CSV/01_corona.csv",
        "./CSV/02_corona_without_weekend.csv",
        "./CSV/07_bitcoin.csv",
        "./CSV/09_kospi.csv",
        "./CSV/06_news_cnt.csv",
    )
)

In [5]:
# Report (rows, columns) for every loaded table so that size differences are
# visible before the tables are merged.
for label, frame in (
    ("corona shape :", corona),
    ("corona_without_weekend shape :", corona_without_weekend),
    ("bitcoin shape :", bitcoin),
    ("kospi shape :", kospi),
    ("news_cnt shape :", news_cnt),
):
    print(label, frame.shape)

corona shape : (590, 2)
corona_without_weekend shape : (422, 2)
bitcoin shape : (590, 2)
kospi shape : (400, 2)
news_cnt shape : (590, 2)


In [6]:
# Show column names, non-null counts and dtypes of every loaded table.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line after every
# summary.
corona.info()
print("=" * 50)
corona_without_weekend.info()
print("=" * 50)
bitcoin.info()
print("=" * 50)
kospi.info()
print("=" * 50)
news_cnt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   날짜       590 non-null    object
 1   코로나 확진자  590 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   날짜       422 non-null    object
 1   코로나 확진자  422 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      590 non-null    object 
 1   비트코인    590 non-null    float64
dtypes: float64(1), object(1)
memory usage: 9.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data 

### 날짜 데이터 타입 변경

In [9]:
# Parse the 날짜 (date) column of every table from "YYYY-MM-DD" text into
# datetime values, so the merges below join on real dates rather than strings.
for frame in (corona, corona_without_weekend, bitcoin, kospi, news_cnt):
    frame["날짜"] = pd.to_datetime(frame["날짜"], format="%Y-%m-%d")

In [10]:
# Re-inspect every table to confirm the dtype of the 날짜 column.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line after every
# summary.
corona.info()
print("=" * 50)
corona_without_weekend.info()
print("=" * 50)
bitcoin.info()
print("=" * 50)
kospi.info()
print("=" * 50)
news_cnt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   날짜       590 non-null    datetime64[ns]
 1   코로나 확진자  590 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 9.3 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   날짜       422 non-null    datetime64[ns]
 1   코로나 확진자  422 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 6.7 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      590 non-null    datetime64[ns]
 1   비트코인    590 non-null    float64       
dtypes: datetime64[ns](1), f

### corona, bitcoin merge

In [13]:
# Join the corona table with the bitcoin table on the shared 날짜 column.
# No `how` is given, so this is pandas' default inner join: only dates present
# in both tables are kept.
data_merge_df = corona.merge(bitcoin, on="날짜")
data_merge_df.head(3)

Unnamed: 0,날짜,코로나 확진자,비트코인
0,2020-01-20,1,9884000.0
1,2020-01-21,0,9878000.0
2,2020-01-22,0,9887000.0


In [14]:
# Check the row count, non-null counts and dtypes of the merged frame.
data_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590 entries, 0 to 589
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   날짜       590 non-null    datetime64[ns]
 1   코로나 확진자  590 non-null    int64         
 2   비트코인     590 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 18.4 KB


### data_merge_df, kospi merge

In [15]:
# Add the KOSPI table. The outer join keeps every date from both sides, so
# dates that have no KOSPI row end up with NaN in the 코스피 column.
data_merge_df = pd.merge(data_merge_df, kospi, on="날짜", how="outer")
data_merge_df.head(3)

Unnamed: 0,날짜,코로나 확진자,비트코인,코스피
0,2020-01-20,1,9884000.0,2262.639893
1,2020-01-21,0,9878000.0,2239.689941
2,2020-01-22,0,9887000.0,2267.25


In [16]:
# Check non-null counts after the outer merge: kospi has fewer rows than the
# other tables (400 vs 590), so 코스피 is expected to contain missing values.
data_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590 entries, 0 to 589
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   날짜       590 non-null    datetime64[ns]
 1   코로나 확진자  590 non-null    int64         
 2   비트코인     590 non-null    float64       
 3   코스피      400 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 23.0 KB


In [17]:
# Preview the first 20 rows; rows whose date has no KOSPI entry show NaN in 코스피.
data_merge_df.head(20)

Unnamed: 0,날짜,코로나 확진자,비트코인,코스피
0,2020-01-20,1,9884000.0,2262.639893
1,2020-01-21,0,9878000.0,2239.689941
2,2020-01-22,0,9887000.0,2267.25
3,2020-01-23,0,9661000.0,2246.129883
4,2020-01-24,1,9830000.0,
5,2020-01-25,0,9692000.0,
6,2020-01-26,1,9841000.0,
7,2020-01-27,1,10050000.0,
8,2020-01-28,0,10282000.0,2176.719971
9,2020-01-29,0,10651000.0,2185.280029


### data_merge_df, news_cnt merge

In [18]:
# Attach the per-day news count from news_cnt on the shared 날짜 column.
# No `how` is given, so this is pandas' default inner join.
data_merge_df = pd.merge(data_merge_df, news_cnt, on="날짜")
data_merge_df.head(20)

Unnamed: 0,날짜,코로나 확진자,비트코인,코스피,뉴스 개수
0,2020-01-20,1,9884000.0,2262.639893,273
1,2020-01-21,0,9878000.0,2239.689941,432
2,2020-01-22,0,9887000.0,2267.25,563
3,2020-01-23,0,9661000.0,2246.129883,627
4,2020-01-24,1,9830000.0,,473
5,2020-01-25,0,9692000.0,,374
6,2020-01-26,1,9841000.0,,577
7,2020-01-27,1,10050000.0,,1109
8,2020-01-28,0,10282000.0,2176.719971,3190
9,2020-01-29,0,10651000.0,2185.280029,2917


In [19]:
# Preview the last 20 rows of the fully merged frame.
data_merge_df.tail(20)

Unnamed: 0,날짜,코로나 확진자,비트코인,코스피,뉴스 개수
570,2021-08-12,1987,51612000.0,3208.379883,3307
571,2021-08-13,1989,53955000.0,3171.290039,2641
572,2021-08-14,1928,54570000.0,,754
573,2021-08-15,1816,53700000.0,,1018
574,2021-08-16,1554,53763000.0,,1863
575,2021-08-17,1372,53749000.0,3143.090088,3040
576,2021-08-18,1803,53212000.0,3158.929932,2878
577,2021-08-19,2152,53514000.0,3097.830078,2767
578,2021-08-20,2050,56822000.0,3060.51001,2247
579,2021-08-21,1877,57532000.0,,682


### CSV 파일로 저장
* index 제거

In [20]:
# Write the merged frame to disk; index=False leaves the row index out of the file.
data_merge_df.to_csv("./CSV/11_data_merge.csv", index = False)