# 목차
1. Data 가져오기
2. 1차 전처리 - 배추, 무, 감귤 Data groupBy해서, 합쳐주기
3. 2차 전처리 - 품목명(itemname), 상품명(kindname) 을 조절하기
4. 3차 전처리 : column 형식 맞추기
    + goods_price의 팽이버섯 unit price 100/150 해주기
        + goods_id : 20055
5. 4차 전처리 : null price row 지우기
6. goods_price table 만들기

# 1. Data 가져오기

In [2]:
%pyspark

# Build the month boundaries for the twelve months starting 2021-10-01.
import datetime
from dateutil.relativedelta import relativedelta

start = []     # first day of each month, formatted 'YYYY-MM-DD'
end = []       # last day of each month, formatted 'YYYY-MM-DD'
filename = []  # first day of each month, formatted 'YYYYMMDD'

# Kept as a date object (not a string) so it can be advanced month by month.
start_date = datetime.date(2021, 10, 1)

for _ in range(12):
    start.append(start_date.strftime('%Y-%m-%d'))
    filename.append(start_date.strftime('%Y%m%d'))
    # Jump to the first day of the next month; one day earlier is the last
    # day of the current month.
    start_date = start_date + relativedelta(months=1)
    end_date = start_date - datetime.timedelta(days=1)
    end.append(end_date.strftime('%Y-%m-%d'))

print(filename)
    

In [3]:
%pyspark
# Load every monthly CSV named in `filename` and hand the combined table to Spark.
import pandas as pd

# Read each month exactly once and concatenate in a single call: growing a
# DataFrame with pd.concat inside a loop re-copies all earlier rows each time.
# filename[0] is '20211001', so the first file is no longer hard-coded.
monthly_pdfs = [
    pd.read_csv(f'/nongsanmul_data/{date}.csv', encoding='cp949')
    for date in filename
]
result_pdf = pd.concat(monthly_pdfs)

# Round the two price columns to two decimals before creating the Spark DataFrame.
result_pdf = result_pdf.round({'price':2, 'unit_price':2})
result_sdf = spark.createDataFrame(result_pdf)
result_sdf.show()

In [4]:
%pyspark
# Inspect the distinct item names present in the loaded data.
result_pdf['itemname'].unique()

In [5]:
%pyspark
# Inspect the rows whose itemname is '포도'.
result_pdf[result_pdf['itemname']=='포도']

In [6]:
%pyspark
# Inspect the distinct kind names present in the loaded data.
result_pdf['kindname'].unique()

# 2. 1차 전처리 - 배추, 무, 감귤 Data groupBy해서, 합쳐주기

## [1] 배추, 무, 감귤 Data groupBy하기

In [9]:
%pyspark
from pyspark.sql.functions import col, avg, lit
# Rows for the item '배추', shown in date order for inspection.
baechu = result_sdf.where(col('itemname') == '배추')
baechu.sort('research_date').show(n=100)

# Average price and unit_price over all '배추' rows sharing a research_date,
# label the result with one fixed kindname, and put the columns in the
# order itemname, kindname, research_date, price, unit_price.
baechu_grouped = (
    baechu
    .groupBy('itemname', 'research_date')
    .agg(
        avg('price').alias('price'),
        avg('unit_price').alias('unit_price'),
    )
    .sort('research_date')
    .withColumn('kindname', lit('배추(1포기)'))
    .select('itemname', 'kindname', 'research_date', 'price', 'unit_price')
)
baechu_grouped.show(n=100)

In [10]:
%pyspark
# Rows for the item '무', shown for inspection.
mu = result_sdf.where(col('itemname') == '무')
mu.show(n=100)

# Average price and unit_price over all '무' rows sharing a research_date,
# label the result with one fixed kindname, and put the columns in the
# order itemname, kindname, research_date, price, unit_price.
mu_grouped = (
    mu
    .groupBy('itemname', 'research_date')
    .agg(
        avg('price').alias('price'),
        avg('unit_price').alias('unit_price'),
    )
    .sort('research_date')
    .withColumn('kindname', lit('무(1개)'))
    .select('itemname', 'kindname', 'research_date', 'price', 'unit_price')
)
mu_grouped.show(n=100)

In [11]:
%pyspark
# Rows for the item '감귤', shown for inspection.
gamgyul = result_sdf.where(col('itemname') == '감귤')
gamgyul.show(n=100)

# Average price and unit_price over all '감귤' rows sharing a research_date,
# label the result with one fixed kindname, and put the columns in the
# order itemname, kindname, research_date, price, unit_price.
gamgyul_grouped = (
    gamgyul
    .groupBy('itemname', 'research_date')
    .agg(
        avg('price').alias('price'),
        avg('unit_price').alias('unit_price'),
    )
    .sort('research_date')
    .withColumn('kindname', lit('감귤(10개)'))
    .select('itemname', 'kindname', 'research_date', 'price', 'unit_price')
)
gamgyul_grouped.show(n=100)

## [2] 원래 spark df와 합쳐주기 -> 1차 전처리가 완료된 first_sdf 생성
+ 결과물 : first_sdf

In [13]:
%pyspark
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

def unionAll(*dfs):
    """Stack the given Spark DataFrames into one, matching columns by name.

    The previous implementation folded with ``DataFrame.unionAll``, which is
    deprecated and pairs columns purely by position, so a different column
    order in one input would silently mix values between columns.
    ``unionByName`` pairs columns by their names instead.

    Args:
        *dfs: one or more Spark DataFrames that share the same column names.

    Returns:
        A DataFrame containing the rows of every input (duplicates kept).

    Raises:
        TypeError: if called with no DataFrames (``reduce`` of an empty
            sequence without an initial value).
    """
    return reduce(DataFrame.unionByName, dfs)
    
# Drop the raw rows of the three items that were aggregated per date, then
# append their aggregated replacements.
regrouped_items = ['배추', '무', '감귤']
deleted_sdf = result_sdf.filter(~col('itemname').isin(regrouped_items))

first_sdf = unionAll(deleted_sdf, baechu_grouped, mu_grouped, gamgyul_grouped)
first_sdf.sort('research_date', 'itemname').show(n=1000)

In [14]:
%pyspark
# List the distinct item names left after the first preprocessing step.
first_sdf.select('itemname').distinct().show(n=1000)

# 3. 2차 전처리 - 품목명(itemname), 상품명(kindname) 을 조절하기
+ 결과물 : second_sdf_kindname

## [1] 품목명 조절

In [17]:
%pyspark
from pyspark.sql.functions import col, when
# pandas copy of the first-stage result.
first_pdf = first_sdf.toPandas()

# Rename the item '깐마늘(국산)' to '깐마늘'; every other itemname is kept.
is_domestic_peeled_garlic = col('itemname') == '깐마늘(국산)'
normalized_itemname = when(is_domestic_peeled_garlic, '깐마늘').otherwise(col('itemname'))
second_sdf_itemname = first_sdf.withColumn('itemname', normalized_itemname)

second_sdf_itemname.select('itemname').distinct().show(n=1000)
second_sdf_itemname.sort('research_date', 'itemname').show(n=1000)

## [2] 상품명 조절

In [19]:
%pyspark
# Items whose kindname is replaced by one fixed label, keyed by itemname.
kindname_by_itemname = {
    '감자': '수미감자(100g)',
    '건블루베리': '건블루베리(수입)(100g)',
    '건포도': '건포도(수입)(100g)',
    '고구마': '밤고구마(1kg)',
    '고춧가루': '고춧가루(국산)(1kg)',
    '녹두': '녹두(국산)(500g)',
    '당근': '무세척당근(1kg)',
    '레몬': '레몬(수입)(10개)',
    '바나나': '바나나(수입)(100g)',
    '배': '신고배(10개)',
    '팥': '붉은팥(국산)(500g)',
    '상추': '적상추(100g)',
    '생강': '생강(국산)(1kg)',
    '쌀': '쌀(일반계)(20kg)',
    '아몬드': '아몬드(수입)(100g)',
    '참깨': '백색참깨(국산)(500g)',
    '참다래': '참다래(그린뉴질랜드)(10개)',
    '찹쌀': '찹쌀(일반계)(1kg)',
    '파인애플': '파인애플(수입)(1개)',
    '피망': '청피망(100g)',
    '호두': '호두(수입)(100g)',
    '오이': '가시오이(10개)',
    '사과': '후지사과(10개)',
    '망고': '망고(수입)(1개)',
    '오렌지': '오렌지(네이블미국)(10개)',
    '복숭아': '백도(10개)',
    '체리': '체리(수입)(100g)',
    '콩': '흰콩(국산)(500g)',
}

# '땅콩' is the only item whose new label also depends on its current kindname.
peanut_kindname_by_raw_kindname = {
    '수입(100g)': '땅콩(수입)(100g)',
    '국산(100g)': '땅콩(국산)(100g)',
}

itemname_col = col('itemname')
kindname_col = col('kindname')

# The conditions test different itemname / kindname values, so at most one
# can hold for a row and the order of the branches does not affect the result.
kindname_branches = [
    (itemname_col == itemname, label)
    for itemname, label in kindname_by_itemname.items()
]
kindname_branches += [
    ((itemname_col == '땅콩') & (kindname_col == raw_kindname), label)
    for raw_kindname, label in peanut_kindname_by_raw_kindname.items()
]

first_condition, first_label = kindname_branches[0]
kindname_expr = when(first_condition, first_label)
for condition, label in kindname_branches[1:]:
    kindname_expr = kindname_expr.when(condition, label)

# Rows matching no branch keep their existing kindname.
second_sdf_kindname = second_sdf_itemname.withColumn(
    'kindname', kindname_expr.otherwise(kindname_col)
)

second_sdf_kindname.select('itemname', 'kindname').distinct().show(n=100)













# 4. 3차 전처리 : column 형식 맞추기
+ 결과물 : third_sdf_business
+ itemname(product_name), kindname(goods_name), research_date,   price, unit_price ---> product_id, product_name, goods_id, goods_name, unit_price, price, research_date, business


## [1] column 명 product_name, goods_name 으로  변경

In [22]:
%pyspark
# Rename itemname -> product_name and kindname -> goods_name.
third_sdf_name = (
    second_sdf_kindname
    .withColumnRenamed('itemname', 'product_name')
    .withColumnRenamed('kindname', 'goods_name')
)
third_sdf_name.select('goods_name').distinct().show(n=50)



## [2] goods_id column 생성

### (1) goods_id column 생성

In [25]:
%pyspark

# Fixed goods_name -> goods_id table. Note that 20011 is not assigned.
goods_id_by_goods_name = {
    '후지사과(10개)': 20000,
    '감귤(10개)': 20001,
    '건블루베리(수입)(100g)': 20002,
    '건포도(수입)(100g)': 20003,
    '단감(10개)': 20004,
    '레몬(수입)(10개)': 20005,
    '망고(수입)(1개)': 20006,
    '바나나(수입)(100g)': 20007,
    '신고배(10개)': 20008,
    '백도(10개)': 20009,
    '오렌지(네이블미국)(10개)': 20010,
    '참다래(그린뉴질랜드)(10개)': 20012,
    '체리(수입)(100g)': 20013,
    '파인애플(수입)(1개)': 20014,
    '캠벨얼리(1kg)': 20015,
    '밤고구마(1kg)': 20016,
    '녹두(국산)(500g)': 20017,
    '쌀(일반계)(20kg)': 20018,
    '찹쌀(일반계)(1kg)': 20019,
    '흰콩(국산)(500g)': 20020,
    '붉은팥(국산)(500g)': 20021,
    '수미감자(100g)': 20022,
    '고춧가루(국산)(1kg)': 20023,
    '깐마늘(국산)(1kg)': 20024,
    '깻잎(100g)': 20025,
    '무세척당근(1kg)': 20026,
    '딸기(100g)': 20027,
    '멜론(1개)': 20028,
    '무(1개)': 20029,
    '방울토마토(1kg)': 20030,
    '배추(1포기)': 20031,
    '붉은고추(100g)': 20032,
    '적상추(100g)': 20033,
    '생강(국산)(1kg)': 20034,
    '수박(1개)': 20035,
    '시금치(1kg)': 20036,
    '양배추(1포기)': 20037,
    '양파(1kg)': 20038,
    '얼갈이배추(1kg)': 20039,
    '열무(1kg)': 20040,
    '가시오이(10개)': 20041,
    '참외(10개)': 20042,
    '토마토(1kg)': 20043,
    '대파(1kg)': 20044,
    '파프리카(200g)': 20045,
    '풋고추(100g)': 20046,
    '청피망(100g)': 20047,
    '애호박(1개)': 20048,
    '애느타리버섯(100g)': 20049,
    '땅콩(국산)(100g)': 20050,
    '땅콩(수입)(100g)': 20051,
    '새송이버섯(100g)': 20052,
    '아몬드(수입)(100g)': 20053,
    '백색참깨(국산)(500g)': 20054,
    '팽이버섯(150g)': 20055,
    '호두(수입)(100g)': 20056,
}

# Fold the table into one when(...).when(...) expression. There is no
# otherwise(), so a goods_name missing from the table gets a null goods_id.
goods_id_expr = None
for _goods_name, _goods_id in goods_id_by_goods_name.items():
    _matches = col('goods_name') == _goods_name
    if goods_id_expr is None:
        goods_id_expr = when(_matches, _goods_id)
    else:
        goods_id_expr = goods_id_expr.when(_matches, _goods_id)

third_sdf_goods_id = third_sdf_name.withColumn('goods_id', goods_id_expr)

third_sdf_goods_id.select('goods_id', 'goods_name').distinct().sort('goods_id').show(n=100)


In [26]:
%pyspark
# Show every distinct (goods_id, goods_name) pair, ordered by goods_id.
third_sdf_goods_id.select('goods_id','goods_name').distinct().orderBy('goods_id').show(n=100)

### (2) goodsName_goodsId, goodsId_goodsName pickle 생성 & 저장

In [28]:
%pyspark
# Display the flattened [goods_name, goods_id, ...] list.
# NOTE(review): goodsNameId is assigned in the following cell, so this cell
# raises NameError when the notebook is run top to bottom - confirm whether
# it should be moved after that cell.
goodsNameId

In [29]:
%pyspark

# Flat list alternating goods_name, goods_id for every distinct pair.
goodsNameId = (
    third_sdf_goods_id
    .select(['goods_name', 'goods_id'])
    .distinct()
    .rdd.flatMap(lambda x: x)
    .collect()
)

# Even positions hold the names, odd positions the matching ids.
goods_names = goodsNameId[0::2]
goods_ids = goodsNameId[1::2]

goodsId_goodsName = dict(zip(goods_ids, goods_names))
goodsName_goodsId = dict(zip(goods_names, goods_ids))

print(goodsName_goodsId)
print(goodsId_goodsName)


In [30]:
%pyspark
import pickle

# Write both lookup directions (name -> id, id -> name) to pickle files.
for pickle_path, mapping in (
    ('/nongsanmul_data/nong_goodsName_goodsId.pickle', goodsName_goodsId),
    ('/nongsanmul_data/nong_goodsId_goodsName.pickle', goodsId_goodsName),
):
    with open(pickle_path, 'wb') as fw:
        pickle.dump(mapping, fw)

## [3] product_id column 생성
+ 결과물 : third_sdf_product_rename ->  third_sdf_product_id


### (1) product_name 바꾸기
+ 풋고추와 붉은고추 합치기 -> 고추
+ 새송이버섯, 팽이버섯, 느타리버섯 합치기 -> 버섯
+ ~~배추, 얼갈이배추 합치기 -> 배추~~ 취소

In [33]:
%pyspark
from pyspark.sql.functions import col, when
# Merge product names: '풋고추' and '붉은고추' become '고추', and any name
# ending in '버섯' becomes '버섯'. All other names are kept.
# (Merging '얼갈이배추' into '배추' was tried and cancelled.)
is_chili_pepper = col('product_name').isin('풋고추', '붉은고추')
is_mushroom = col('product_name').like('%버섯')

third_sdf_product_rename = third_sdf_goods_id.withColumn(
    'product_name',
    when(is_chili_pepper, '고추')
    .when(is_mushroom, '버섯')
    .otherwise(col('product_name')),
)

third_sdf_product_rename.select('product_name', 'goods_name').distinct().show(n=100)

### (2) {product_id : product_name}, {product_name : product_id} 생성 & 저장

In [35]:
%pyspark
### Build
# distinct() gives no ordering guarantee, so the names are sorted before ids
# are handed out; otherwise the same product could receive a different id on
# every run and the saved pickles would disagree between runs.
product_name_lst = (
    third_sdf_product_rename
    .select('product_name')
    .distinct()
    .orderBy('product_name')
    .rdd.flatMap(lambda x: x)
    .collect()
)

# Ids start at 1000 and follow the sorted name order.
productName_productId = {}
productId_productName = {}
for productId, productName in enumerate(product_name_lst, start=1000):
    productName_productId[productName] = productId
    productId_productName[productId] = productName

print(productName_productId)
print(productId_productName)


### Save
import pickle
with open('/nongsanmul_data/nong_productName_productId.pickle','wb') as fw:
    pickle.dump(productName_productId, fw)

with open('/nongsanmul_data/nong_productId_productName.pickle','wb') as fw:
    pickle.dump(productId_productName, fw)


### (3) product_id column 생성

In [38]:
%pyspark
### pickle load
import pickle
# NOTE(review): pickle.load can execute arbitrary code; this is only safe as
# long as the file is the one written by this notebook.
with open('/nongsanmul_data/nong_productName_productId.pickle','rb') as fr:
    productName_productId = pickle.load(fr)

### Create the product_id column
from pyspark.sql.functions import col, lit, when

# Build one CASE WHEN expression and add the column with a single
# withColumn call. Calling withColumn once per product in a loop stacks one
# projection per call and can produce a very large query plan.
product_id_expr = None
for product_name, product_id in productName_productId.items():
    is_this_product = col('product_name') == product_name
    if product_id_expr is None:
        product_id_expr = when(is_this_product, product_id)
    else:
        product_id_expr = product_id_expr.when(is_this_product, product_id)

# 999999 marks a product_name that is missing from the mapping.
if product_id_expr is None:
    product_id_expr = lit(999999)
else:
    product_id_expr = product_id_expr.otherwise(lit(999999))

third_sdf_product_id = third_sdf_product_rename.withColumn('product_id', product_id_expr)

third_sdf_product_id.select('product_name','product_id').distinct().show(n=100)

### (4) goodsId_productId.pickle & productId_goodsId.pickle 생성& 저장

In [40]:
%pyspark
# Collect the distinct (product_id, goods_id) pairs into a pandas DataFrame.
productId_goodsId_pdf = third_sdf_product_id.select('product_id','goods_id').distinct().toPandas()

In [41]:
%pyspark
### Build
# goodsId_productId : goods_id -> product_id
# productId_goodsId : product_id -> list of goods_id belonging to it
goodsId_productId = {}
productId_goodsId = {}

product_id_values = productId_goodsId_pdf['product_id'].to_numpy()
goods_id_values = productId_goodsId_pdf['goods_id'].to_numpy()

# Walk both columns in lockstep, one (product_id, goods_id) pair per row.
for product_id, goods_id in zip(product_id_values, goods_id_values):
    goodsId_productId[goods_id] = product_id
    productId_goodsId.setdefault(product_id, []).append(goods_id)

print(goodsId_productId)
print(productId_goodsId)

In [42]:
%pyspark
### 저장
import pickle
# Write both goods/product lookup tables to pickle files.
for pickle_path, mapping in (
    ('/nongsanmul_data/nong_goodsId_productId.pickle', goodsId_productId),
    ('/nongsanmul_data/nong_productId_goodsId.pickle', productId_goodsId),
):
    with open(pickle_path, 'wb') as fw:
        pickle.dump(mapping, fw)

## [4] business column 생성
+ 결과물 : third_sdf_business

In [44]:
%pyspark
from pyspark.sql.functions import col, lit
# Add a constant 'business' column holding 'm' on every row.
business_code = lit('m')
third_sdf_business = third_sdf_product_id.withColumn('business', business_code)
third_sdf_business.show()

In [45]:
%MySQL
select * from category


# 5. 4차 전처리 : null price row 지우기 

In [47]:
%pyspark
# Keep only rows that have a value in both price columns.
price_columns = ['price', 'unit_price']
forth_sdf_notNull = third_sdf_business.na.drop(how='any', subset=price_columns)
forth_sdf_notNull.show()

In [48]:
%pyspark
from pyspark.sql.functions import col, isnan
# Check: show any remaining rows whose price is NaN.
forth_sdf_notNull.filter(isnan(col('price'))).show()

# Check: show any remaining rows whose price is null.
forth_sdf_notNull.filter(col('price').isNull()).show()

# 6.  goods_price table 만들기
+ 팽이버섯의 unit_price 바꾸기 : 150g 기준을 100g 기준으로 바꾸기
+ column 맞추고
+ 2021-09-07 ~ 2022-09-29 까지 값 채우기 (원본 데이터는 2021-10-01부터 시작하므로, 그 이전 날짜는 각 상품의 첫 관측값으로 채움)

## [1] 전처리 결과물 가져오기 & 팽이버섯의 unit_price 바꾸기 : 150g 기준을 100g 기준으로 바꾸기

In [51]:
%pyspark
import pandas as pd
# Bring the preprocessed Spark result into pandas and display it.
afterPreprocessing_pdf = forth_sdf_notNull.toPandas()
afterPreprocessing_pdf

In [52]:
%pyspark
# Scale the unit_price of goods whose name contains '팽이버섯' by 10/15.
# NOTE(review): the scaling statement below is commented out, so as written
# this cell only displays the rows and unit_price is left unchanged, although
# the section heading says the conversion should be applied. The statement
# multiplies in place, so running it more than once would scale the price
# repeatedly - confirm whether it was disabled for that reason.
# afterPreprocessing_pdf.loc[afterPreprocessing_pdf['goods_name'].str.contains('팽이버섯'),'unit_price'] *= (10/15)
afterPreprocessing_pdf[afterPreprocessing_pdf['goods_name'].str.contains('팽이버섯')]

## [2] 함수 정의
+ input df의 column
    + goods_id, unit_price,  price, research_date, business로 구성되어 있음
+ 제약사항
    + business 값이 모두 같은 경우에만 정상작동함 


In [54]:
%pyspark
# 코드 줄인 버전

import datetime
import numpy as np
import pandas as pd
# 초기세팅
# tprice_goods_ssm_price_table.reset_index(drop=True, inplace=True)
# tprice_goods_ssm_price_table = pd.read_csv('/DB_data/참가격_주데이터_상품_가격_SSM.csv')
# start_date = datetime.date(2021,9,7)
# end_date = datetime.date(2022,9,30)






# Turn irregularly dated price observations into one row per goods per day.
def _as_date(value):
    """Coerce a date-like value to ``datetime.date``.

    Accepts a ``datetime.datetime`` (or subclass), a ``datetime.date``, or
    anything whose ``str()`` is formatted ``'YYYY-MM-DD'``.

    Raises:
        ValueError: if the value is a string that is not ``'%Y-%m-%d'``.
    """
    if isinstance(value, datetime.datetime):
        return value.date()
    if isinstance(value, datetime.date):
        return value
    return datetime.datetime.strptime(str(value), "%Y-%m-%d").date()


def inserting_date(start_date, end_date, input_df, business):  # works on a pandas DataFrame
    """Expand sparse price observations into a daily series per goods_id.

    For every goods_id in ``input_df`` one row is produced for each day from
    ``start_date`` to ``end_date`` (both inclusive). Each day carries the
    values of the most recent observation on or before that day. Days before
    the first observation of a goods_id carry the values of that first
    observation.

    Differences from the previous implementation:
      * the input no longer has to be pre-sorted with a fresh 0..n-1 index;
      * an observation dated before ``start_date`` (or a malformed date)
        no longer makes the function loop forever;
      * observations after ``end_date`` no longer produce rows past the
        window;
      * an empty ``input_df`` returns an empty result instead of failing;
      * rows are collected in a list and the result is built once, instead
        of growing the result DataFrame one row at a time;
      * the leftover debug ``print`` calls are gone.

    Args:
        start_date: first day of the output window (``datetime.date``).
        end_date: last day of the output window, inclusive.
        input_df: pandas DataFrame with the columns goods_id, unit_price,
            price, research_date and business. research_date is a
            ``'YYYY-MM-DD'`` string or a date object.
        business: kept for backward compatibility. The previous
            implementation used it only for an internal sentinel row; the
            business value of each output row comes from ``input_df``.

    Returns:
        pandas DataFrame with the columns goods_id, unit_price, price,
        research_date (``'YYYY-MM-DD'`` string) and business, ordered by
        goods_id and then by date.

    Raises:
        ValueError: if a research_date cannot be parsed.
        KeyError: if one of the five required columns is missing.
    """
    columns = ['goods_id', 'unit_price', 'price', 'research_date', 'business']

    first_day = _as_date(start_date)
    last_day = _as_date(end_date)

    # Every calendar day of the window, plus its string form for the output.
    # An inverted window gives a negative range and therefore no days.
    days = [
        first_day + datetime.timedelta(days=offset)
        for offset in range((last_day - first_day).days + 1)
    ]
    day_labels = [day.strftime("%Y-%m-%d") for day in days]

    # Work on a copy so the caller's DataFrame is not modified.
    observations = input_df[columns].copy()
    observations['_observed_on'] = [
        _as_date(value) for value in observations['research_date']
    ]
    observations = observations.sort_values(['goods_id', '_observed_on'])

    rows = []
    for goods_id, group in observations.groupby('goods_id', sort=True):
        history = list(zip(
            group['_observed_on'],
            group['unit_price'],
            group['price'],
            group['business'],
        ))
        # Index of the observation currently in effect. Starting at 0 means
        # days before the first observation reuse the first observation.
        latest = 0
        for day, label in zip(days, day_labels):
            # Move on to every observation that has happened by this day;
            # with several observations on one day the last one wins.
            while latest + 1 < len(history) and history[latest + 1][0] <= day:
                latest += 1
            _, unit_price, price, row_business = history[latest]
            rows.append((goods_id, unit_price, price, label, row_business))

    return pd.DataFrame(rows, columns=columns)

## [3] 값 채워넣기

In [56]:
%pyspark

import datetime
import pandas as pd

# Date window to fill (both ends are passed to inserting_date).
start_date = datetime.date(year=2021, month=9, day=7)
end_date = datetime.date(year=2022, month=9, day=29)

# Keep only the five columns inserting_date works with, ordered by goods and
# date with a fresh 0..n-1 index, then expand them to daily rows.
goods_price_columns = ['goods_id', 'unit_price', 'price', 'research_date', 'business']
nong_goods_price_before = (
    afterPreprocessing_pdf[goods_price_columns]
    .sort_values(by=['goods_id', 'research_date'])
    .reset_index(drop=True)
)
nong_goods_price_after = inserting_date(
    start_date,
    end_date,
    nong_goods_price_before,
    business='m',
)

In [57]:
%pyspark
# Lift the pandas row display limit, then show the last 300 input rows.
pd.set_option('display.max_rows', None)
nong_goods_price_before.tail(300)

In [58]:
%pyspark
# Show the last 500 rows of the daily-filled result.
nong_goods_price_after.tail(500)

## [4] [base] 농산물 goods_price table 저장하기


In [60]:
%pyspark
# Write the daily-filled result to CSV with a header row and no index column.
nong_goods_price_after.to_csv('/nongsanmul_data/from20210907_to20220929_nongsanmul_daily_base.csv', header=True, index=False)

In [61]:
%MySQL
SELECT * from goods_price
