In [1]:
import os
import re
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from sklearn.utils import shuffle

def getDataFromFiles(filefolder, category1 = "*", category2= "*", error_del = False, min_len=100, drop_duplicates=False, maxrow_by_category = 0) :
    if category1 == "*" : 
        category1 = set([re.search('_.*_', str(filename)).group().replace("_","") for filename in Path(filefolder).rglob("*.json")])
    else : 
        category1 = category1.split(",")
        
    if category2 == "*" : 
        category2 = set([re.search('_.*\.json', str(filename)).group().replace(".json","").split("_")[2] for filename in Path(filefolder).rglob("*.json")])
    else : 
        category2 = category2.split(",")
    
    df_li = []
    for cate1 in category1 :
        for cate2 in category2 :
            pattern = "*_{}_{}.json".format(cate1, cate2)
            
            tmp_df_li = []
            for path in Path(filefolder).rglob(pattern):
                try : 
                    tmp_df = pd.read_json(open(path, "r", encoding="utf8"))
                    tmp_df = tmp_df[tmp_df['content'].apply(len) > min_len] #지정 길이 이상의 데이터만 조회
                    tmp_df = tmp_df[tmp_df['category1'].apply(len) < 20]
                    tmp_df = tmp_df[tmp_df['category2'].apply(len) < 20]
                    if drop_duplicates : #중복 뉴스 삭제
                        tmp_df = tmp_df.drop_duplicates(subset='content', keep='first')
                    tmp_df_li.append(tmp_df)
                except Exception as e:
                    if error_del : os.remove(path) #오류가 있는 파일 삭제. 재작업 목적
                    print("Error file : {}".format(path))
                    print("Error message : {}".format(str(e)))
            if len(tmp_df_li) > 0 :                
                # 세분류별 데이터 건수 제한
                tmp_df = pd.concat(tmp_df_li, sort=False)
                print("-- Parttern : {} [{}] --".format(pattern, tmp_df.shape[0]))
                
                if (tmp_df.shape[0] > 0) :
                    if maxrow_by_category > 0 : tmp_df = shuffle(tmp_df)[:maxrow_by_category]
                    df_li.append(tmp_df)    
                    
    df = pd.concat(df_li, sort=False)
    print("{} rows returned".format(df.shape[0]))
    return df

df = pd.DataFrame()
df = getDataFromFiles("../../NaverNews/navernews/news/news/", category1='*', category2='*'
                      , min_len=100, drop_duplicates=True, error_del = False)
df.to_csv('./data/전체_1.csv', index=False)

-- Parttern : *_정치_북한.json [88930] --
-- Parttern : *_정치_국회정당.json [301342] --
-- Parttern : *_정치_행정.json [79279] --
-- Parttern : *_정치_국방외교.json [118892] --
Error file : ../../NaverNews/navernews/news/news/2018/03/16/20180316_정치_정치일반.json
Error message : Expected object or value
-- Parttern : *_정치_정치일반.json [569165] --
-- Parttern : *_정치_청와대.json [103199] --
Error file : ../../NaverNews/navernews/news/news/2019/03/24/20190324_생활문화_건강정보 2.json
Error message : Expected object or value
Error file : ../../NaverNews/navernews/news/news/2019/03/23/20190323_생활문화_건강정보 2.json
Error message : Expected object or value
Error file : ../../NaverNews/navernews/news/news/2020/02/22/20200222_생활문화_건강정보 2.json
Error message : Expected object or value
Error file : ../../NaverNews/navernews/news/news/2019/03/24/20190324_생활문화_종교 2.json
Error message : Expected object or value
Error file : ../../NaverNews/navernews/news/news/2020/02/22/20200222_생활문화

In [8]:
df.to_csv('./data/2019.csv', index=False)

In [3]:
import pandas as pd
df = pd.read_csv('./data/2019.csv')

KeyboardInterrupt: 

In [2]:
a = df[df['category1'] not in ['IT/과학','경제','사회','생활/문화','정치']]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
df.shape

In [None]:
df.groupby([df['category1']])['url'].agg(['count'])

In [None]:
df.groupby([df['category1'], df['category2']])['url'].agg(['count'])