# 청와대 청원 분석 

1. 청와대 청원 웹크롤링 
2. 텍스트 클렌징, 단어 토큰화, 불용어 제거, 어근 추출 
3. TF-IDF 기반 벡터화 수행(lemmatization) 
4. kmeans 군집화: n 바꿔가면서 최적의 군집단위 찾기 
5. cluster_centers_로 각 군집별 핵심 단어 찾기 

## 파일 불러오기

In [1]:
import pandas as pd
import numpy as np 
import re

In [2]:
df1 = pd.read_excel('우한폐렴-BlueHouse-Petition-data.xlsx', index_col=0, header=1, encoding='utf-8')
df2 = pd.read_excel('코로나-BlueHouse-Petition-data.xlsx', index_col=0, header=1, encoding='utf-8')
df3 = pd.read_excel('코비드-BlueHouse-Petition-data.xlsx', index_col=0, encoding='utf-8')
df4 = pd.read_excel('corona-BlueHouse-Petition-data.xlsx', index_col=0, header=1, encoding='utf-8')
df5 = pd.read_excel('covid-BlueHouse-Petition-data.xlsx', index_col=0, header=1, encoding='utf-8')
df6 = pd.read_excel('covid19-BlueHouse-Petition-data.xlsx', index_col=0, encoding='utf-8')

In [3]:
print(df1.shape, df2.shape, df3.shape, df4.shape, df5.shape, df6.shape)

(156, 8) (2996, 8) (3, 8) (2, 8) (43, 8) (10, 8)


In [4]:
petition_raw = pd.concat([df1, df2, df3, df4, df5, df6])
petition_raw.shape

(3210, 8)

## 중복 데이터 제거

In [5]:
petition = petition_raw.drop_duplicates()
petition.shape

(3105, 8)

## 2020년 이외 데이터 제외

In [6]:
month = pd.to_datetime(petition['청원시작일']).dt.month
year = pd.to_datetime(petition['청원시작일']).dt.year
petition['월'] = month
petition['년'] = year
petition = petition.sort_values(by='월')
petition.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


(3105, 10)

In [7]:
print('2020년 이외의 데이터: ', len(petition[petition['년'] < 2020]))
petition = petition[petition['년'] == 2020]
petition.shape

2020년 이외의 데이터:  10


(3095, 10)

## 텍스트 데이터 전처리

In [8]:
def preprocessing(text): 
    # 개행문자 제거 
    text = re.sub('\\\n',' ', text)
    # 한글 영문만 남기고 모두 제거 (숫자, 특수문자도 제거)
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', text)
    return text 

In [13]:
%time petition['청원내용'] = petition['청원내용'].apply(preprocessing)

CPU times: user 255 ms, sys: 23.4 ms, total: 278 ms
Wall time: 278 ms


In [16]:
from konlpy.tag import Okt  
okt=Okt()  

okt.morphs(petition['청원내용'])

TypeError: startJVM() got an unexpected keyword argument 'convertStrings'

### 불용어 제거

In [None]:
def remove_stopwords(text):
    tokens = text.split(' ')
    stops = ['수', '현', '있는', '있습니다', '그', '년도', '합니다', '하는', '및', '제', '할', '하고', 
             '더', '대한', '한', '그리고', '월', '저는', '없는', '입니다', '등', '일', '많은', '이런', '것은', 
             '왜','같은', '같습니다', '없습니다', '위해', '한다']
    meaningful_words = [w for w in tokens if not w in stops]
    return ' '.join(meaningful_words)

In [None]:
tokens_pre = tokens.apply(remove_stopwords)
tokens_pre[:3]

In [None]:
petition_feb = petition[(petition['월'] == 1) | (petition['월'] == 2)]
petition_mar = petition[petition['월'] == 3]
petition_apr = petition[petition['월'] == 4]
petition_may = petition[petition['월'] == 5]
petition_jun = petition[petition['월'] == 6]
petition_jul = petition[petition['월'] == 7]