# 구글 플레이스토어 데이터 전처리

In [23]:
# csv 에서 데이터를 읽어와 행렬 객체로 만들 수 있는 모듈
# 다양한 데이터 전처리
import pandas as pd
# Numpy는 과학 연산을 위한 라이브러리
# 다차원 배열과 이런 배열을 처리
import numpy as np
# 시각화 라이브러리
import matplotlib.pyplot as plt

### 데이터 준비

In [24]:
# Load the dataset from new_data.csv into a DataFrame.
# NOTE(review): the info() output further down lists an 'Unnamed: 0' column,
# which looks like a row index saved by an earlier to_csv call - confirm
# whether it should be read with index_col=0 instead.
modeldata = pd.read_csv('new_data.csv')

In [25]:
# Inspect column names, non-null counts and dtypes
modeldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 14 columns):
Unnamed: 0        8190 non-null int64
App               8190 non-null object
Category          8190 non-null object
Rating            8190 non-null float64
Reviews           8190 non-null int64
Size              8190 non-null object
Installs          8190 non-null int64
Type              8190 non-null object
Price             8190 non-null float64
Content Rating    8190 non-null object
Genres            8190 non-null object
Last Updated      8190 non-null object
Current Ver       8190 non-null object
Android Ver       8190 non-null object
dtypes: float64(2), int64(3), object(9)
memory usage: 895.9+ KB


In [26]:
# Encode each Category label as an integer code so the column can be used as
# a numeric group id. Codes are assigned in order of first appearance.
category_map = {}
for label in modeldata['Category']:
    # len() is evaluated before the insert, so an unseen label receives the
    # next free code; labels already present are left untouched.
    category_map.setdefault(label, len(category_map))

# Labels in code order: category_list[code] gives back the original label.
category_list = list(category_map)

print(category_map)

modeldata['Category'] = modeldata['Category'].map(category_map)

{'ART_AND_DESIGN': 0, 'AUTO_AND_VEHICLES': 1, 'BEAUTY': 2, 'BOOKS_AND_REFERENCE': 3, 'BUSINESS': 4, 'COMICS': 5, 'COMMUNICATION': 6, 'DATING': 7, 'EDUCATION': 8, 'ENTERTAINMENT': 9, 'EVENTS': 10, 'FINANCE': 11, 'FOOD_AND_DRINK': 12, 'HEALTH_AND_FITNESS': 13, 'HOUSE_AND_HOME': 14, 'LIBRARIES_AND_DEMO': 15, 'LIFESTYLE': 16, 'GAME': 17, 'FAMILY': 18, 'MEDICAL': 19, 'SOCIAL': 20, 'SHOPPING': 21, 'PHOTOGRAPHY': 22, 'SPORTS': 23, 'TRAVEL_AND_LOCAL': 24, 'TOOLS': 25, 'PERSONALIZATION': 26, 'PRODUCTIVITY': 27, 'PARENTING': 28, 'WEATHER': 29, 'VIDEO_PLAYERS': 30, 'NEWS_AND_MAGAZINES': 31, 'MAPS_AND_NAVIGATION': 32}


In [27]:
# Check that Category was converted to integer codes (first five rows)
modeldata.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,0,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,1,Coloring book moana,0,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",0,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,3,Sketch - Draw & Paint,0,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,4,Pixel Draw - Number Art Coloring Book,0,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [28]:
# size 전처리
def remove_m(x):
    """Convert a megabyte size label such as ``'19M'`` to a float.

    Args:
        x: Raw ``Size`` value. Normally a string; missing values
            (``None`` / ``NaN``) are tolerated.

    Returns:
        The number that remains after removing ``'M'`` as a float, or
        ``np.nan`` when the value is not a string or contains no ``'M'``
        (for example ``'Varies with device'``).

    Raises:
        ValueError: If the text left after removing ``'M'`` is not a number.
    """
    # A non-string cell (NaN/None) would make `'M' in x` raise TypeError;
    # treat it as a missing size instead of aborting the whole apply().
    if not isinstance(x, str) or 'M' not in x:
        return np.nan
    return float(x.replace('M', ''))

# Convert every Size label with remove_m, then preview the first five rows
modeldata['Size'] = modeldata['Size'].apply(remove_m)
modeldata.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,0,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,1,Coloring book moana,0,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",0,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,3,Sketch - Draw & Paint,0,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,4,Pixel Draw - Number Art Coloring Book,0,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [29]:
# Count the missing values in each column
modeldata.isna().sum()

Unnamed: 0           0
App                  0
Category             0
Rating               0
Reviews              0
Size              1424
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          0
Android Ver          0
dtype: int64

In [30]:
# Replace missing Size values with the mean of the known sizes.
# `!= np.nan` is True for every row (NaN never compares equal to anything),
# so the rows with a known size are selected with notna() instead.
tmp = modeldata[modeldata['Size'].notna()]
sizemean = tmp['Size'].mean()
# Assign the filled column back: an in-place fillna on a column selection can
# operate on a temporary copy and leave modeldata unchanged (pandas
# Copy-on-Write / chained-assignment behaviour).
modeldata['Size'] = modeldata['Size'].fillna(sizemean)

In [31]:
# Content Rating is also converted to integer codes, numbered in order of
# first appearance.
content_map = {}
for rating in modeldata['Content Rating']:
    # An unseen rating gets the next free code (current size of the map).
    content_map.setdefault(rating, len(content_map))

print(content_map)

modeldata['Content Rating'] = modeldata['Content Rating'].map(content_map)

{'Everyone': 0, 'Teen': 1, 'Everyone 10+': 2, 'Mature 17+': 3, 'Adults only 18+': 4, 'Unrated': 5}


In [32]:
# Columns removed before modelling:
#   - Type: same information as Price (paid vs. free)
#   - Genres: overlaps with Category
#   - Last Updated / Current Ver / Android Ver: not needed
drop_list = ['Type', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
modeldata = modeldata.drop(columns=drop_list)

In [33]:
# Re-check the remaining columns and their dtypes
modeldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 9 columns):
Unnamed: 0        8190 non-null int64
App               8190 non-null object
Category          8190 non-null int64
Rating            8190 non-null float64
Reviews           8190 non-null int64
Size              8190 non-null float64
Installs          8190 non-null int64
Price             8190 non-null float64
Content Rating    8190 non-null int64
dtypes: float64(3), int64(5), object(1)
memory usage: 575.9+ KB


### 필요한 데이터를 모두 int/float로 바꾸었으므로 예측 모델을 수립할 수 있는 조건 완성 !

In [34]:
# Pairwise correlation between the numeric columns. Non-numeric columns such
# as App are excluded explicitly: pandas >= 2.0 no longer drops them silently
# in corr() and raises on string data instead. select_dtypes keeps this
# working on older pandas versions as well.
modeldata.select_dtypes(include='number').corr()

Unnamed: 0.1,Unnamed: 0,Category,Rating,Reviews,Size,Installs,Price,Content Rating
Unnamed: 0,1.0,0.264716,-0.122101,-0.093405,-0.05962,-0.116254,-0.003319,-0.068979
Category,0.264716,1.0,-0.032198,0.002158,-0.037655,0.014403,-0.01748,-0.088391
Rating,-0.122101,-0.032198,1.0,0.055076,0.051827,0.040131,-0.021215,0.007854
Reviews,-0.093405,0.002158,0.055076,1.0,0.08532,0.62405,-0.007791,0.041133
Size,-0.05962,-0.037655,0.051827,0.08532,1.0,0.055491,-0.024086,0.162249
Installs,-0.116254,0.014403,0.040131,0.62405,0.055491,1.0,-0.009656,0.022131
Price,-0.003319,-0.01748,-0.021215,-0.007791,-0.024086,-0.009656,1.0,-0.014643
Content Rating,-0.068979,-0.088391,0.007854,0.041133,0.162249,0.022131,-0.014643,1.0


In [36]:
# Save the preprocessed data to final_data.csv.
# NOTE(review): to_csv is called with its defaults, so the row index is also
# written as an extra unnamed leading column - confirm whether downstream
# readers rely on it or whether index=False is wanted.
modeldata.to_csv('final_data.csv')