In [1]:
import numpy as np
import pandas as pd

In [2]:
import re

In [3]:
import time

In [4]:
# Load the raw Google Play Store listing export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='googleplaystore.csv')

In [5]:
### Exploratory Data Analysis & Data Cleaning ###

In [6]:
# Preview the first rows. Leaving the frame as the cell's last expression (instead of
# print) lets Jupyter render it as a table rather than wrapped plain text.
df.head()

                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Art & Design;Pretend 

In [7]:
# Count apps per category; the single '1.9' entry at the bottom looks like an error
df['Category'].value_counts()

FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
AUTO_AND_VEHICLES        85
LIBRARIES_AND_DEMO       85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
PARENTING                60
COMICS                   60
BEAUTY                   53
1.9                       1
Name: Category, dtype: int64

In [8]:
# Show the row whose Category is '1.9': it is clearly erroneous and has missing data,
# so it will be dropped.
df.loc[df['Category'] == '1.9']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [9]:
# Remove the erroneous row (the one whose Category is '1.9') and renumber the index.
# Filtering on the value instead of the hard-coded label 10472 keeps this cell idempotent:
# after the index is reset, re-running a label-based drop would delete a valid row.
df = df[df['Category'] != '1.9'].reset_index(drop=True)

In [10]:
# Raise the row display limit so the full list of genre combinations is printed.
pd.options.display.max_rows = 500
print(df['Genres'].value_counts())

Tools                                    842
Entertainment                            623
Education                                549
Medical                                  463
Business                                 460
Productivity                             424
Sports                                   398
Personalization                          392
Communication                            387
Lifestyle                                381
Finance                                  366
Action                                   365
Health & Fitness                         341
Photography                              335
Social                                   295
News & Magazines                         283
Shopping                                 260
Travel & Local                           257
Dating                                   234
Books & Reference                        231
Arcade                                   220
Simulation                               200
Casual    

In [11]:
# 'Genres' can hold several genres joined by ';' - split them into separate columns.
df_genres = df['Genres'].str.split(';', expand=True)

In [12]:
# Copy the first two split genre columns back onto the main frame.
for genre_position, genre_column in enumerate(('Genre_One', 'Genre_Two')):
    df[genre_column] = df_genres[genre_position]

In [13]:
# Distribution of apps across the content-rating labels.
df.loc[:, 'Content Rating'].value_counts()

Everyone           8714
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Unrated               2
Name: Content Rating, dtype: int64

In [14]:
# Binary flag: 1 when the size is reported as 'Varies with device', otherwise 0.
df['size_varies'] = df['Size'].eq('Varies with device').astype(int)

In [15]:
# Frequency of each primary genre.
df['Genre_One'].value_counts()

Tools                      843
Entertainment              667
Education                  645
Medical                    463
Business                   460
Productivity               424
Sports                     402
Personalization            392
Communication              388
Lifestyle                  383
Action                     382
Finance                    366
Health & Fitness           343
Photography                335
Social                     295
News & Magazines           283
Casual                     270
Shopping                   260
Travel & Local             258
Arcade                     237
Books & Reference          234
Dating                     234
Simulation                 218
Video Players & Editors    178
Puzzle                     167
Maps & Navigation          137
Food & Drink               127
Role Playing               123
Racing                     119
Educational                112
Strategy                   111
Adventure                   91
House & 

In [16]:
# Frequency of each secondary genre. The original assumption was that Genre_One and
# Genre_Two values never overlap, so their dummy columns would not need merging.
# NOTE(review): 'Education' appears in both the Genre_One and the Genre_Two counts, so the
# two value sets are not fully disjoint - confirm whether those dummies should be merged.
df['Genre_Two'].value_counts()

Action & Adventure    138
Education             116
Pretend Play           90
Brain Games            70
Music & Video          47
Creativity             37
Name: Genre_Two, dtype: int64

In [17]:
# Look at the raw size strings and how often each occurs.
df.loc[:, 'Size'].value_counts()

Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
15M                    184
17M                    160
19M                    154
26M                    149
16M                    149
25M                    143
20M                    139
21M                    138
10M                    136
24M                    136
18M                    133
23M                    117
22M                    114
29M                    103
27M                     97
28M                     95
30M                     84
33M                     79
3.3M                    77
37M                     76
35M                     72
31M                     70
2.9M                    69
2.5M                    68
2.3M                    68
2.8M                    65
3.4M                    65
32M                     63
34M                     63
3.7M                    63
3.9M                    62
3.8M                    62
4

In [18]:
# Multiplies 'Size' by 1000 for a trailing 'k' (kilobytes) and by 1000000 for a trailing
# 'M' (megabytes). Any other value (e.g. 'Varies with device' or a missing entry) becomes 0.
def _size_to_bytes(size):
    """Return the byte count for strings such as '8.7M' or '201k'; 0 for anything else."""
    multipliers = {'k': 1000, 'M': 1000000}
    if not isinstance(size, str) or size[-1:] not in multipliers:
        return 0
    # round() instead of a bare int(): a float product can land a hair below the intended
    # whole number, and int() alone would truncate it.
    return int(round(float(size[:-1]) * multipliers[size[-1]]))


# Series.map works on the values directly, so it does not depend on the index being 0..n-1
# the way positional df['Size'][i] look-ups inside a Python loop do.
df['size_in_bytes'] = df['Size'].map(_size_to_bytes)

In [19]:
# Mean size in bytes (still includes the 0 placeholders at this point).
df['size_in_bytes'].mean()

18152090.629151292

In [20]:
# Replace the 0 placeholders with a fixed 20,000,000 bytes (20M).
# This is a hand-adjusted value rather than the computed mean, because the 0's
# bias the original mean downwards.
df['size_in_bytes'] = df['size_in_bytes'].replace(to_replace=0, value=20000000)

In [21]:
# 'Size' and 'Genres' have both been expanded into derived columns, so drop the originals.
df = df.drop(['Size', 'Genres'], axis=1)

In [22]:
# Parse the textual update dates into proper datetimes.
df['Last Updated'] = df['Last Updated'].pipe(pd.to_datetime)

In [23]:
# Express every update date as whole days since the Unix epoch (1970-01-01), giving a plain
# integer feature that is easy to normalize with StandardScaler.
# Subtracting the epoch yields a real timedelta; newer pandas versions refuse to convert a
# datetime column directly with pd.to_timedelta.
df['Last_up_days'] = (df['Last Updated'] - pd.Timestamp('1970-01-01')).dt.days

In [24]:
# Strip the trailing '+' and the thousands separators, then store the counts as integers.
# regex=False makes both replacements literal; '+' is a regex metacharacter, and the default
# interpretation of the pattern differs between pandas versions.
df['Installs'] = (
    df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('int32')
)

In [25]:
# Note: You may want to switch Android Ver + Current Ver to Categoricals later if performance isn't good

In [26]:
# Keep only the leading "major" or "major.minor" number of a version string
# (e.g. '1.0.19' -> '1.0', '5' -> '5'). Text without any digit, such as
# 'Varies with device', does not match at all.
# Raw string: '\d' and '\.' are regex escapes, not Python string escapes. The optional
# '.minor' group also lets single-digit versions match, which the two-digit-minimum
# pattern '\d+\.*\d+' could not.
re_pat = re.compile(r'\d+(?:\.\d+)?')

In [27]:
# Flag the apps whose version is 'Varies with device' first, then overwrite that text
# with a '0.0' placeholder.
df['app_ver_varies'] = df['Current Ver'].eq('Varies with device').astype(int)
df['Current Ver'] = df['Current Ver'].str.replace('Varies with device', '0.0')


In [28]:
# Converts version types into simple decimal. Returns 0.0 if not able to turn to simple decimal
def matcher(row, pattern=None):
    """Extract the first version-like number from a raw version value.

    Parameters
    ----------
    row : str
        Raw version text, e.g. '1.0.19' or '4.0.3 and up'.
    pattern : compiled regular expression, optional
        Pattern to search with. When omitted, the notebook-level ``re_pat`` is used.

    Returns
    -------
    str or float
        The matched text (still a string), or the float 0.0 when ``row`` is not a
        string or contains no match.
    """
    if pattern is None:
        pattern = re_pat
    if not isinstance(row, str):
        # Missing values (NaN / None) cannot be searched; treat them as "no version".
        return 0.0
    result = re.search(pattern, row)
    if result is None:
        return 0.0
    return result.group(0)

In [29]:
# Fill missing versions with a '0.0' placeholder, then reduce every entry with matcher().
# No extra replacement of 'Varies with device' is needed here: that text contains no digits,
# so matcher() already returns 0.0 for it.
df['Current Ver'] = df['Current Ver'].fillna('0.0').apply(matcher)

In [30]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Last Updated,Current Ver,Android Ver,Genre_One,Genre_Two,size_varies,size_in_bytes,Last_up_days,app_ver_varies
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Everyone,2018-01-07,1.0,4.0.3 and up,Art & Design,,0,19000000,17538,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Everyone,2018-01-15,2.0,4.0.3 and up,Art & Design,Pretend Play,0,14000000,17546,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Everyone,2018-08-01,1.2,4.0.3 and up,Art & Design,,0,8700000,17744,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Teen,2018-06-08,0.0,4.2 and up,Art & Design,,0,25000000,17690,1
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Everyone,2018-06-20,1.1,4.4 and up,Art & Design,Creativity,0,2800000,17702,0
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Everyone,2017-03-26,1.0,2.3 and up,Art & Design,,0,5600000,17251,0
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Everyone,2018-04-26,1.1,4.0.3 and up,Art & Design,,0,19000000,17647,0
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Everyone,2018-06-14,6.1,4.2 and up,Art & Design,,0,29000000,17696,0
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Everyone,2017-09-20,2.9,3.0 and up,Art & Design,,0,33000000,17429,0
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Everyone,2018-07-03,2.8,4.0.3 and up,Art & Design,Creativity,0,3100000,17715,0


In [31]:
# Flag apps whose Android requirement is 'Varies with device', then fill missing values
# with '0.0' and reduce every entry with matcher().
df['and_ver_varies'] = df['Android Ver'].eq('Varies with device').astype(int)
df['Android Ver'] = df['Android Ver'].fillna('0.0').apply(matcher)

In [32]:
# Both version columns now hold number-like values; cast them to float in one step.
df = df.astype({'Current Ver': float, 'Android Ver': float})

In [33]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Last Updated,Current Ver,Android Ver,Genre_One,Genre_Two,size_varies,size_in_bytes,Last_up_days,app_ver_varies,and_ver_varies
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Everyone,2018-01-07,1.000000,4.0,Art & Design,,0,19000000,17538,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Everyone,2018-01-15,2.000000,4.0,Art & Design,Pretend Play,0,14000000,17546,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Everyone,2018-08-01,1.200000,4.0,Art & Design,,0,8700000,17744,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Teen,2018-06-08,0.000000,4.2,Art & Design,,0,25000000,17690,1,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Everyone,2018-06-20,1.100000,4.4,Art & Design,Creativity,0,2800000,17702,0,0
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Everyone,2017-03-26,1.000000,2.3,Art & Design,,0,5600000,17251,0,0
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Everyone,2018-04-26,1.100000,4.0,Art & Design,,0,19000000,17647,0,0
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Everyone,2018-06-14,6.100000,4.2,Art & Design,,0,29000000,17696,0,0
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Everyone,2017-09-20,2.900000,3.0,Art & Design,,0,33000000,17429,0,0
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Everyone,2018-07-03,2.800000,4.0,Art & Design,Creativity,0,3100000,17715,0,0


In [34]:
# Cap 'Current Ver' at 20. Not every versioning scheme counts up by ones, but this
# removes the major outliers.
df['Current Ver'] = df['Current Ver'].clip(upper=20)

In [35]:
# Strip the currency sign and store prices as floats.
# regex=False makes the replacement literal; as a regular expression '$' is the
# end-of-string anchor, and the default interpretation differs between pandas versions.
df['Price'] = df['Price'].str.replace('$', '', regex=False).astype(float)

In [36]:
### Feature Engineering ###

In [37]:
# Keep only rows that have a Rating, and renumber the index afterwards.
df = df[df['Rating'].notna()].reset_index(drop=True)

In [38]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Last Updated,Current Ver,Android Ver,Genre_One,Genre_Two,size_varies,size_in_bytes,Last_up_days,app_ver_varies,and_ver_varies
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0.00,Everyone,2018-01-07,1.000000,4.0,Art & Design,,0,19000000,17538,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0.00,Everyone,2018-01-15,2.000000,4.0,Art & Design,Pretend Play,0,14000000,17546,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0.00,Everyone,2018-08-01,1.200000,4.0,Art & Design,,0,8700000,17744,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0.00,Teen,2018-06-08,0.000000,4.2,Art & Design,,0,25000000,17690,1,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0.00,Everyone,2018-06-20,1.100000,4.4,Art & Design,Creativity,0,2800000,17702,0,0
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0.00,Everyone,2017-03-26,1.000000,2.3,Art & Design,,0,5600000,17251,0,0
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0.00,Everyone,2018-04-26,1.100000,4.0,Art & Design,,0,19000000,17647,0,0
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0.00,Everyone,2018-06-14,6.100000,4.2,Art & Design,,0,29000000,17696,0,0
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0.00,Everyone,2017-09-20,2.900000,3.0,Art & Design,,0,33000000,17429,0,0
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0.00,Everyone,2018-07-03,2.800000,4.0,Art & Design,Creativity,0,3100000,17715,0,0


In [39]:

# Split the cleaned frame into the target columns, the numerical features and the
# categorical features.
target_columns = ['Rating', 'App']
numerical_columns = ['Reviews', 'Installs', 'Price', 'size_in_bytes',
                     'Last_up_days', 'Android Ver', 'Current Ver']
categorical_columns = ['Category', 'Type', 'Content Rating',
                       'Genre_One', 'Genre_Two']

y = df[target_columns]
df_numerical = df[numerical_columns]
df_categorical = df[categorical_columns]

In [40]:
# One-hot encode the categorical features, dropping the first level of each column.
df_categorical_dummies = pd.get_dummies(
    data=df_categorical,
    prefix_sep='_',
    drop_first=True,
)

In [41]:
# List the names of the generated dummy columns.
df_categorical_dummies.columns

Index(['Category_AUTO_AND_VEHICLES', 'Category_BEAUTY',
       'Category_BOOKS_AND_REFERENCE', 'Category_BUSINESS', 'Category_COMICS',
       'Category_COMMUNICATION', 'Category_DATING', 'Category_EDUCATION',
       'Category_ENTERTAINMENT', 'Category_EVENTS', 'Category_FAMILY',
       'Category_FINANCE', 'Category_FOOD_AND_DRINK', 'Category_GAME',
       'Category_HEALTH_AND_FITNESS', 'Category_HOUSE_AND_HOME',
       'Category_LIBRARIES_AND_DEMO', 'Category_LIFESTYLE',
       'Category_MAPS_AND_NAVIGATION', 'Category_MEDICAL',
       'Category_NEWS_AND_MAGAZINES', 'Category_PARENTING',
       'Category_PERSONALIZATION', 'Category_PHOTOGRAPHY',
       'Category_PRODUCTIVITY', 'Category_SHOPPING', 'Category_SOCIAL',
       'Category_SPORTS', 'Category_TOOLS', 'Category_TRAVEL_AND_LOCAL',
       'Category_VIDEO_PLAYERS', 'Category_WEATHER', 'Type_Paid',
       'Content Rating_Everyone', 'Content Rating_Everyone 10+',
       'Content Rating_Mature 17+', 'Content Rating_Teen',
       'Con

In [42]:
# Preview only the first rows of the numerical features instead of the whole frame.
df_numerical.head(10)

Unnamed: 0,Reviews,Installs,Price,size_in_bytes,Last_up_days,Android Ver,Current Ver
0,159,10000,0.00,19000000,17538,4.0,1.000000
1,967,500000,0.00,14000000,17546,4.0,2.000000
2,87510,5000000,0.00,8700000,17744,4.0,1.200000
3,215644,50000000,0.00,25000000,17690,4.2,0.000000
4,967,100000,0.00,2800000,17702,4.4,1.100000
5,167,50000,0.00,5600000,17251,2.3,1.000000
6,178,50000,0.00,19000000,17647,4.0,1.100000
7,36815,1000000,0.00,29000000,17696,4.2,6.100000
8,13791,1000000,0.00,33000000,17429,3.0,2.900000
9,121,10000,0.00,3100000,17715,4.0,2.800000


In [43]:
# Standard-scaled below: size_in_bytes, Last_up_days.
# Left unscaled: Installs, Reviews, Price.
# 'Android Ver' and 'Current Ver' are not selected into either group here
# (leaving out Current Ver was noted as an experiment).
scaled_columns = ['size_in_bytes', 'Last_up_days']
unscaled_columns = ['Installs', 'Reviews', 'Price']
df_num_scaled = df_numerical[scaled_columns]
df_num_unscaled = df_numerical[unscaled_columns]

In [44]:
from sklearn.preprocessing import StandardScaler

In [45]:
# Scaler that centres each column on its mean and scales it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [46]:
# Learn the scaling statistics from the selected columns and apply them to the same data.
num_scaled = scaler.fit(df_num_scaled).transform(df_num_scaled)

In [47]:
# Wrap the scaled array back into a DataFrame. Reusing the source frame's index (not only
# its column names) keeps the rows label-aligned with the other feature frames when they
# are later combined with pd.concat(axis=1), which aligns on the index.
df_num_scaled_final = pd.DataFrame(
    num_scaled,
    columns=df_num_scaled.columns,
    index=df_num_scaled.index,
)

In [48]:
# Assemble the feature matrix: dummies, scaled numericals and unscaled numericals side by side.
feature_blocks = [df_categorical_dummies, df_num_scaled_final, df_num_unscaled]
X = pd.concat(feature_blocks, axis='columns')

In [52]:
# Column vector of the ratings. y holds both 'Rating' and 'App', so reshaping the whole
# frame to (-1, 1) would interleave ratings with app names; select 'Rating' first.
y_shape = y['Rating'].values.reshape(-1, 1)

In [53]:
# Put the features and the target columns side by side for export.
df_out = pd.concat(objs=[X, y], axis='columns')

In [54]:
# Write the processed data set to disk without the index column.
df_out.to_csv(path_or_buf='processed play_store data.csv', index=False)