In [2]:
import pandas as pd
import numpy as np
import re

In [27]:
# Read the Play Store spreadsheet into a DataFrame and preview its first rows.
df = pd.read_excel(io='GooglePlaystore.xlsx')
df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,2018-01-07 00:00:00,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,2018-01-15 00:00:00,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,2018-08-01 00:00:00,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,2018-06-08 00:00:00,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,2018-06-20 00:00:00,1.1,4.4 and up


In [28]:
# Keep only the rows whose 'Reviews' value is not the text '3.0M',
# then show how many rows are left.
df = df.loc[lambda frame: frame['Reviews'].ne('3.0M')]
df.shape[0]

10840

In [29]:
# Delete rows where any column == 'Varies with device'
# .copy() makes the filtered result an independent DataFrame. Later cells
# assign whole columns on `df`; doing that on a boolean-filtered slice makes
# pandas emit SettingWithCopyWarning.
df = df[~df.isin(['Varies with device']).any(axis=1)].copy()
len(df)

9059

In [30]:
# Reduce each Android version string to its first major.minor number (as a float)
def clean_ver(ver):
    """Extract the first ``major.minor`` number from a version label.

    Parameters
    ----------
    ver : object
        Raw version value, e.g. ``'4.0.3 and up'``. It is passed through
        ``str()`` first, so non-string values such as NaN are accepted.

    Returns
    -------
    float
        The first ``<digits>.<digits>`` match converted to float
        (``'4.0.3 and up'`` gives ``4.0``), or ``np.nan`` when the text
        contains no such match.
    """
    # Raw string literal: '\d' and '\.' are not valid Python string escapes,
    # so a plain literal triggers an invalid-escape-sequence warning.
    match = re.search(r'\d+\.\d+', str(ver))
    if match:
        return float(match.group())
    return np.nan

# Replace every 'Android Ver' entry with its cleaned numeric form.
df['Android Ver'] = df['Android Ver'].map(clean_ver)

In [31]:
# Preview the first five rows to check the cleaned 'Android Ver' column.
df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,2018-01-07 00:00:00,1.0.0,4.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,2018-01-15 00:00:00,2.0.0,4.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,2018-08-01 00:00:00,1.2.4,4.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,2018-06-20 00:00:00,1.1,4.4
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,2017-03-26 00:00:00,1,2.3


In [32]:
# Convert installs to int and drop other rows
def clean_installs(installs):
    """Convert an install-count label such as ``'10,000+'`` to an int.

    Parameters
    ----------
    installs : object
        Raw value. It is passed through ``str()`` first, so values that are
        already numeric (or missing) do not raise ``TypeError``; this keeps
        the function safe to apply again to an already-cleaned column.

    Returns
    -------
    int or float
        The integer obtained after removing every ``,`` and ``+``, or
        ``np.nan`` when the remaining text is not a valid integer.
    """
    # Raw string, and '+' needs no escaping inside a character class; the
    # previous plain literal '[,\+]' contained an invalid string escape.
    digits = re.sub(r'[,+]', '', str(installs))
    try:
        return int(digits)
    except ValueError:
        return np.nan

# Parse every 'Installs' entry, then drop the rows that failed to parse.
df['Installs'] = df['Installs'].apply(clean_installs)
# NaN never compares equal to anything, so `df['Installs'] != np.nan` is True
# for every row and filters nothing; dropna() is what actually removes them.
# A column that held NaN is float, so cast back to integers once the NaN
# rows are gone.
df = df.dropna(subset=['Installs']).astype({'Installs': 'int64'})

In [38]:
# Fill missing ratings and drop unrated unpopular apps
# A row is kept when the app is popular (at least 50,000 installs or at least
# 100 reviews) or already has a rating; only rows that are both unrated and
# unpopular are removed.
is_popular = (df['Installs'] >= 50000) | (df['Reviews'] >= 100)
is_rated = df['Rating'].notna()
# .copy() gives an independent frame so the column assignment below does not
# write to a filtered slice (which makes pandas emit SettingWithCopyWarning).
df = df[is_popular | is_rated].copy()
# The remaining missing ratings get the mean of the known ratings, rounded
# to two decimals.
df['Rating'] = df['Rating'].fillna(round(df['Rating'].mean(), 2))

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
16,Photo Designer - Write your name with shapes,ART_AND_DESIGN,4.7,3632,5.5M,500000,Free,0,Everyone,Art & Design,2018-07-31 00:00:00,3.1,4.1
17,350 Diy Room Decor Ideas,ART_AND_DESIGN,4.5,27,17M,10000,Free,0,Everyone,Art & Design,2017-11-07 00:00:00,1,2.3
18,FlipaClip - Cartoon animation,ART_AND_DESIGN,4.3,194216,39M,5000000,Free,0,Everyone,Art & Design,2018-08-03 00:00:00,2.2.5,4.0
19,ibis Paint X,ART_AND_DESIGN,4.6,224399,31M,10000000,Free,0,Everyone,Art & Design,2018-07-30 00:00:00,5.5.4,4.1
20,Logo Maker - Small Business,ART_AND_DESIGN,4.0,450,14M,100000,Free,0,Everyone,Art & Design,2018-04-20 00:00:00,4,4.1
21,Boys Photo Editor - Six Pack & Men's Suit,ART_AND_DESIGN,4.1,654,12M,100000,Free,0,Everyone,Art & Design,2018-03-20 00:00:00,1.1,4.0
22,Superheroes Wallpapers | 4K Backgrounds,ART_AND_DESIGN,4.7,7699,4.2M,500000,Free,0,Everyone 10+,Art & Design,2018-07-12 00:00:00,2.2.6.2,4.0
24,HD Mickey Minnie Wallpapers,ART_AND_DESIGN,4.7,118,23M,50000,Free,0,Everyone,Art & Design,2018-07-07 00:00:00,1.1.3,4.1
25,Harley Quinn wallpapers HD,ART_AND_DESIGN,4.8,192,6.0M,10000,Free,0,Everyone,Art & Design,2018-04-25 00:00:00,1.5,3.0
26,Colorfit - Drawing & Coloring,ART_AND_DESIGN,4.7,20260,25M,500000,Free,0,Everyone,Art & Design;Creativity,2017-10-11 00:00:00,1.0.8,4.0
