In [23]:
import pandas as pd
import numpy as np
import re

In [24]:
# Load the raw dataset from the Excel workbook (path is relative to the
# notebook's working directory).
df = pd.read_excel('GooglePlaystore.xlsx')

#### PART 1: PREPROCESSING

In [25]:
# Drop the record whose 'Reviews' cell holds the literal string '3.0M'
df = df[df['Reviews'].ne('3.0M')]

# Discard every record in which at least one column reads 'Varies with device'
df = df[~df.isin(['Varies with device']).any(axis='columns')]

# Reduce an Android version entry to its leading major[.minor] number
def clean_ver(ver):
    """Return the first ``major[.minor]`` number found in ``ver`` as a float.

    ``ver`` is converted with ``str()`` first, so non-string cells (numbers,
    NaN) are accepted. Only the first two version components are kept, e.g.
    ``'4.0.3 and up'`` -> ``4.0``. A version without a minor part
    (``'8 and up'``) yields ``8.0``.

    Returns ``np.nan`` when the text contains no digits at all.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a normal str literal.
    # The minor part is optional so integer-only versions are not lost.
    match = re.search(r'\d+(?:\.\d+)?', str(ver))
    if match:
        return float(match.group())
    return np.nan

# Replace every 'Android Ver' entry with the value clean_ver returns for it.
df['Android Ver'] = df['Android Ver'].apply(clean_ver)

# Convert an installs entry to int; unparseable entries become NaN
def clean_installs(installs):
    """Parse an install count such as ``'10,000+'`` into an ``int``.

    Commas and plus signs are removed before conversion. The input is
    converted with ``str()`` first, so cells that are already numeric (or
    missing) do not make ``re.sub`` raise ``TypeError``.

    Returns ``np.nan`` when the cleaned text is not a valid integer.
    """
    # Raw string; '+' needs no escaping inside a character class.
    digits = re.sub(r'[,+]', '', str(installs))
    try:
        return int(digits)
    except ValueError:
        return np.nan

df['Installs'] = df['Installs'].apply(clean_installs)
# NaN never compares equal to anything (not even itself), so the previous
# `!= np.nan` test was True for every row and dropped nothing. notna() is
# the correct way to discard the rows whose install count is NaN.
df = df[df['Installs'].notna()]

# Keep every rated app, plus unrated apps that reach at least 50000 installs
# or at least 100 reviews; the remaining (unrated, unpopular) rows are dropped.
df = df[df['Rating'].notna() | df['Installs'].ge(50000) | df['Reviews'].ge(100)]

# Impute the ratings still missing with the mean rating rounded to 2 decimals
df['Rating'] = df['Rating'].fillna(value=round(df['Rating'].mean(), 2))

# Turn a size entry such as '19M' or '201k' into a plain number
def clean_size(size):
    """Scale a size string by its unit suffix and return it as a float.

    The first ``<number><suffix>`` pair found in ``size`` is used, where the
    suffix is ``M`` (multiply by 10**6) or ``K`` (multiply by 10**3), in
    either case. For example ``'19M'`` -> ``19000000.0``.

    Anything that cannot be interpreted this way is returned unchanged: a
    non-string input, a string without a number, or a number that is not
    followed by ``M``/``K``.
    """
    if not isinstance(size, str):
        # re.search would raise TypeError on numeric or missing cells.
        return size
    # Only the suffixes the lookup table knows are matched. A generic '\w'
    # would also capture a digit ('100' -> num '10', exp '0') or an unknown
    # letter, which then fails with KeyError in the table lookup.
    match = re.search(r"(?P<num>\d*\.?\d+)(?P<exp>[MmKk])", size)
    if match is None:
        return size
    exponents = {'M': 6, 'K': 3}
    num = float(match.group('num'))
    exp = exponents[match.group('exp').upper()]
    return num * (10 ** exp)

# Replace every 'Size' entry with the value clean_size returns for it.
df['Size'] = df['Size'].apply(clean_size)

#### PART 2: ANALYSIS

In [56]:
# Ratings data by category: one row of describe() statistics per category,
# in order of first appearance of each category.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic, so collect plain dict records and build the frame once.
stats = pd.DataFrame([
    {
        'category': category,
        **df.loc[df['Category'] == category, 'Rating'].describe().to_dict(),
    }
    for category in df['Category'].unique()
])
stats

Unnamed: 0,category,count,mean,std,min,25%,50%,75%,max
0,ART_AND_DESIGN,59.0,4.355424,0.366146,3.2,4.1,4.4,4.7,5.0
1,AUTO_AND_VEHICLES,62.0,4.146774,0.569935,2.1,3.9,4.25,4.5,4.9
2,BEAUTY,40.0,4.28275,0.357656,3.1,4.075,4.3,4.525,4.9
3,BOOKS_AND_REFERENCE,147.0,4.317075,0.453584,2.7,4.1,4.4,4.6,5.0
4,BUSINESS,246.0,4.11939,0.662594,1.0,3.825,4.3,4.5,5.0
5,COMICS,49.0,4.15449,0.550368,2.8,3.8,4.4,4.6,5.0
6,COMMUNICATION,210.0,4.101286,0.502646,1.0,3.9,4.2,4.4,5.0
7,DATING,174.0,3.959023,0.664399,1.0,3.6,4.1,4.4,5.0
8,EDUCATION,110.0,4.390636,0.255279,3.5,4.225,4.4,4.6,4.9
9,ENTERTAINMENT,86.0,4.145349,0.291308,3.0,4.0,4.2,4.3,4.7
