In [23]:
import pandas as pd
import numpy as np
import re

In [24]:
# Read the raw Google Play Store workbook into the working DataFrame.
df = pd.read_excel(io='GooglePlaystore.xlsx')

#### PART 1: PREPROCESSING

In [25]:
# Two row filters, applied in order:
#   1. drop rows whose 'Reviews' cell is the literal string '3.0M';
#   2. drop rows where any column holds 'Varies with device'.
df = (
    df
    .loc[lambda frame: frame['Reviews'].ne('3.0M')]
    .loc[lambda frame: ~frame.isin(['Varies with device']).any(axis=1)]
)

# Strip trailing non-numeric characters from Android version
def clean_ver(ver):
    """Extract the leading numeric Android version from ``ver`` as a float.

    Only the first ``major`` or ``major.minor`` number found is kept, so
    ``'4.0.3 and up'`` becomes ``4.0`` and ``'4.1 - 7.1.1'`` becomes ``4.1``.

    Args:
        ver: Raw cell value. It is passed through ``str()`` before matching,
            so non-string values (including NaN) are accepted.

    Returns:
        The version as a ``float``, or ``np.nan`` when ``ver`` contains no
        digits.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    # The minor part is optional so integer-only versions ('8 and up') parse
    # instead of silently turning into NaN.
    match = re.search(r'\d+(?:\.\d+)?', str(ver))
    if match:
        return float(match.group())
    return np.nan

# Replace each raw 'Android Ver' entry with the value clean_ver returns for it.
df['Android Ver'] = df['Android Ver'].map(clean_ver)

# Convert installs to int and drop other rows
def clean_installs(installs):
    """Parse an install-count cell such as ``'10,000+'`` into an ``int``.

    Args:
        installs: Raw cell value. Strings have ``,`` and ``+`` removed before
            conversion; values that are already integers are passed through.

    Returns:
        The count as an ``int``, or ``np.nan`` when the value is not a
        string/integer or the cleaned string is not a valid integer.
    """
    # Cells that were already read as integers need no text clean-up.
    if isinstance(installs, (int, np.integer)):
        return int(installs)
    # Anything else that is not text (NaN, None, ...) would make re.sub raise
    # TypeError, which the ValueError handler below does not cover.
    if not isinstance(installs, str):
        return np.nan
    try:
        return int(re.sub(r'[,+]', '', installs))
    except ValueError:
        return np.nan

# Parse every 'Installs' value; clean_installs yields NaN for unparseable ones.
df['Installs'] = df['Installs'].apply(clean_installs)
# NaN never compares equal to anything (itself included), so `!= np.nan` is
# True for every row and drops nothing. notna() removes the unparseable rows;
# once they are gone the column can be stored as integers.
df = df[df['Installs'].notna()].astype({'Installs': 'int64'})

# Fill missing ratings and drop unrated unpopular apps
# Keep a row when it has at least 50000 installs, or at least 100 reviews,
# or a rating that is present; every other row is dropped.
df = df.loc[
    lambda frame: frame['Installs'].ge(50000)
    | frame['Reviews'].ge(100)
    | frame['Rating'].notna()
]
# Remaining missing ratings take the mean of the surviving ratings,
# rounded to two decimals.
df['Rating'] = df['Rating'].fillna(
    value=round(df['Rating'].mean(), 2)
)

# Convert size strings such as '19M' or '201k' into a number of bytes (float); other values are left unchanged
def clean_size(size):
    """Convert a size string such as ``'19M'`` or ``'201k'`` to bytes.

    The first number directly followed by ``K``/``M`` (either case) is
    scaled by 10**3 or 10**6 respectively.

    Args:
        size: Raw cell value.

    Returns:
        The size in bytes as a ``float`` when a number with a ``K``/``M``
        suffix is found; otherwise ``size`` itself, unchanged.
    """
    # re.search only accepts text; pass NaN and numeric cells through untouched.
    if not isinstance(size, str):
        return size
    # Raw string avoids invalid escape sequences. The unit is limited to K/M:
    # with '\w' a unit-less value like '100' matched as num='10', exp='0' and
    # the multiplier lookup below raised KeyError.
    match = re.search(r"(?P<num>\d*\.?\d+)(?P<exp>[kKmM])", size)
    if match is None:
        return size
    exponents = {'M': 6, 'K': 3}
    num = float(match.group('num'))
    exp = exponents[match.group('exp').upper()]
    return num * (10 ** exp)

# Replace each raw 'Size' entry with the value clean_size returns for it.
df['Size'] = df['Size'].map(clean_size)

#### PART 2: ANALYSIS