In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


In [2]:
# import and drop duplicate
df = pd.read_csv('data/googleplaystore.csv', ).drop_duplicates()

# rename columns to lower case, underscored space
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# drop version columns, won't be needed in regression
df = df.iloc[:, :-2]

In [3]:
df.head(1).T

Unnamed: 0,0
app,Photo Editor & Candy Camera & Grid & ScrapBook
category,ART_AND_DESIGN
rating,4.1
reviews,159
size,19M
installs,"10,000+"
type,Free
price,0
content_rating,Everyone
genres,Art & Design


In [4]:
# find any null rows
null_cols = df.isna().any()[df.isna().any().values] # just drop them

# drop null rows
df = df.dropna()

In [5]:
# clean column App, remove duplicates
dup_apps = df.app.value_counts()[df.app.value_counts() > 1].index
print('Duplicate app names: ', dup_apps)
df = df.sort_values(by='last_updated').drop_duplicates('app', keep='last')

Duplicate app names:  Index(['ROBLOX', '8 Ball Pool', 'Zombie Catchers', 'Helix Jump',
       'Bubble Shooter', 'Bowmasters', 'Temple Run 2',
       'Duolingo: Learn Languages Free', 'Granny', 'slither.io',
       ...
       'Truecaller: Caller ID, SMS spam blocking & Dialer', 'Fallout Shelter',
       'Booking.com Travel Deals', 'Facebook Lite', 'Farming Simulator 14',
       'YouTube', 'NOOK App for NOOK Devices',
       'Any.do: To-do list, Calendar, Reminders & Planner',
       'Home Security Camera WardenCam - reuse old phones',
       'Zomato - Restaurant Finder and Food Delivery App'],
      dtype='object', length=521)


In [6]:
# standardize size column and conver to float

def covert2mb(val):
    val = val.lower()
    unit = val[-1]
    if 'm' == unit:
        return float(val.replace('m',''))
    elif 'k' == unit:
        return float(val.replace('k','')) / 1024
    else:
        return 0  # to be removed from df

df['app_size'] = df['size'].apply(covert2mb)


In [7]:
# remove size 'varies by device'
df = df[df['app_size']>0]
# drop size column
df = df.drop(columns='size')

In [8]:
# convert installs from interval to integer
df.installs = df.installs.str.replace('[\,|\+]+', '').astype('int')

In [31]:
# convert reviews to integer
df.reviews = df.reviews.astype('int')

In [9]:
# convert price to float
df[df.price != '0'].price.value_counts()
df['price'] = df.price.str.replace('$','').astype(float)

In [10]:
# clean cateogry
df.category.value_counts()
df.genres.value_counts().head(50)

df['category'] = df['category'].str.title()


In [11]:
# encode last_updated date to number

from datetime import datetime

df['last_updated'] = pd.to_datetime(df.last_updated)
def date2num(val):
  start = datetime(2010,1,1)
  diff = val - start
  return diff.days

df['last_updated_encoded'] = df.last_updated.apply(date2num)


In [12]:
df.last_updated_encoded.describe()
df = df.drop(columns='last_updated')

In [13]:
# drop genre and name because too many categories
df = df.drop(columns=['app', 'genres'])

In [14]:
df.head()

Unnamed: 0,category,rating,reviews,installs,type,price,content_rating,app_size,last_updated_encoded
9116,Health_And_Fitness,4.1,131,5000,Free,0.0,Everyone,3.4,2282
3304,Tools,3.9,12388,1000000,Free,0.0,Everyone,5.3,2282
8901,Communication,4.3,1133539,10000000,Free,0.0,Everyone,4.7,2282
10589,Travel_And_Local,4.3,100,5000,Free,0.0,Everyone,13.0,2647
7834,Game,4.3,8668,500000,Free,0.0,Teen,9.1,2647


In [33]:
# log transform dataset
df[['installs','reviews']] = np.log(df[['installs','reviews']])

In [41]:
# lump categories together
category_lump = {'Tools':[ 'Auto_And_Vehicles',  'Weather', 'Tools',  'Productivity', 'Personalization', 'Medical',  'Maps_And_Navigation',  'Libraries_And_Demo']
           ,'Entertainment':['Video_Players', 'Sports', 'Photography', 'Entertainment',  'Game','Art_And_Design', 'Comics', 'Events']
           ,'Lifestyle': ['Health_And_Fitness', 'Beauty','Travel_And_Local','Shopping','Lifestyle','Parenting','Food_And_Drink' ,'Family','House_And_Home',  'Libraries_And_Demo']
           ,'Social':['Social','Dating', 'Communication']
           ,'Education/Business':[ 'Books_And_Reference', 'Business', 'Education', 'News_And_Magazines','Finance']
           }
cat_assign = {cat: key for key, cat_ls in category_lump.items() for cat in cat_ls}

df['category'] = df.category.apply(lambda x : cat_assign[x])

In [43]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2,)
train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)