## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import scipy.stats
import dateparser
from datetime import datetime
from matplotlib import pyplot as plt

## Import data

In [2]:
# Read the Play Store CSV (path is relative to the working directory)
# and preview the first five rows.
apps = pd.read_csv(filepath_or_buffer="googleplaystore.csv")
apps.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## Display data type

In [3]:
# Build one summary record per column of `apps`:
#   [name, dtype, null count, null percentage, distinct count, example value]
datadesc = []
for col in apps.columns:
    series = apps[col]
    # Count missing values once and reuse it for the percentage.
    null_count = series.isnull().sum()
    datadesc.append([
        col,
        series.dtypes,
        null_count,
        round((null_count / len(apps)) * 100, 2),
        series.nunique(),
        # A fixed random_state keeps the example value stable across
        # kernel restarts, so the rendered table is reproducible.
        series.drop_duplicates().sample(random_state=0).values,
    ])

In [4]:
# Convert the list of per-column records into a table; each label below
# names the value at the same position inside a record.
summary_columns = [
    "Data Features",
    "Data Type",
    "Number of Null",
    "Percentage of Null Data",
    "Number of Unique",
    "Sample of Unique Value",
]
datadesc = pd.DataFrame(data=datadesc, columns=summary_columns)

In [5]:
# Show the per-column summary table as this cell's output.
datadesc

Unnamed: 0,Data Features,Data Type,Number of Null,Percentage of Null Data,Number of Unique,Sample of Unique Value
0,App,object,0,0.0,9660,[BV Mobile Apps]
1,Category,object,0,0.0,34,[FOOD_AND_DRINK]
2,Rating,float64,1474,13.6,40,[1.6]
3,Reviews,object,0,0.0,6002,[95201]
4,Size,object,0,0.0,462,[8.2M]
5,Installs,object,0,0.0,22,[500+]
6,Type,object,1,0.01,3,[Free]
7,Price,object,0,0.0,93,[$1.50]
8,Content Rating,object,1,0.01,6,[Everyone]
9,Genres,object,0,0.0,120,[House & Home]


## Dealing with data type

#### Change Reviews Datatype from Object to Int64

In [6]:
# Casting Reviews directly to int64 raises an error because one record
# holds the string '3.0M' in that column. Display the offending row.
has_bad_reviews = apps['Reviews'].eq('3.0M')
apps[has_bad_reviews]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [7]:
# Row 10472 appears to be shifted one column to the left: Category and every
# column to its right hold the value that belongs to the next column.
# Dropping the row was considered, since even after realignment two fields
# (Category and Genres) are still NaN. Here the values are moved back into
# their proper columns instead.
MISALIGNED_ROW = 10472
realigned_values = {
    'Category': np.nan,
    'Rating': 1.9,
    'Reviews': 19,
    'Size': "3.0M",
    # Keep the thousands separator so this value matches the "1,000+" style
    # used by the other Installs entries and does not form a stray category.
    'Installs': "1,000+",
    'Type': "Free",
    'Price': "0",
    'Content Rating': "Everyone",
    'Genres': np.nan,
    'Last Updated': "February 11, 2018",
    'Current Ver': "1.0.19",
    'Android Ver': "4.0 and up",
}
# Write each corrected value into its column, one cell at a time.
for column_name, value in realigned_values.items():
    apps.loc[MISALIGNED_ROW, column_name] = value

In [10]:
# Replace the Reviews column with its 64-bit integer version, then list the
# dtype of every column to confirm the change.
apps["Reviews"] = apps["Reviews"].astype(np.int64)
apps.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

#### Change Installs Datatype from Object to Int64

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3.1M,"10,000+",Free,0,Everyone,Art & Design;Creativity,"July 3, 2018",2.8,4.0.3 and up
17,350 Diy Room Decor Ideas,ART_AND_DESIGN,4.5,27,17M,"10,000+",Free,0,Everyone,Art & Design,"November 7, 2017",1.0,2.3 and up
25,Harley Quinn wallpapers HD,ART_AND_DESIGN,4.8,192,6.0M,"10,000+",Free,0,Everyone,Art & Design,"April 25, 2018",1.5,3.0 and up
28,Pencil Sketch Drawing,ART_AND_DESIGN,3.9,136,4.6M,"10,000+",Free,0,Everyone,Art & Design,"July 12, 2018",6.0,2.3 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10813,DICT.fr Mobile,BUSINESS,,20,2.7M,"10,000+",Free,0,Everyone,Business,"July 17, 2018",2.1.10,4.1 and up
10818,Gold Quote - Gold.fr,FINANCE,,96,1.5M,"10,000+",Free,0,Everyone,Finance,"May 19, 2016",2.3,2.2 and up
10824,Cardio-FR,MEDICAL,,67,82M,"10,000+",Free,0,Everyone,Medical,"July 31, 2018",2.2.2,4.4 and up
10828,Manga-FR - Anime Vostfr,COMICS,3.4,291,13M,"10,000+",Free,0,Everyone,Comics,"May 15, 2017",2.0.1,4.0 and up
