# *Data Cleaning of Google Play Store Data*

Author: Pooja D \
Date: 04/May/2024 \
Dataset Link: https://www.kaggle.com/datasets/lava18/google-play-store-apps/

In [11]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the Play Store CSV into a DataFrame and preview the first five rows
csv_path = 'googleplaystore.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [13]:
# Checking shape of dataset: a (number of rows, number of columns) tuple
df.shape

(10841, 13)

In [14]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

483

In [16]:
# Remove repeated rows, retaining the first occurrence of each
df = df.drop_duplicates(keep='first')

In [17]:
# Re-count duplicate rows; the total should now be zero
remaining_duplicates = df.duplicated()
remaining_duplicates.sum()

0

In [18]:
# Percentage of missing values in each column
null_fraction = df.isnull().sum() / len(df)
null_fraction * 100

App                0.000000
Category           0.009654
Rating            14.143657
Reviews            0.000000
Size               0.000000
Installs           0.000000
Type               0.009654
Price              0.000000
Content Rating     0.000000
Genres             0.009654
Last Updated       0.000000
Current Ver        0.077235
Android Ver        0.019309
dtype: float64

In [19]:
# Keep only the rows that have a Rating; rows with a null Rating are discarded
df = df.dropna(axis='index', how='any', subset=['Rating'])

In [21]:
# Show every row whose Category is missing
category_missing = df['Category'].isnull()
df[category_missing]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,,1.9,19,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


In [22]:
# List the distinct labels present in Category (NaN shows up as its own entry)
category_labels = df['Category']
category_labels.unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION', nan],
      dtype=object)

In [23]:
# Replace missing Category values with 'PHOTOGRAPHY'; other columns are untouched
df = df.fillna({'Category': 'PHOTOGRAPHY'})

In [27]:
# Show the rows that still lack a Genres value
df.loc[df['Genres'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,PHOTOGRAPHY,1.9,19,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


In [28]:
# Filling null value in Genres column.
# Genres labels in this dataset are Title Case (e.g. 'Art & Design'), so the
# fill value is written as 'Photography' rather than 'photography'; a lowercase
# label would form its own separate genre in any later groupby/value_counts.
df['Genres'] = df['Genres'].fillna('Photography')

In [29]:
# Re-check the per-column percentage of missing values
null_counts = df.isnull().sum()
null_counts / len(df) * 100

App               0.000000
Category          0.000000
Rating            0.000000
Reviews           0.000000
Size              0.000000
Installs          0.000000
Type              0.000000
Price             0.000000
Content Rating    0.000000
Genres            0.000000
Last Updated      0.000000
Current Ver       0.044979
Android Ver       0.022490
dtype: float64