# *Data Cleaning of Google App Store Data*

Author: Pooja D \
Date: 04/May/2024 \
[Dataset Link](https://www.kaggle.com/datasets/lava18/google-play-store-apps/)

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('googleplaystore.csv')
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
df.isnull().sum()

App                  0
Category             1
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       0
Genres               1
Last Updated         0
Current Ver          8
Android Ver          2
dtype: int64

In [5]:
df[df['Category'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,,1.9,19,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


In [6]:
df['Category'].unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION', nan],
      dtype=object)

In [7]:
df[df['Category'] == 'PHOTOGRAPHY'].sort_values(by='Category')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2801,TouchNote: Cards & Gifts,PHOTOGRAPHY,4.1,19232,28M,"1,000,000+",Free,0,Everyone,Photography,"August 6, 2018",7.4.2,4.4 and up
6191,Background Changer & Eraser,PHOTOGRAPHY,4.0,2076,11M,"500,000+",Free,0,Teen,Photography,"May 21, 2018",2.5,4.1 and up
6184,Background Eraser,PHOTOGRAPHY,4.5,267378,1.9M,"10,000,000+",Free,0,Everyone,Photography,"July 11, 2015",1.4.6,4.0 and up
6178,Change photo background,PHOTOGRAPHY,3.8,28660,6.4M,"5,000,000+",Free,0,Everyone,Photography,"March 21, 2018",1.0.12,4.0.3 and up
6166,Auto Background Changer,PHOTOGRAPHY,4.0,35188,6.2M,"1,000,000+",Free,0,Everyone,Photography,"February 22, 2018",1.0.11,3.2 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2909,"Candy Camera - selfie, beauty camera, photo ed...",PHOTOGRAPHY,4.4,3368646,Varies with device,"100,000,000+",Free,0,Everyone,Photography,"July 16, 2018",4.47,4.0.3 and up
2908,Retrica,PHOTOGRAPHY,4.3,6120977,Varies with device,"100,000,000+",Free,0,Everyone,Photography,"June 28, 2018",6.1.0,Varies with device
2907,EyeEm - Camera & Photo Filter,PHOTOGRAPHY,4.2,215343,44M,"10,000,000+",Free,0,Everyone,Photography,"July 14, 2018",6.4.3,4.0.3 and up
2905,Camera MX - Free Photo & Video Camera,PHOTOGRAPHY,4.3,244371,Varies with device,"10,000,000+",Free,0,Everyone,Photography,"July 5, 2018",Varies with device,Varies with device


In [8]:
df['Category'] = df['Category'].fillna('PHOTOGRAPHY')

In [9]:
df['Category'].isnull().sum()

0

In [10]:
df[df['Type'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
9148,Command & Conquer: Rivals,FAMILY,,0,Varies with device,0,,0,Everyone 10+,Strategy,"June 28, 2018",Varies with device,Varies with device


In [11]:
df['Type'] = df['Type'].fillna('Free')

In [12]:
df["Type"].unique()

array(['Free', 'Paid'], dtype=object)

In [13]:
df[df['Genres'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,PHOTOGRAPHY,1.9,19,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


In [14]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [15]:
df['Genres'] = df['Genres'].fillna('photography')

In [16]:
df['Genres'].isnull().sum()

0

In [17]:
df[df['Android Ver'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4453,[substratum] Vacuum: P,PERSONALIZATION,4.4,230,11M,"1,000+",Paid,$1.49,Everyone,Personalization,"July 20, 2018",4.4,
4490,Pi Dark [substratum],PERSONALIZATION,4.5,189,2.1M,"10,000+",Free,0,Everyone,Personalization,"March 27, 2018",1.1,


In [18]:
df['App'].value_counts().get('[substratum] Vacuum: P', 0)

1

In [19]:
df['App'].value_counts().get('Pi Dark [substratum]', 0)

1

In [20]:
df.shape

(10841, 13)

In [21]:
df.duplicated().sum()

483

In [22]:
df = df.drop_duplicates()

In [23]:
df.shape

(10358, 13)

In [24]:
df.isnull().sum()

App                  0
Category             0
Rating            1465
Reviews              0
Size                 0
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          8
Android Ver          2
dtype: int64

In [25]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

# Numeric features in data
1.  Rating
2.  Reviews
3.  Size
4.  Installs
5.  Price

In [26]:
df['Size'].value_counts()

Size
Varies with device    1526
11M                    188
13M                    186
12M                    186
14M                    182
                      ... 
430k                     1
429k                     1
200k                     1
460k                     1
619k                     1
Name: count, Length: 461, dtype: int64

In [27]:
def convert_to_numeric(size):
    if 'M' in size:
        return size.replace('M', '')
    elif 'K' in size:
        return str(round(float(size.replace('K', '')) / 1024, 1))

# Apply the function to the 'Size' column
df['Size'] = df['Size'].apply(convert_to_numeric)

print(df['Size'])

0          19
1          14
2         8.7
3          25
4         2.8
         ... 
10836      53
10837     3.6
10838     9.5
10839    None
10840      19
Name: Size, Length: 10358, dtype: object


In [28]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [29]:
df['Size'].unique()

array(['19', '14', '8.7', '25', '2.8', '5.6', '29', '33', '3.1', '28',
       '12', '20', '21', '37', '2.7', '5.5', '17', '39', '31', '4.2',
       '7.0', '23', '6.0', '6.1', '4.6', '9.2', '5.2', '11', '24', None,
       '9.4', '15', '10', '1.2', '26', '8.0', '7.9', '56', '57', '35',
       '54', '3.6', '5.7', '8.6', '2.4', '27', '2.5', '16', '3.4', '8.9',
       '3.9', '2.9', '38', '32', '5.4', '18', '1.1', '2.2', '4.5', '9.8',
       '52', '9.0', '6.7', '30', '2.6', '7.1', '3.7', '22', '7.4', '6.4',
       '3.2', '8.2', '9.9', '4.9', '9.5', '5.0', '5.9', '13', '73', '6.8',
       '3.5', '4.0', '2.3', '7.2', '2.1', '42', '7.3', '9.1', '55', '6.5',
       '1.5', '7.5', '51', '41', '48', '8.5', '46', '8.3', '4.3', '4.7',
       '3.3', '40', '7.8', '8.8', '6.6', '5.1', '61', '66', '8.4', '44',
       '1.6', '6.2', '53', '1.4', '3.0', '5.8', '3.8', '9.6', '45', '63',
       '49', '77', '4.4', '4.8', '70', '6.9', '9.3', '10.0', '8.1', '36',
       '84', '97', '2.0', '1.9', '1.8', '5.3', '4

In [30]:
df['Size'] = df['Size'].replace('None', 0)

In [31]:
df['Size'] = df['Size'].fillna('0')

In [32]:
df['Size'].unique()

array(['19', '14', '8.7', '25', '2.8', '5.6', '29', '33', '3.1', '28',
       '12', '20', '21', '37', '2.7', '5.5', '17', '39', '31', '4.2',
       '7.0', '23', '6.0', '6.1', '4.6', '9.2', '5.2', '11', '24', '0',
       '9.4', '15', '10', '1.2', '26', '8.0', '7.9', '56', '57', '35',
       '54', '3.6', '5.7', '8.6', '2.4', '27', '2.5', '16', '3.4', '8.9',
       '3.9', '2.9', '38', '32', '5.4', '18', '1.1', '2.2', '4.5', '9.8',
       '52', '9.0', '6.7', '30', '2.6', '7.1', '3.7', '22', '7.4', '6.4',
       '3.2', '8.2', '9.9', '4.9', '9.5', '5.0', '5.9', '13', '73', '6.8',
       '3.5', '4.0', '2.3', '7.2', '2.1', '42', '7.3', '9.1', '55', '6.5',
       '1.5', '7.5', '51', '41', '48', '8.5', '46', '8.3', '4.3', '4.7',
       '3.3', '40', '7.8', '8.8', '6.6', '5.1', '61', '66', '8.4', '44',
       '1.6', '6.2', '53', '1.4', '3.0', '5.8', '3.8', '9.6', '45', '63',
       '49', '77', '4.4', '4.8', '70', '6.9', '9.3', '10.0', '8.1', '36',
       '84', '97', '2.0', '1.9', '1.8', '5.3', '47

In [33]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [34]:
df['Size'] = pd.to_numeric(df['Size'], errors='coerce')

In [35]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size              float64
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [36]:
df.sample(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
7319,CG Backgrounds,FAMILY,4.0,8,3.6,500+,Free,0,Everyone,Entertainment,"June 19, 2018",8.1,4.0.3 and up
6050,Millionaire : Who want to be?,GAME,4.4,4396,40.0,"100,000+",Free,0,Everyone,Trivia,"April 12, 2017",1.5,2.3 and up
5696,AV Tools,TOOLS,4.4,631,0.0,"100,000+",Free,0,Everyone,Tools,"September 22, 2016",Varies with device,Varies with device
1041,mobLee Events,EVENTS,,11,11.0,"5,000+",Free,0,Everyone,Events,"May 30, 2018",3.05.30.16,4.1 and up
8079,Ambient CX,BUSINESS,,0,7.5,5+,Free,0,Everyone,Business,"May 9, 2017",1.0,2.3.3 and up
5855,Ay Vamos - PJ. Balvin - Piano,GAME,,0,29.0,5+,Free,0,Everyone,Arcade,"July 9, 2018",1.0,4.1 and up
5759,Square Analog Clock AW-7,TOOLS,4.2,294,3.0,"50,000+",Free,0,Everyone,Tools,"March 10, 2018",2.0,4.3 and up
3413,Waterfall Live Wallpaper,PERSONALIZATION,4.1,112977,13.0,"10,000,000+",Free,0,Everyone,Personalization,"May 22, 2018",3.6,4.1 and up
4492,The Q - Live Trivia Game Network,GAME,2.7,1486,14.0,"100,000+",Free,0,Everyone,Trivia,"August 3, 2018",2.4.8,5.0 and up
6503,BNCR Token Celular,FINANCE,3.7,913,0.0,"100,000+",Free,0,Everyone,Finance,"July 20, 2012",1.0,1.5 and up


In [37]:
df['Price'] = pd.to_numeric(df['Price'].str.replace('$', ''))

In [38]:
df['Price'].unique()

array([  0.  ,   4.99,   3.99,   6.99,   1.49,   2.99,   7.99,   5.99,
         3.49,   1.99,   9.99,   7.49,   0.99,   9.  ,   5.49,  10.  ,
        24.99,  11.99,  79.99,  16.99,  14.99,   1.  ,  29.99,  12.99,
         2.49,  10.99,   1.5 ,  19.99,  15.99,  33.99,  74.99,  39.99,
         3.95,   4.49,   1.7 ,   8.99,   2.  ,   3.88,  25.99, 399.99,
        17.99, 400.  ,   3.02,   1.76,   4.84,   4.77,   1.61,   2.5 ,
         1.59,   6.49,   1.29,   5.  ,  13.99, 299.99, 379.99,  37.99,
        18.99, 389.99,  19.9 ,   8.49,   1.75,  14.  ,   4.85,  46.99,
       109.99, 154.99,   3.08,   2.59,   4.8 ,   1.96,  19.4 ,   3.9 ,
         4.59,  15.46,   3.04,   4.29,   2.6 ,   3.28,   4.6 ,  28.99,
         2.95,   2.9 ,   1.97, 200.  ,  89.99,   2.56,  30.99,   3.61,
       394.99,   1.26,   1.2 ,   1.04])

In [39]:
df['Installs'].unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+', '0'], dtype=object)

In [40]:
df['Installs'] = pd.to_numeric(df['Installs'].str.replace('+', '').str.replace(',', ''))


In [41]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size              float64
Installs            int64
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [42]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [43]:
df.to_csv('cleaned.csv')

In [44]:
df['Size'].unique()

array([ 19. ,  14. ,   8.7,  25. ,   2.8,   5.6,  29. ,  33. ,   3.1,
        28. ,  12. ,  20. ,  21. ,  37. ,   2.7,   5.5,  17. ,  39. ,
        31. ,   4.2,   7. ,  23. ,   6. ,   6.1,   4.6,   9.2,   5.2,
        11. ,  24. ,   0. ,   9.4,  15. ,  10. ,   1.2,  26. ,   8. ,
         7.9,  56. ,  57. ,  35. ,  54. ,   3.6,   5.7,   8.6,   2.4,
        27. ,   2.5,  16. ,   3.4,   8.9,   3.9,   2.9,  38. ,  32. ,
         5.4,  18. ,   1.1,   2.2,   4.5,   9.8,  52. ,   9. ,   6.7,
        30. ,   2.6,   7.1,   3.7,  22. ,   7.4,   6.4,   3.2,   8.2,
         9.9,   4.9,   9.5,   5. ,   5.9,  13. ,  73. ,   6.8,   3.5,
         4. ,   2.3,   7.2,   2.1,  42. ,   7.3,   9.1,  55. ,   6.5,
         1.5,   7.5,  51. ,  41. ,  48. ,   8.5,  46. ,   8.3,   4.3,
         4.7,   3.3,  40. ,   7.8,   8.8,   6.6,   5.1,  61. ,  66. ,
         8.4,  44. ,   1.6,   6.2,  53. ,   1.4,   3. ,   5.8,   3.8,
         9.6,  45. ,  63. ,  49. ,  77. ,   4.4,   4.8,  70. ,   6.9,
         9.3,   8.1,

In [45]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [46]:
df = df.rename(columns={'Size': 'Size (Mbs)'})
df = df.rename(columns={'Price': 'Price ($)'})

In [47]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size (Mbs)', 'Installs',
       'Type', 'Price ($)', 'Content Rating', 'Genres', 'Last Updated',
       'Current Ver', 'Android Ver'],
      dtype='object')

In [48]:
df = df.dropna(subset=['Rating'])

In [49]:
df.shape

(8893, 13)

In [50]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size (Mbs)        0
Installs          0
Type              0
Price ($)         0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       4
Android Ver       2
dtype: int64

In [53]:
df = df.dropna(subset=['Current Ver'])
df = df.dropna(subset=['Android Ver'])

In [54]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size (Mbs)        0
Installs          0
Type              0
Price ($)         0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [55]:
df.sample(34)

Unnamed: 0,App,Category,Rating,Reviews,Size (Mbs),Installs,Type,Price ($),Content Rating,Genres,Last Updated,Current Ver,Android Ver
6270,Sisense Mobile BI,BUSINESS,4.9,23,28.0,1000,Free,0.0,Everyone,Business,"April 30, 2018",2.4.1,4.1 and up
10114,Hey That's Pretty Good!,TOOLS,4.6,222,10.0,10000,Free,0.0,Everyone,Tools,"July 29, 2016",4.2.0,2.3 and up
3180,Cheap Flights & Hotels momondo,TRAVEL_AND_LOCAL,4.2,42546,24.0,5000000,Free,0.0,Everyone,Travel & Local,"April 25, 2018",7.6.5,4.4 and up
10058,Moonlight GO Weather EX,WEATHER,4.2,11510,3.5,1000000,Free,0.0,Everyone,Weather,"August 28, 2014",1.4,2.2 and up
1037,KudaGo - things to do in NY,EVENTS,4.4,4298,4.4,100000,Free,0.0,Everyone,Events,"April 27, 2017",2.2.2,4.0 and up
7069,Restaurantführer Südbaden,TRAVEL_AND_LOCAL,4.1,46,12.0,1000,Free,0.0,Everyone,Travel & Local,"January 11, 2017",2.0.0,4.0 and up
6001,bd's Mongolian Grill,LIFESTYLE,4.2,120,14.0,10000,Free,0.0,Everyone,Lifestyle,"February 23, 2018",1.5.1,4.1 and up
4297,Kpop Music Quiz (K-pop Game),FAMILY,4.2,6418,59.0,100000,Free,0.0,Everyone,Casual,"May 8, 2017",1.3,4.1 and up
5038,AF Link,NEWS_AND_MAGAZINES,4.7,210,18.0,10000,Free,0.0,Everyone,News & Magazines,"May 31, 2018",v4.29.0.9,4.4 and up
2962,MLB At Bat,SPORTS,4.2,82882,0.0,5000000,Free,0.0,Everyone,Sports,"July 30, 2018",Varies with device,Varies with device
