# work on gplay

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
#To load dataset
df=pd.read_csv("gplay.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone


In [3]:
#First remove unwanted column:0 permanently  from dataframe df
df.drop("Unnamed: 0",axis=1,inplace=True)
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating'],
      dtype='object')

In [4]:
#check null values
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
dtype: int64

In [5]:
#Find the % of null values
df.isnull().sum()*100/df.shape[0]

App                0.000000
Category           0.000000
Rating            13.596532
Reviews            0.000000
Size               0.000000
Installs           0.000000
Type               0.009224
Price              0.000000
Content Rating     0.009224
dtype: float64

In [6]:
#Check datatype 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
dtypes: float64(1), object(8)
memory usage: 762.4+ KB


In [7]:
#Remove null values and use fillna to fill values  
df["Rating"].fillna(df["Rating"].mean(),inplace=True)

In [8]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
dtype: int64

In [9]:
#To check the type of value
df["Type"].unique()

array(['Free', 'Paid', nan, '0'], dtype=object)

In [10]:
#to check the highest frequence
df["Type"].value_counts()

Free    10039
Paid      800
0           1
Name: Type, dtype: int64

In [11]:
#Replace null value of type column with value "Free" permanently
df["Type"].fillna("Free",inplace=True)

#Replace value 0 of type column with free permanently
df["Type"].replace("0","Free",inplace=True)

In [12]:
#checking the value which we chnage in above
df["Type"].unique()

array(['Free', 'Paid'], dtype=object)

In [13]:
#Again check null values in dataset
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    1
dtype: int64

In [14]:
#which type of value in content rating column
df["Content Rating"].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated', nan], dtype=object)

In [15]:
#to check the highest frequence
df["Content Rating"].value_counts()

Everyone           8714
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Unrated               2
Name: Content Rating, dtype: int64

In [16]:
#Fill null values of columns content rating with everyone permanently
df["Content Rating"].fillna("Everyone",inplace=True)

#Replace value unrated of type column with Everyone in content rating column permanently
df["Content Rating"].replace("Unrated","Everyone",inplace=True)

In [17]:
#to check the highest frequence
df["Content Rating"].value_counts()

Everyone           8717
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Name: Content Rating, dtype: int64

In [18]:
#Again check null values in dataset
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
dtype: int64

In [19]:
#To check datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          10841 non-null  float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10841 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
dtypes: float64(1), object(8)
memory usage: 762.4+ KB


In [20]:
#check the value of review
for data in df["Reviews"]:
    if not(str(data).isnumeric()):
        print(data)

3.0M


In [21]:
#Replace 3.0M with null values in Review columns permanently
df["Reviews"].replace("3.0M",np.nan,inplace=True)

In [22]:
#check n ull values of dataset
df["Reviews"].isnull().sum()

1

In [23]:
#To change the datatype of reviews column from object to float permanently
df["Reviews"]=df["Reviews"].astype("float")
df["Reviews"].dtype

dtype('float64')

In [24]:
#fill null values with mean of Reviews in Reviews column permanently
df["Reviews"].fillna(df["Reviews"].mean(),inplace=True)

In [25]:
#Check null values of dataset
df["Reviews"].isnull().sum()

0

In [26]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19M,"10,000+",Free,0,Everyone
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14M,"500,000+",Free,0,Everyone
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7M,"5,000,000+",Free,0,Everyone
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25M,"50,000,000+",Free,0,Teen
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8M,"100,000+",Free,0,Everyone


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          10841 non-null  float64
 3   Reviews         10841 non-null  float64
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10841 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
dtypes: float64(2), object(7)
memory usage: 762.4+ KB


In [28]:
#Checking which type of value in size column
df["Size"].unique()

array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M',
       '28M', '12M', '20M', '21M', '37M', '2.7M', '5.5M', '17M', '39M',
       '31M', '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M',
       '5.2M', '11M', '24M', 'Varies with device', '9.4M', '15M', '10M',
       '1.2M', '26M', '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k',
       '3.6M', '5.7M', '8.6M', '2.4M', '27M', '2.5M', '16M', '3.4M',
       '8.9M', '3.9M', '2.9M', '38M', '32M', '5.4M', '18M', '1.1M',
       '2.2M', '4.5M', '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M',
       '7.1M', '3.7M', '22M', '7.4M', '6.4M', '3.2M', '8.2M', '9.9M',
       '4.9M', '9.5M', '5.0M', '5.9M', '13M', '73M', '6.8M', '3.5M',
       '4.0M', '2.3M', '7.2M', '2.1M', '42M', '7.3M', '9.1M', '55M',
       '23k', '6.5M', '1.5M', '7.5M', '51M', '41M', '48M', '8.5M', '46M',
       '8.3M', '4.3M', '4.7M', '3.3M', '40M', '7.8M', '8.8M', '6.6M',
       '5.1M', '61M', '66M', '79k', '8.4M', '118k', '44M', '695k', '1.6M',
     

In [29]:
#To remove M from right in size column permanently
df["Size"]=df["Size"].map(lambda X:X.strip("M"))

In [30]:
#Checking which type of value in size column
df["Size"].unique()

array(['19', '14', '8.7', '25', '2.8', '5.6', '29', '33', '3.1', '28',
       '12', '20', '21', '37', '2.7', '5.5', '17', '39', '31', '4.2',
       '7.0', '23', '6.0', '6.1', '4.6', '9.2', '5.2', '11', '24',
       'Varies with device', '9.4', '15', '10', '1.2', '26', '8.0', '7.9',
       '56', '57', '35', '54', '201k', '3.6', '5.7', '8.6', '2.4', '27',
       '2.5', '16', '3.4', '8.9', '3.9', '2.9', '38', '32', '5.4', '18',
       '1.1', '2.2', '4.5', '9.8', '52', '9.0', '6.7', '30', '2.6', '7.1',
       '3.7', '22', '7.4', '6.4', '3.2', '8.2', '9.9', '4.9', '9.5',
       '5.0', '5.9', '13', '73', '6.8', '3.5', '4.0', '2.3', '7.2', '2.1',
       '42', '7.3', '9.1', '55', '23k', '6.5', '1.5', '7.5', '51', '41',
       '48', '8.5', '46', '8.3', '4.3', '4.7', '3.3', '40', '7.8', '8.8',
       '6.6', '5.1', '61', '66', '79k', '8.4', '118k', '44', '695k',
       '1.6', '6.2', '18k', '53', '1.4', '3.0', '5.8', '3.8', '9.6', '45',
       '63', '49', '77', '4.4', '4.8', '70', '6.9', '9.3', '1

In [31]:
#To remove k in Size column after that change the unit of size column from
#k to M divided by 1024
# 1 MB=1024KB
df["Size"]=df["Size"].map(lambda X :str(round((float(X.rstrip('k'))/1024),1)) 
                          if X[-1]=='k' else X)

In [32]:
df["Size"].unique()

array(['19', '14', '8.7', '25', '2.8', '5.6', '29', '33', '3.1', '28',
       '12', '20', '21', '37', '2.7', '5.5', '17', '39', '31', '4.2',
       '7.0', '23', '6.0', '6.1', '4.6', '9.2', '5.2', '11', '24',
       'Varies with device', '9.4', '15', '10', '1.2', '26', '8.0', '7.9',
       '56', '57', '35', '54', '0.2', '3.6', '5.7', '8.6', '2.4', '27',
       '2.5', '16', '3.4', '8.9', '3.9', '2.9', '38', '32', '5.4', '18',
       '1.1', '2.2', '4.5', '9.8', '52', '9.0', '6.7', '30', '2.6', '7.1',
       '3.7', '22', '7.4', '6.4', '3.2', '8.2', '9.9', '4.9', '9.5',
       '5.0', '5.9', '13', '73', '6.8', '3.5', '4.0', '2.3', '7.2', '2.1',
       '42', '7.3', '9.1', '55', '0.0', '6.5', '1.5', '7.5', '51', '41',
       '48', '8.5', '46', '8.3', '4.3', '4.7', '3.3', '40', '7.8', '8.8',
       '6.6', '5.1', '61', '66', '0.1', '8.4', '44', '0.7', '1.6', '6.2',
       '53', '1.4', '3.0', '5.8', '3.8', '9.6', '45', '63', '49', '77',
       '4.4', '4.8', '70', '6.9', '9.3', '10.0', '8.1', '36'

In [33]:
#to replace Varies with device np.nan
df["Size"].replace("Varies with device",np.nan,inplace=True)
df["Size"].replace('1,000+',np.nan,inplace=True)

In [34]:
#Change yhe datatype of size column from object to float
df["Size"]=df["Size"].astype(float)
df["Size"].isnull().sum()

1696

In [35]:
#Find % of null values in Size column
df["Size"].isnull().sum()*100/df.shape[0]

15.644313255234756

In [36]:
#Fill null values of size column with mean of size column
df["Size"].fillna(df["Size"].mean(),inplace=True)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          10841 non-null  float64
 3   Reviews         10841 non-null  float64
 4   Size            10841 non-null  float64
 5   Installs        10841 non-null  object 
 6   Type            10841 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
dtypes: float64(3), object(6)
memory usage: 762.4+ KB


In [38]:
# check the type of value in Installs
df["Installs"].unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+', '0', 'Free'], dtype=object)

In [39]:
#tom remove all + from rows in Installs column
#To remove + from right in Installs column permanently
df["Installs"]=df["Installs"].map(lambda X:X.rstrip("+"))
df["Installs"].unique()

array(['10,000', '500,000', '5,000,000', '50,000,000', '100,000',
       '50,000', '1,000,000', '10,000,000', '5,000', '100,000,000',
       '1,000,000,000', '1,000', '500,000,000', '50', '100', '500', '10',
       '1', '5', '0', 'Free'], dtype=object)

In [40]:
#To remove comma from Installs column
df["Installs"]=df["Installs"].map(lambda X: ''.join(X.split(',')))
df["Installs"].unique()

array(['10000', '500000', '5000000', '50000000', '100000', '50000',
       '1000000', '10000000', '5000', '100000000', '1000000000', '1000',
       '500000000', '50', '100', '500', '10', '1', '5', '0', 'Free'],
      dtype=object)

In [41]:
#replace free with null vaules
df["Installs"].replace("Free",np.nan,inplace=True)
df["Installs"].isnull().sum()

1

In [42]:
#to change datatype 
df["Installs"]=df["Installs"].astype("float")

In [43]:
df["Installs"].fillna(df["Installs"].mean(),inplace=True)

In [44]:
df["Installs"].unique()

array([1.00000000e+04, 5.00000000e+05, 5.00000000e+06, 5.00000000e+07,
       1.00000000e+05, 5.00000000e+04, 1.00000000e+06, 1.00000000e+07,
       5.00000000e+03, 1.00000000e+08, 1.00000000e+09, 1.00000000e+03,
       5.00000000e+08, 5.00000000e+01, 1.00000000e+02, 5.00000000e+02,
       1.00000000e+01, 1.00000000e+00, 5.00000000e+00, 0.00000000e+00,
       1.54643389e+07])

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          10841 non-null  float64
 3   Reviews         10841 non-null  float64
 4   Size            10841 non-null  float64
 5   Installs        10841 non-null  float64
 6   Type            10841 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
dtypes: float64(4), object(5)
memory usage: 762.4+ KB


In [46]:
df["Price"].unique()  

array(['0', '$4.99', '$3.99', '$6.99', '$1.49', '$2.99', '$7.99', '$5.99',
       '$3.49', '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49',
       '$10.00', '$24.99', '$11.99', '$79.99', '$16.99', '$14.99',
       '$1.00', '$29.99', '$12.99', '$2.49', '$10.99', '$1.50', '$19.99',
       '$15.99', '$33.99', '$74.99', '$39.99', '$3.95', '$4.49', '$1.70',
       '$8.99', '$2.00', '$3.88', '$25.99', '$399.99', '$17.99',
       '$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$2.50',
       '$1.59', '$6.49', '$1.29', '$5.00', '$13.99', '$299.99', '$379.99',
       '$37.99', '$18.99', '$389.99', '$19.90', '$8.49', '$1.75',
       '$14.00', '$4.85', '$46.99', '$109.99', '$154.99', '$3.08',
       '$2.59', '$4.80', '$1.96', '$19.40', '$3.90', '$4.59', '$15.46',
       '$3.04', '$4.29', '$2.60', '$3.28', '$4.60', '$28.99', '$2.95',
       '$2.90', '$1.97', '$200.00', '$89.99', '$2.56', '$30.99', '$3.61',
       '$394.99', '$1.26', 'Everyone', '$1.20', '$1.04'], dtype=object)

In [47]:
df["Price"]=df["Price"].map(lambda X:X.lstrip("$"))
df["Price"].unique()

array(['0', '4.99', '3.99', '6.99', '1.49', '2.99', '7.99', '5.99',
       '3.49', '1.99', '9.99', '7.49', '0.99', '9.00', '5.49', '10.00',
       '24.99', '11.99', '79.99', '16.99', '14.99', '1.00', '29.99',
       '12.99', '2.49', '10.99', '1.50', '19.99', '15.99', '33.99',
       '74.99', '39.99', '3.95', '4.49', '1.70', '8.99', '2.00', '3.88',
       '25.99', '399.99', '17.99', '400.00', '3.02', '1.76', '4.84',
       '4.77', '1.61', '2.50', '1.59', '6.49', '1.29', '5.00', '13.99',
       '299.99', '379.99', '37.99', '18.99', '389.99', '19.90', '8.49',
       '1.75', '14.00', '4.85', '46.99', '109.99', '154.99', '3.08',
       '2.59', '4.80', '1.96', '19.40', '3.90', '4.59', '15.46', '3.04',
       '4.29', '2.60', '3.28', '4.60', '28.99', '2.95', '2.90', '1.97',
       '200.00', '89.99', '2.56', '30.99', '3.61', '394.99', '1.26',
       'Everyone', '1.20', '1.04'], dtype=object)

In [48]:
df["Price"].replace("Everyone",np.nan,inplace=True)
df["Price"].isnull().sum()

1

In [49]:
df["Price"]=df["Price"].astype("float")

In [50]:
df["Price"].dtype

dtype('float64')

In [51]:
df["Price"].fillna(df["Price"].mean(),inplace=True)

In [52]:
df["Price"].unique()

array([  0.        ,   4.99      ,   3.99      ,   6.99      ,
         1.49      ,   2.99      ,   7.99      ,   5.99      ,
         3.49      ,   1.99      ,   9.99      ,   7.49      ,
         0.99      ,   9.        ,   5.49      ,  10.        ,
        24.99      ,  11.99      ,  79.99      ,  16.99      ,
        14.99      ,   1.        ,  29.99      ,  12.99      ,
         2.49      ,  10.99      ,   1.5       ,  19.99      ,
        15.99      ,  33.99      ,  74.99      ,  39.99      ,
         3.95      ,   4.49      ,   1.7       ,   8.99      ,
         2.        ,   3.88      ,  25.99      , 399.99      ,
        17.99      , 400.        ,   3.02      ,   1.76      ,
         4.84      ,   4.77      ,   1.61      ,   2.5       ,
         1.59      ,   6.49      ,   1.29      ,   5.        ,
        13.99      , 299.99      , 379.99      ,  37.99      ,
        18.99      , 389.99      ,  19.9       ,   8.49      ,
         1.75      ,  14.        ,   4.85      ,  46.99

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          10841 non-null  float64
 3   Reviews         10841 non-null  float64
 4   Size            10841 non-null  float64
 5   Installs        10841 non-null  float64
 6   Type            10841 non-null  object 
 7   Price           10841 non-null  float64
 8   Content Rating  10841 non-null  object 
dtypes: float64(5), object(4)
memory usage: 762.4+ KB


In [None]:
#le