# Importing Libraries

In [1]:
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas diagnostics (for example
# chained-assignment warnings) — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Reading file

In [2]:
# Load the raw Google Play export (path is relative to the notebook's working
# directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/google_play_apps_data.csv")
df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


# Getting insights into the data

###### shape attribute of the dataframe lets you know the shape of the dataframe i.e. number of rows and columns

In [3]:
# (number of rows, number of columns) of the raw frame.
df.shape

(10841, 13)

##### The columns attribute lists the names of all columns present in the dataframe

In [4]:
# Column labels of the raw frame.
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

##### The dtypes attribute shows the data type of every column present in the dataframe

In [5]:
# Per-column dtypes. In the recorded output only Rating is numeric (float64);
# Reviews, Size, Installs and Price are loaded as object and would need
# conversion before any numeric analysis.
df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

##### isnull() function shows whether a particular entry is null or not and returns a boolean value
##### sum() function shows the total number of null values in particular column
##### We can use this method either on complete dataframe or even on any single column

In [6]:
# Number of missing values in each column of the raw frame.
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

##### Get a complete outlining information of the dataset

In [7]:
# Prints the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


##### describe() function describes the various statistical parameters like mean, quantiles etc of numerical columns in a dataframe

In [8]:
# Summary statistics for the numeric columns only (here just Rating).
# NOTE(review): the recorded max of 19.0 sits far above the 75th percentile (4.5);
# it matches the Rating of the row that appears under Category '1.9' in the
# groupby output further down, which looks like a malformed record — confirm.
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


### Dropping the columns that are not required or that have less significance than others for further analysis

##### Using slicing to drop columns

In [9]:
# Positional selection: keep every row and all columns except the last three.
sliced_df = df.iloc[slice(None), slice(None, -3)]
sliced_df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity


##### Using drop functionality of pandas

In [10]:
# Label-based removal of the same three trailing columns; `df` itself is left
# untouched because drop() returns a new frame.
dropped_df = df.drop(columns=["Last Updated", "Current Ver", "Android Ver"])
dropped_df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity


##### Taking in only required columns for analysis

In [11]:
# Keep only the columns needed for the analysis. The explicit .copy() makes
# `required_df` an independent frame, so the column assignments in later cells
# modify it directly instead of operating on a selection derived from `df`
# (the situation that triggers pandas' chained-assignment warnings).
required_df = df[['Category','App','Installs','Rating','Reviews','Type']].copy()

##### Cleaning column entries by removing punctuation and operators so that the values are easier for the system to interpret

In [12]:
# Strip the trailing '+' and the thousands separators from Installs
# (e.g. "10,000+" -> "10000"). regex=False forces literal matching: '+' is a
# regex quantifier, and the default value of `regex` differs between pandas
# versions, so being explicit keeps the result the same everywhere.
# The column is still text after this step; no numeric conversion is done here.
required_df['Installs'] = (
    required_df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [13]:
# Preview the selected columns after the Installs clean-up.
required_df.head()

Unnamed: 0,Category,App,Installs,Rating,Reviews,Type
0,ART_AND_DESIGN,Photo Editor & Candy Camera & Grid & ScrapBook,10000,4.1,159,Free
1,ART_AND_DESIGN,Coloring book moana,500000,3.9,967,Free
2,ART_AND_DESIGN,"U Launcher Lite – FREE Live Cool Themes, Hide ...",5000000,4.7,87510,Free
3,ART_AND_DESIGN,Sketch - Draw & Paint,50000000,4.5,215644,Free
4,ART_AND_DESIGN,Pixel Draw - Number Art Coloring Book,100000,4.3,967,Free


##### Checking out number of unique elements in complete dataframe

In [14]:
# Number of distinct values in each column of the raw frame.
df.nunique()

App               9660
Category            34
Rating              40
Reviews           6002
Size               462
Installs            22
Type                 3
Price               93
Content Rating       6
Genres             120
Last Updated      1378
Current Ver       2832
Android Ver         33
dtype: int64

##### Checking occurrence of each category in Category column

In [15]:
# Frequency of every Category value, most common first.
required_df['Category'].value_counts()

FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
LIBRARIES_AND_DEMO       85
AUTO_AND_VEHICLES        85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
PARENTING                60
COMICS                   60
BEAUTY                   53
1.9                       1
Name: Category, dtype: int64

### Grouping

##### Groupby to see the count of each category for every column

In [16]:
# Non-null count of every other column within each Category (first five groups).
# NOTE(review): the recorded output starts with a group labelled '1.9', which
# looks like a malformed row rather than a real category — confirm.
required_df.groupby("Category").count().head()

Unnamed: 0_level_0,App,Installs,Rating,Reviews,Type
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.9,1,1,1,1,1
ART_AND_DESIGN,65,65,62,65,65
AUTO_AND_VEHICLES,85,85,73,85,85
BEAUTY,53,53,42,53,53
BOOKS_AND_REFERENCE,231,231,178,231,231


##### Making group on the basis of two columns and then checking count

In [17]:
# Non-null counts per (Category, Type) pair; shows the first 15 groups.
required_df.groupby(['Category','Type']).count().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,App,Installs,Rating,Reviews
Category,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.9,0,1,1,1,1
ART_AND_DESIGN,Free,62,62,59,62
ART_AND_DESIGN,Paid,3,3,3,3
AUTO_AND_VEHICLES,Free,82,82,72,82
AUTO_AND_VEHICLES,Paid,3,3,1,3
BEAUTY,Free,53,53,42,53
BOOKS_AND_REFERENCE,Free,203,203,170,203
BOOKS_AND_REFERENCE,Paid,28,28,8,28
BUSINESS,Free,446,446,292,446
BUSINESS,Paid,14,14,11,14


##### Viewing first entry of each category

In [18]:
# First non-null value of each column within every Category (first five groups).
# In the recorded output the '1.9' group has 'Free' under Installs and '3.0M'
# under Reviews — NOTE(review): this suggests that row's values are shifted by
# one column; confirm against the raw CSV.
required_df.groupby("Category").first().head()

Unnamed: 0_level_0,App,Installs,Rating,Reviews,Type
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.9,Life Made WI-Fi Touchscreen Photo Frame,Free,19.0,3.0M,0
ART_AND_DESIGN,Photo Editor & Candy Camera & Grid & ScrapBook,10000,4.1,159,Free
AUTO_AND_VEHICLES,Monster Truck Stunt 3D 2019,100000,4.2,367,Free
BEAUTY,Hush - Beauty for Everyone,500000,4.7,18900,Free
BOOKS_AND_REFERENCE,Wattpad 📖 Free Books,100000000,4.6,2914724,Free


### Replacing nan values

##### Filling with string

In [19]:
# Replace missing Type values with 'Free'. The result is assigned back to the
# column rather than using fillna(inplace=True) on the selected column: the
# in-place form is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
required_df['Type'] = required_df['Type'].fillna('Free')

###### Filling with mean

In [20]:
# Replace missing ratings with the mean of the ratings that are present.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the selected column: the in-place form is chained
# assignment and does not update the frame when pandas Copy-on-Write is active.
required_df['Rating'] = required_df['Rating'].fillna(required_df['Rating'].mean())

##### Dropping rows that have nan values in them

In [21]:
# Drop any rows that still contain a missing value in the selected columns.
# Rebinding the name (instead of inplace=True) keeps the cell free of
# copy/view side effects on a frame that was derived from `df`.
required_df = required_df.dropna()

##### get_dummies() function creates dummy (indicator) columns so that the complete dataframe is in numeric form; some ML models are not compatible with the string data type, so after this conversion the dataframe can be processed by the algorithm

In [22]:
# One-hot encode every non-numeric column of required_df.
# Note that this rebinds `df`: from here on it refers to the encoded frame,
# not the raw CSV data loaded at the top of the notebook.
# NOTE(review): the recorded output contains columns such as Reviews_996 and
# Reviews_9966, i.e. Reviews (still text at this point) gets one indicator
# column per distinct value. Consider converting the numeric-like text columns
# to numbers and restricting the encoding to genuinely categorical columns
# before modelling.
df = pd.get_dummies(required_df)

In [23]:
# Preview the one-hot encoded frame (`df` was reassigned in the previous cell).
df.head()

Unnamed: 0,Rating,Category_1.9,Category_ART_AND_DESIGN,Category_AUTO_AND_VEHICLES,Category_BEAUTY,Category_BOOKS_AND_REFERENCE,Category_BUSINESS,Category_COMICS,Category_COMMUNICATION,Category_DATING,...,Reviews_996,Reviews_9966,Reviews_997,Reviews_9971,Reviews_9975,Reviews_999,Reviews_9992,Type_0,Type_Free,Type_Paid
0,4.1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3.9,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4.7,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4.5,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4.3,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### The dataframe obtained above can now be fed into a machine learning algorithm for prediction

# Thank You!