In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

/kaggle/input/google-play-store-apps/license.txt
/kaggle/input/google-play-store-apps/googleplaystore_user_reviews.csv
/kaggle/input/google-play-store-apps/googleplaystore.csv


# 1. Dataset


We will be using a public dataset from Kaggle in CSV format: the Google Play Store Apps dataset.

To import the dataset to the current environment do the following steps:

Click "+ Add Data" in the top right corner of this Kaggle notebook.
Search "Google Play" in the search box.
Click Add on the first record (uploaded by Lavanya Gupta).
There will be a new folder in /kaggle/input/ containing the dataset.
Jupyter Magic Commands: You can execute Linux shell commands in this notebook by preceding the command with the '!' character.

For example, to list the files in the current directory, you can execute the following in a cell:

!ls -la

In [None]:
!pwd

## Importing libraries

In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# 2. Acquire data

In [3]:
# First challenge: read the two CSV files and store them in two different DataFrames.
reviews_csv = "../input/google-play-store-apps/googleplaystore_user_reviews.csv"
apps_csv = "../input/google-play-store-apps/googleplaystore.csv"

df_reviews = pd.read_csv(reviews_csv)
df_apps = pd.read_csv(apps_csv)



# 3. Intro: Wrangle, prepare, cleanse the data.

## Explore data

We will be focusing on the Apps dataset, leaving the Reviews one for the more advanced training

In [None]:
# Visualizing first 5 rows of the dataset
df_apps.head()

In [None]:
# Try to visualize now 10 rows instead
df_apps.head(10)

In [None]:
# Dimensions of the apps dataframe: 13 columns, 10841 samples
df_apps.shape

In [None]:
# Types of each column
df_apps.info()

In [None]:
# Statistics of the dataframe. Only numerical columns are taken into account
df_apps.describe()

In [None]:
# To list all the columns
df_apps.columns

In [None]:
# if we want to rename a column df_apps.rename(columns={'App':'App Name'}, inplace=True)

It's often useful to delete (drop) the columns that we are not going to use. This frees up memory and also reduces the complexity of the analysis.

In [None]:
# The app version is not useful for our analysis, so we remove that column.
# `columns=[...]` means the same as passing the label together with axis=1;
# the axis parameter picks between dropping a column (1) and a row (0, the default).
df_apps = df_apps.drop(columns=["Current Ver"])

# drop() returns a new DataFrame, which is why the result is assigned back.
# Passing inplace=True would modify df_apps directly instead:
# df_apps.drop("Current Ver", axis=1, inplace=True)

In [None]:
df_apps.head()

In [None]:
# How many unique values does the Category column have?
df_apps.Category.nunique()

In [None]:
#List the unique values for Category
df_apps.Category.unique()

In [None]:
#Alternative way to refer to a Pandas Series
df_apps['Category'].unique()

In [None]:
# If we want to see all the rows that comply with certain condition, we can use loc
df_apps.loc[df_apps.Category == '1.9']

In [None]:
# Another example
df_apps.loc[df_apps.Type == 'Free']

In [None]:
df_apps['Type'].unique()

In [None]:
# We can also select certain rows with iloc
df_apps.iloc[:5]

In [None]:
# Use iloc to select the row at position 10472
df_apps.iloc[10472]

In [None]:
#We need to drop this sample as it's clearly corrupted.
# Select it by content (Category == '1.9', the bogus category value looked up
# earlier) rather than by position: drop(10472) followed by reset_index()
# removes a *different*, valid row every time this cell is re-run, whereas this
# filter can be repeated safely. Rows with a missing Category are kept.
df_apps = df_apps[df_apps['Category'] != '1.9'].reset_index(drop=True)

In [None]:
df_apps.info()

In [None]:
#Let's identify more corrupt entries with data that we know should not be possible

#First we create the mask
# Ratings above 5 are not possible (assuming the usual 1-5 star scale), so those
# are the corrupt ones. A threshold of 4 would flag perfectly valid apps.
mask = df_apps.Rating > 5
mask

In [None]:
df_apps.loc[mask]

In [None]:
# What are all the possible values of Size and Installs?
# Only the last expression of a cell is displayed, so both arrays are returned
# together as a tuple.
df_apps['Size'].unique(), df_apps['Installs'].unique()


In [None]:
#What percentage of apps are free?
# Your code here

In [None]:
# Drop "Android Ver" and "Genres" column.


In [None]:
# Let's see what fields are included in the other dataframe
df_reviews.columns

## 4. Analyze, identify patterns, and explore the data.

### Type

From our previous analysis (and also because we have phones and apps ourselves) we know there are 2 types of app: Free and Paid. Not many people like to pay for apps, so, as we saw, there are many more Free apps than Paid apps in the marketplace. Let's visualize it.

In [None]:
df_apps.head(1)

In [None]:
df_apps['Type'].value_counts()

In [None]:
# Bar chart comparing the total number of free applications with the paid ones.
type_counts = df_apps['Type'].value_counts()
type_counts.plot.bar()

In [None]:
# Next task: plot the average rating of the apps for each category.
# Step one is to keep only the two columns that matter for that question.
df_cat_rating = df_apps.loc[:, ['Category', 'Rating']]
df_cat_rating.head(5)

In [None]:
df_cat_rating

In [None]:
# Group the rows by category and average the rating inside each group.
ratings_by_category = df_cat_rating.groupby(by='Category')
df_rating_per_category = ratings_by_category.mean()
df_rating_per_category.head(n=30)

In [None]:
#Let's plot it now to see what's the result
df_rating_per_category.plot(kind='bar')

What issue(s) do we see in this visualization?

In [None]:
# Let's improve the plot by sorting the categories by their average rating.
df_rating_per_category_ordered = df_rating_per_category.sort_values("Rating")

# Still we have some issues:
#   1.- Most apps' ratings range from around 3.8 to 4.8.
#   2.- The chart itself is very squished. We need more space.
# (The trailing semicolon hides the text representation of the returned Axes.)
df_rating_per_category_ordered.plot(kind="bar");

In [None]:
# And even some more adjustments: a wider figure and a y-axis limited to 3.5-5.
ax1 = df_rating_per_category_ordered.plot.bar(figsize=(10, 5))
ax1.set_ylim(3.5, 5)

Let's see a different type of plot: the line plot.

In [None]:
# Parse the 'Last Updated' strings into datetime values, then check the new dtype.
last_updated = pd.to_datetime(df_apps['Last Updated'])
df_apps['Last Updated'] = last_updated
df_apps.info()

In [None]:
# Example of a line plot: number of apps by the year and month of their last update.
# Most apps were last updated within a few months of when the data was collected
# (the right-hand end of the line).
update_year = df_apps['Last Updated'].dt.year
update_month = df_apps['Last Updated'].dt.month

updates_per_month = df_apps[['Last Updated']].groupby([update_year, update_month]).count()
updates_per_month.plot(kind='line', figsize=(15, 5))


## 5. Data transformations.

In [None]:
#Let's focus in the size of the app. If we are going to use this data in the future or quantify the size somehow, we need to do some arrangements.
df_apps[['Size']].head(15)

In [None]:
df_apps['Size'].unique()

In [None]:
#Let's simply cast it into an integer, and it should be fine.
# (It is not: values such as '19M' are not valid integers, so the cast raises.
#  The error is caught and shown so that the rest of the notebook still runs
#  with "Run All".)
try:
    df_apps.astype({'Size': 'int32'})
except ValueError as error:
    print(f"Casting 'Size' to int32 failed: {error}")

In [None]:
#... right? As we saw up there, we need to do some arrangements.
# First, we need to make our strings 'look like' numbers.

def size_to_int(n):
    """Convert a size label such as '19M' or '201k' into a number.

    The result is expressed in 'k' units (presumably kilobytes): an 'M' value
    is multiplied by 1000 and a 'k' value is kept as it is.

    Parameters
    ----------
    n : str, number or missing value
        The size label. Values that are already numeric are passed through, so
        the conversion can be applied again to an already converted column.

    Returns
    -------
    float or None
        The numeric size, or None when the value is missing, has no 'M'/'k'
        suffix, or its numeric part cannot be parsed.
    """
    if not isinstance(n, str):
        # Missing values and numbers have no suffix to strip.
        return None if pd.isna(n) else float(n)

    text = n.strip()
    multiplier = {'M': 1000, 'k': 1}.get(text[-1:])
    if multiplier is None:
        return None
    try:
        return float(text[:-1]) * multiplier
    except ValueError:
        return None


#We present you guys with the function apply: it calls size_to_int on every
# value of the column. The function must be defined before it is used, which is
# why the definition comes first.
df_apps['Size'] = df_apps['Size'].apply(size_to_int)

In [None]:
df_apps

In [None]:
df_apps.info()

In [None]:
#We can see how the average size of apps has increased consistently with the years.

# Average only the 'Size' column. Depending on the pandas version, averaging a
# frame that also contains 'Last Updated' either drops that datetime column
# silently or adds its mean to the chart, where it does not belong.
update_year = df_apps['Last Updated'].dt.year
update_month = df_apps['Last Updated'].dt.month

df_apps[['Size']]\
    .groupby([update_year, update_month]).mean()\
    .plot(kind='line', figsize=(15, 5))


In [None]:
# Check for duplicated rows before removing them: the mask is True for every row
# that repeats an earlier one.
duplicated_mask = df_apps.duplicated(keep='first')
df_apps.loc[duplicated_mask]

In [None]:
df_apps[df_apps['App']=='TickTick: To Do List with Reminder, Day Planner']

In [None]:
# Remove the duplicated rows, keeping the first occurrence of each.
df_apps = df_apps.drop_duplicates(keep='first')

In [None]:
df_apps.head()

In [None]:
df_apps.info()

# **HOMEWORK**

In [4]:
#Exercise 1
# Show the apps whose rating is below 4.
low_rating = df_apps['Rating'] < 4
df_apps[low_rating]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up
15,Learn To Draw Kawaii Characters,ART_AND_DESIGN,3.2,55,2.7M,"5,000+",Free,0,Everyone,Art & Design,"June 6, 2018",,4.2 and up
28,Pencil Sketch Drawing,ART_AND_DESIGN,3.9,136,4.6M,"10,000+",Free,0,Everyone,Art & Design,"July 12, 2018",6.0,2.3 and up
35,How to draw Ladybug and Cat Noir,ART_AND_DESIGN,3.8,564,9.2M,"100,000+",Free,0,Everyone,Art & Design,"July 11, 2018",2.1,4.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10782,Trine 2: Complete Story,GAME,3.8,252,11M,"10,000+",Paid,$16.99,Teen,Action,"February 27, 2015",2.22,5.0 and up
10819,Fanfic-FR,BOOKS_AND_REFERENCE,3.3,52,3.6M,"5,000+",Free,0,Teen,Books & Reference,"August 5, 2017",0.3.4,4.1 and up
10828,Manga-FR - Anime Vostfr,COMICS,3.4,291,13M,"10,000+",Free,0,Everyone,Comics,"May 15, 2017",2.0.1,4.0 and up
10830,News Minecraft.fr,NEWS_AND_MAGAZINES,3.8,881,2.3M,"100,000+",Free,0,Everyone,News & Magazines,"January 20, 2014",1.5,1.6 and up


In [5]:
#Exercise 2
# Remove the "Android Ver" and "Genres" columns in a single call.
df_apps = df_apps.drop(columns=["Android Ver", "Genres"])

In [6]:
#Exercise 3
# Print the number of distinct values of every column.
for column, n_unique in df_apps.nunique().items():
    print(f"{column}: {n_unique}")

App: 9660
Category: 34
Rating: 40
Reviews: 6002
Size: 462
Installs: 22
Type: 3
Price: 93
Content Rating: 6
Genres: 120
Last Updated: 1378
Current Ver: 2832
Android Ver: 33


In [None]:
#Exercise 4
# Remove the corrupted sample by its content (Category == '1.9'). Unlike
# drop(10472), this does nothing when the row has already been removed and
# cannot delete a valid app that happens to carry that index label.
df_apps = df_apps[df_apps['Category'] != '1.9'].reset_index(drop=True)

# Average rating per category, sorted, with the y-axis limited to 3.5-5.
# ax1 keeps the Axes object; chaining .set_ylim() onto the assignment would
# store the limits it returns instead of the Axes.
ax1 = (
    df_apps[['Category', 'Rating']]
    .groupby('Category')
    .mean()
    .sort_values("Rating")
    .plot(kind='bar', figsize=(10, 5))
)
ax1.set_ylim(3.5, 5)