In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#0 Read and explore the data
playstore = pd.read_csv("googleplaystore.csv")
review = pd.read_csv("googleplaystore_user_reviews.csv")

In [3]:
playstore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10841 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null object
Size              10841 non-null object
Installs          10841 non-null object
Type              10840 non-null object
Price             10841 non-null object
Content Rating    10840 non-null object
Genres            10841 non-null object
Last Updated      10841 non-null object
Current Ver       10833 non-null object
Android Ver       10838 non-null object
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [4]:
playstore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
#1 Convert the app sizes to a number
def replace(string):
    try:
        if ("." in string):
            return int(string.replace(".","").replace("M","00000").replace("k","000"))
        else:
            return int(string.replace("M","000000").replace("k","000"))
    except:
        return np.nan

print(playstore["Size"].head())
playstore["Size"] = playstore.apply(lambda row:replace(row["Size"]),axis=1)
print("\n")
print(playstore["Size"].head())
#There are missing values: 'Varies with device' cannot be converted as a number
print("\nMissing values: {}".format(playstore.Size.isnull().sum()))

0     19M
1     14M
2    8.7M
3     25M
4    2.8M
Name: Size, dtype: object


0    19000000.0
1    14000000.0
2     8700000.0
3    25000000.0
4     2800000.0
Name: Size, dtype: float64

Missing values: 1696


In [6]:
#2 Convert the number of installs to a number
import string
translator = str.maketrans('', '', string.punctuation)
def convert(string):
    try:
        return float(string.translate(translator))
    except:
        return np.nan
print(playstore["Installs"].head())
playstore["Installs"] = playstore.apply(lambda row:convert(row["Installs"]),axis=1)
print("\n")
print(playstore["Installs"].head())
#There are missing values: 'Free' cannot be convert to number
print("\nMissing values: {}".format(playstore.Installs.isnull().sum()))

0        10,000+
1       500,000+
2     5,000,000+
3    50,000,000+
4       100,000+
Name: Installs, dtype: object


0       10000.0
1      500000.0
2     5000000.0
3    50000000.0
4      100000.0
Name: Installs, dtype: float64

Missing values: 1


In [7]:
#3 Transform “Varies with device” into a missing value
playstore = playstore.replace("Varies with device",np.nan)

In [8]:
#4 Convert Current Ver and Android Ver into a dotted number (e.g. 4.0.3 or 4.2)
import re
def clean0(string):
    try:
        return re.compile("\d+[\d\.]+\d").match(str(string)).group()
    except:
        return np.nan

print(playstore["Android Ver"].head())
print(playstore["Current Ver"].head())
playstore["Android Ver"] = playstore.apply(lambda row:clean0(row["Android Ver"]),axis=1)
playstore["Current Ver"] = playstore.apply(lambda row:clean0(row["Current Ver"]),axis=1)
print("\n")
print(playstore["Android Ver"].head())
print(playstore["Current Ver"].head())

0    4.0.3 and up
1    4.0.3 and up
2    4.0.3 and up
3      4.2 and up
4      4.4 and up
Name: Android Ver, dtype: object
0    1.0.0
1    2.0.0
2    1.2.4
3      NaN
4      1.1
Name: Current Ver, dtype: object


0    4.0.3
1    4.0.3
2    4.0.3
3      4.2
4      4.4
Name: Android Ver, dtype: object
0    1.0.0
1    2.0.0
2    1.2.4
3      NaN
4      1.1
Name: Current Ver, dtype: object


In [9]:
#5 Remove the duplicates
playstore = playstore.drop_duplicates(subset=None, keep='first', inplace=False)

In [10]:
#6 For each category, compute the number of apps
playstore[["App"]].groupby(playstore["Category"]).count()

Unnamed: 0_level_0,App
Category,Unnamed: 1_level_1
1.9,1
ART_AND_DESIGN,65
AUTO_AND_VEHICLES,85
BEAUTY,53
BOOKS_AND_REFERENCE,230
BUSINESS,427
COMICS,60
COMMUNICATION,366
DATING,196
EDUCATION,130


In [11]:
#7 For each category, compute the average rating
playstore[["Rating"]].groupby(playstore["Category"]).mean()

Unnamed: 0_level_0,Rating
Category,Unnamed: 1_level_1
1.9,19.0
ART_AND_DESIGN,4.358065
AUTO_AND_VEHICLES,4.190411
BEAUTY,4.278571
BOOKS_AND_REFERENCE,4.347458
BUSINESS,4.102593
COMICS,4.155172
COMMUNICATION,4.151466
DATING,3.971698
EDUCATION,4.375969


In [12]:
#8 Create two dataframes: one for the genres and one bridging apps and genders. So that, for instance, the app Pixel Draw - Number Art Coloring Book appears twice in the bridging table, once for Art & Design, once for Creativity
df = pd.DataFrame(playstore["Genres"],columns=["Genres"])
df.head()
df0 = pd.DataFrame(columns=["App","Genres"])
for i in playstore.values: #oppure df0 = playstore[['App','Genres']].copy()
    for j in i[9].split(";"): 
        x = pd.Series([i[0],j],index=df0.columns)
        df0 = df0.append(x,ignore_index=True)
df0.head()

Unnamed: 0,App,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design
1,Coloring book moana,Art & Design
2,Coloring book moana,Pretend Play
3,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design
4,Sketch - Draw & Paint,Art & Design


In [13]:
#9 For each genre, create a new column of the original dataframe. The new columns must have boolean values (True if the app has a given genre)
playstore["Genres"] = playstore["Genres"].drop_duplicates()
playstore["Given_Genres"] = (df["Genres"].notnull())
playstore.head()

0    True
1    True
2    True
3    True
4    True
Name: Given_Genres, dtype: bool

In [14]:
#10 For each genre, compute the average rating. What is the genre with highest average?
playstore["Rating"].groupby(playstore["Genres"]).mean().nlargest(10)

Genres
February 11, 2018            19.0
Board;Pretend Play            4.8
Comics;Creativity             4.8
Health & Fitness              4.8
Simulation;Pretend Play       4.8
Arcade;Action & Adventure     4.7
Beauty                        4.7
Board                         4.7
Card                          4.7
Casino                        4.7
Name: Rating, dtype: float64

In [15]:
#11 For each app, compute the approximate income, obtain as a product of number of installs and price.
def income(instals, price):
    try:
        return round((int(instals) * float(price.replace("$",""))),2)
    except:
        return np.nan

playstore["Income"] = playstore.apply(lambda row:income(row["Installs"],row["Price"]),axis=1)
playstore["Income"].groupby(playstore["App"]).max().nlargest(10)

App
Minecraft                        69900000.0
I am rich                        39999000.0
I Am Rich Premium                19999500.0
Hitman Sniper                     9900000.0
Grand Theft Auto: San Andreas     6990000.0
Facetune - For Free               5990000.0
Sleep as Android Unlock           5990000.0
DraStic DS Emulator               4990000.0
I'm Rich - Trump Edition          4000000.0
I am Rich Plus                    3999900.0
Name: Income, dtype: float64

In [16]:
#12 For each app, compute its minimum and maximum Sentiment_polarity
review["Sentiment_Polarity"].groupby(review["App"]).max()

App
10 Best Foods for You                                 1.000000
104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室                      0.910000
11st                                                  1.000000
1800 Contacts - Lens Store                            0.838542
1LINE – One Line with One Touch                       1.000000
2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif      1.000000
21-Day Meditation Experience                          0.587500
2Date Dating App, Love and matching                   1.000000
2GIS: directory & navigator                           1.000000
2RedBeans                                             1.000000
2ndLine - Second Phone Number                         1.000000
30 Day Fitness Challenge - Workout at Home            0.900000
365Scores - Live Scores                               1.000000
3D Blue Glass Water Keyboard Theme                         NaN
3D Color Pixel by Number - Sandbox Art Coloring            NaN
3D Live Neon Weed Launcher                         

In [17]:
review["Sentiment_Polarity"].groupby(review["App"]).min()

App
10 Best Foods for You                                -0.800000
104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室                     -0.112500
11st                                                 -1.000000
1800 Contacts - Lens Store                           -0.300000
1LINE – One Line with One Touch                      -0.825000
2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif     -0.800000
21-Day Meditation Experience                         -0.265625
2Date Dating App, Love and matching                  -0.645833
2GIS: directory & navigator                          -0.375000
2RedBeans                                            -0.800000
2ndLine - Second Phone Number                        -0.781250
30 Day Fitness Challenge - Workout at Home           -0.500000
365Scores - Live Scores                               0.000000
3D Blue Glass Water Keyboard Theme                         NaN
3D Color Pixel by Number - Sandbox Art Coloring            NaN
3D Live Neon Weed Launcher                         