In [1]:
import pandas as pd
import numpy as np
import re

In [89]:
googlestore = pd.read_csv('googleplaystore.csv')
polarity =  pd.read_csv('googleplaystore_user_reviews.csv')

In [3]:
#record 10472 has no 'Category' value. Proceed with removal
googlestore=googlestore.drop(10472)

In [4]:
size_to_num = re.compile('(?P<number>\d+\.{0,1}\d*)(?P<prefix>\w*)')
#In this function we use lowercase k for kilo and uppercase M and G for Mega and Giga. 
#We Assume no file with any other unit otherwise the unit will be ignored.
def prefix_to_mult(unit):
    if unit == 'G':
        return 1000000000
    if unit == 'M':
        return 1000000
    if unit == 'k':
        return 1000
    return 1

#This function can understand the format of the data given to it using regex,
#If the given data is not in the expected format it will return 'Not a Number'
#For example 'Varies with device' is transformed in NaN
def ConvertSizeToByte(Size):
    searched = size_to_num.search(Size)
    if searched is None:
        return np.nan
    else:
        prefix = searched.group('prefix')
        mult = prefix_to_mult(prefix)
        result = float(searched.group('number'))
        return int(result*mult)

In [5]:
googlestore['SizeInBytes'] = googlestore['Size'].apply(ConvertSizeToByte)


In [6]:
display("googlestore[['App','Size','SizeInBytes']].tail()") #Use tail because it shows a NaN


"googlestore[['App','Size','SizeInBytes']].tail()"

In [7]:
#Convert the number of installs to a number

googlestore['Installs'] = googlestore['Installs'].str.replace(",", "") 
googlestore['Installs'] = googlestore['Installs'].str.replace(" ", "") 
googlestore['Installs'] = googlestore['Installs'].str.replace("[^0-9]", "") 


googlestore['Installs']


0           10000
1          500000
2         5000000
3        50000000
4          100000
5           50000
6           50000
7         1000000
8         1000000
9           10000
10        1000000
11        1000000
12       10000000
13         100000
14         100000
15           5000
16         500000
17          10000
18        5000000
19       10000000
20         100000
21         100000
22         500000
23         100000
24          50000
25          10000
26         500000
27         100000
28          10000
29         100000
           ...   
10811         100
10812        1000
10813       10000
10814       50000
10815      500000
10816         100
10817      100000
10818       10000
10819        5000
10820        1000
10821          50
10822          10
10823         100
10824       10000
10825         100
10826     5000000
10827        5000
10828       10000
10829       10000
10830      100000
10831        5000
10832      100000
10833        1000
10834         500
10835     

In [8]:
#Transform “Varies with device” into a missing value
googlestore = googlestore.replace("Varies with device", np.nan, regex = True)


In [9]:
#Convert Current Ver and Android Ver into a dotted number (e.g. 4.0.3 or 4.2)
def convert(x):
    if x is np.nan:
        return x
    else: 
        match = re.search(r"([\d.]*\d)", x)
        if match:
            return match.group()

In [10]:
googlestore["Current Ver"] = googlestore["Current Ver"].apply(convert)


In [11]:
googlestore["Android Ver"] = googlestore["Android Ver"].apply(convert)

In [12]:
#Remove the duplicates

googlestore.drop_duplicates(keep = 'last', inplace = True)

googlestore.Reviews.apply(pd.to_numeric)

googlestore.sort_values(['App', 'Reviews'],ascending=False, inplace = True)

googlestore.drop_duplicates(keep = 'first', inplace = True)


In [13]:
googlestore

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,SizeInBytes
882,🔥 Football Wallpapers 4K | Full HD Backgrounds 😍,ENTERTAINMENT,4.7,11661,4.0M,1000000,Free,0,Everyone,Entertainment,"July 14, 2018",1.1.3.2,4.0.3,4000000.0
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,TOOLS,4.0,19,3.2M,10000,Free,0,Everyone,Tools,"October 21, 2017",1.0,4.2,3200000.0
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",SOCIAL,4.6,22098,18M,1000000,Free,0,Everyone,Social,"July 24, 2018",4.2.4,4.0.3,18000000.0
4362,💎 I'm rich,LIFESTYLE,3.8,718,26M,10000,Paid,$399.99,Everyone,Lifestyle,"March 11, 2018",1.0.0,4.4,26000000.0
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",VIDEO_PLAYERS,,414,59M,100000,Free,0,Mature 17+,Video Players & Editors,"July 18, 2018",4.0.7,4.0.3,59000000.0
6420,감성학원 BL 첫사랑,COMICS,4.4,190,34M,10000,Free,0,Everyone,Comics,"November 17, 2017",3.2.1,3.0,34000000.0
9222,英漢字典 EC Dictionary,FAMILY,4.3,55408,,1000000,Free,0,Everyone,Education,"January 8, 2018",,,
313,"漫咖 Comics - Manga,Novel and Stories",COMICS,4.1,12088,21M,1000000,Free,0,Mature 17+,Comics,"July 6, 2018",2.3.1,4.0.3,21000000.0
5698,日本AV历史,BOOKS_AND_REFERENCE,4.1,215,30M,10000,Free,0,Teen,Books & Reference,"March 6, 2018",1.2,4.0,30000000.0
10109,哈哈姆特不EY,COMMUNICATION,,239,18M,10000,Free,0,Everyone,Communication,"July 31, 2018",1.3.4,4.4,18000000.0


In [14]:
#For each category, compute the number of apps
googlestore[["App", "Category"]].groupby(['Category']).count()


Unnamed: 0_level_0,App
Category,Unnamed: 1_level_1
ART_AND_DESIGN,65
AUTO_AND_VEHICLES,85
BEAUTY,53
BOOKS_AND_REFERENCE,230
BUSINESS,427
COMICS,60
COMMUNICATION,366
DATING,196
EDUCATION,130
ENTERTAINMENT,111


In [15]:
#For each category, compute the average rating
googlestore[["Category", "Rating"]].groupby(['Category']).mean()

Unnamed: 0_level_0,Rating
Category,Unnamed: 1_level_1
ART_AND_DESIGN,4.358065
AUTO_AND_VEHICLES,4.190411
BEAUTY,4.278571
BOOKS_AND_REFERENCE,4.347458
BUSINESS,4.102593
COMICS,4.155172
COMMUNICATION,4.151466
DATING,3.971698
EDUCATION,4.375969
ENTERTAINMENT,4.136036


In [16]:
#Create two dataframes: one for the genres and one bridging apps and genres. 
#So that, for instance, the app Pixel Draw - Number Art Coloring Book appears twice in the bridging table, 
#once for Art & Design, once for Creativity

genres = googlestore['Genres'] #database of genres

In [17]:
column = genres.str.split(";", n = 2, expand = True)
column

Unnamed: 0,0,1
882,Entertainment,
7559,Tools,
2575,Social,
4362,Lifestyle,
6334,Video Players & Editors,
6420,Comics,
9222,Education,
313,Comics,
5698,Books & Reference,
10109,Communication,


In [18]:
g = pd.DataFrame()
g["Primo Genere"] = column[0]
g["Secondo Genere"] = column[1]

In [19]:
App = pd.DataFrame()

In [20]:
#dataframe of App
App = googlestore[["App"]]
type(App)

pandas.core.frame.DataFrame

In [21]:
bridging_table = pd.merge(App, g, left_index = True, right_index = True)


In [22]:
bridging_table['Primo Genere'].fillna(value=pd.np.nan, inplace=True)
bridging_table['Secondo Genere'].fillna(value=pd.np.nan, inplace=True)
bridging_table

Unnamed: 0,App,Primo Genere,Secondo Genere
882,🔥 Football Wallpapers 4K | Full HD Backgrounds 😍,Entertainment,
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,Tools,
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",Social,
4362,💎 I'm rich,Lifestyle,
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",Video Players & Editors,
6420,감성학원 BL 첫사랑,Comics,
9222,英漢字典 EC Dictionary,Education,
313,"漫咖 Comics - Manga,Novel and Stories",Comics,
5698,日本AV历史,Books & Reference,
10109,哈哈姆特不EY,Communication,


In [23]:
#check if any nan in column 'primo genere'.
bridging_table['Primo Genere'].isnull().values.any()

False

In [24]:
#For each genre, create a new column of the original dataframe. 
#The new columns must have boolean values (True if the app has a given genre)
genre = set(bridging_table['Primo Genere'])
genre2 =  set(bridging_table['Secondo Genere'])

#checking if genre2 is subset of genre
genre2.issubset(genre)

False

In [25]:
#finding what elements are not in set genre
main_list = np.setdiff1d(genre2,genre)
main_list

array([{nan, 'Pretend Play', 'Brain Games', 'Music & Video', 'Education', 'Creativity', 'Action & Adventure'}],
      dtype=object)

In [26]:
# add 'Creativity', 'Action & Adventure', 'Pretend Play', 'Music & Video', 'Brain Games', 'Education' to genre 1
genre.add('Creativity')
genre.add('Action & Adventure')
genre.add('Pretend Play')
genre.add('Music & Video')
genre.add('Brain Games')
genre.add('Education')


In [27]:
#create a column for each row
df_genre = pd.DataFrame()
df_genre = df_genre.assign(**{k: 0 for k in genre})
df_genre


Unnamed: 0,Music & Audio,Educational,Pretend Play,Brain Games,Music & Video,Art & Design,Trivia,Card,Parenting,Simulation,...,Events,Auto & Vehicles,Casual,Health & Fitness,Casino,Creativity,Adventure,News & Magazines,Communication,Racing


In [28]:
#add column app to df_genre

s = bridging_table.set_index('App')[['Primo Genere', 'Secondo Genere']]
df22 = pd.get_dummies(s, prefix_sep="", prefix="", dtype=bool).max(level=0)
df = App.join(df22, on='App').fillna(dict.fromkeys(df22.columns, False))
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,App,Action,Adventure,Arcade,Art & Design,Auto & Vehicles,Beauty,Board,Books & Reference,Business,...,Trivia,Video Players & Editors,Weather,Word,Action & Adventure,Brain Games,Creativity,Education,Music & Video,Pretend Play
882,🔥 Football Wallpapers 4K | Full HD Backgrounds 😍,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4362,💎 I'm rich,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
6420,감성학원 BL 첫사랑,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9222,英漢字典 EC Dictionary,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
313,"漫咖 Comics - Manga,Novel and Stories",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5698,日本AV历史,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
10109,哈哈姆特不EY,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [29]:
#For each genre, compute the average rating. What is the genre with highest average?

#googlestore2 = pd.concat([googlestore, bridging_table], axis=1, sort=False) #add previously created columns
#googlestore2

In [30]:
#google_app_3 = app_genres.merge(google_app, on = "App")
googlestore2 = bridging_table.merge(googlestore, on = "App")

In [31]:


googlestore2.groupby('Primo Genere', as_index = False, sort = False)[['Rating']].mean()

Unnamed: 0,Primo Genere,Rating
0,Entertainment,4.121246
1,Tools,4.059922
2,Social,4.259649
3,Lifestyle,4.101911
4,Video Players & Editors,4.096891
5,Comics,4.112121
6,Education,4.330592
7,Books & Reference,4.350256
8,Communication,4.197052
9,Maps & Navigation,4.077941


In [35]:
print("the max value corresponds to the genre " + googlestore2.groupby('Primo Genere')['Rating'].mean().argmax())

the max value corresponds to the genre Word


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


In [36]:
googlestore

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,SizeInBytes
882,🔥 Football Wallpapers 4K | Full HD Backgrounds 😍,ENTERTAINMENT,4.7,11661,4.0M,1000000,Free,0,Everyone,Entertainment,"July 14, 2018",1.1.3.2,4.0.3,4000000.0
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,TOOLS,4.0,19,3.2M,10000,Free,0,Everyone,Tools,"October 21, 2017",1.0,4.2,3200000.0
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",SOCIAL,4.6,22098,18M,1000000,Free,0,Everyone,Social,"July 24, 2018",4.2.4,4.0.3,18000000.0
4362,💎 I'm rich,LIFESTYLE,3.8,718,26M,10000,Paid,$399.99,Everyone,Lifestyle,"March 11, 2018",1.0.0,4.4,26000000.0
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",VIDEO_PLAYERS,,414,59M,100000,Free,0,Mature 17+,Video Players & Editors,"July 18, 2018",4.0.7,4.0.3,59000000.0
6420,감성학원 BL 첫사랑,COMICS,4.4,190,34M,10000,Free,0,Everyone,Comics,"November 17, 2017",3.2.1,3.0,34000000.0
9222,英漢字典 EC Dictionary,FAMILY,4.3,55408,,1000000,Free,0,Everyone,Education,"January 8, 2018",,,
313,"漫咖 Comics - Manga,Novel and Stories",COMICS,4.1,12088,21M,1000000,Free,0,Mature 17+,Comics,"July 6, 2018",2.3.1,4.0.3,21000000.0
5698,日本AV历史,BOOKS_AND_REFERENCE,4.1,215,30M,10000,Free,0,Teen,Books & Reference,"March 6, 2018",1.2,4.0,30000000.0
10109,哈哈姆特不EY,COMMUNICATION,,239,18M,10000,Free,0,Everyone,Communication,"July 31, 2018",1.3.4,4.4,18000000.0


In [67]:
#For each app, compute the approximate income, obtain as a product of number of installs and price
price = pd.DataFrame()
install = pd.DataFrame()
install['Installs'] = googlestore['Installs'].astype(float)


In [68]:
price['Price'] = googlestore['Price'].str.replace(r'$','').astype(float)


In [74]:
income = pd.DataFrame()
income = googlestore


In [85]:
income['Income'] = price['Price']*install['Installs']
income

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,SizeInBytes,Income
882,🔥 Football Wallpapers 4K | Full HD Backgrounds 😍,ENTERTAINMENT,4.7,11661,4.0M,1000000.0,Free,0,Everyone,Entertainment,"July 14, 2018",1.1.3.2,4.0.3,4000000.0,0.0
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,TOOLS,4.0,19,3.2M,10000.0,Free,0,Everyone,Tools,"October 21, 2017",1.0,4.2,3200000.0,0.0
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",SOCIAL,4.6,22098,18M,1000000.0,Free,0,Everyone,Social,"July 24, 2018",4.2.4,4.0.3,18000000.0,0.0
4362,💎 I'm rich,LIFESTYLE,3.8,718,26M,10000.0,Paid,$399.99,Everyone,Lifestyle,"March 11, 2018",1.0.0,4.4,26000000.0,3999900.0
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",VIDEO_PLAYERS,,414,59M,100000.0,Free,0,Mature 17+,Video Players & Editors,"July 18, 2018",4.0.7,4.0.3,59000000.0,0.0
6420,감성학원 BL 첫사랑,COMICS,4.4,190,34M,10000.0,Free,0,Everyone,Comics,"November 17, 2017",3.2.1,3.0,34000000.0,0.0
9222,英漢字典 EC Dictionary,FAMILY,4.3,55408,,1000000.0,Free,0,Everyone,Education,"January 8, 2018",,,,0.0
313,"漫咖 Comics - Manga,Novel and Stories",COMICS,4.1,12088,21M,1000000.0,Free,0,Mature 17+,Comics,"July 6, 2018",2.3.1,4.0.3,21000000.0,0.0
5698,日本AV历史,BOOKS_AND_REFERENCE,4.1,215,30M,10000.0,Free,0,Teen,Books & Reference,"March 6, 2018",1.2,4.0,30000000.0,0.0
10109,哈哈姆特不EY,COMMUNICATION,,239,18M,10000.0,Free,0,Everyone,Communication,"July 31, 2018",1.3.4,4.4,18000000.0,0.0


In [86]:
income.loc[4362]

App                   💎 I'm rich
Category               LIFESTYLE
Rating                       3.8
Reviews                      718
Size                         26M
Installs                   10000
Type                        Paid
Price                    $399.99
Content Rating          Everyone
Genres                 Lifestyle
Last Updated      March 11, 2018
Current Ver                1.0.0
Android Ver                  4.4
SizeInBytes              2.6e+07
Income                3.9999e+06
Name: 4362, dtype: object

In [92]:
#For each app, compute its minimum and maximum Sentiment_polarity
polarity.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [95]:
polarity_grouped = polarity.groupby('App')
polarity_grouped.agg({'App' : np.sum, 'Sentiment_Polarity'  : [np.min, np.max]})

Unnamed: 0_level_0,App,Sentiment_Polarity,Sentiment_Polarity
Unnamed: 0_level_1,sum,amin,amax
App,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
10 Best Foods for You,10 Best Foods for You10 Best Foods for You10 B...,-0.800000,1.000000
104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室104 找工作 - 找工作 ...,-0.112500,0.910000
11st,11st11st11st11st11st11st11st11st11st11st11st11...,-1.000000,1.000000
1800 Contacts - Lens Store,1800 Contacts - Lens Store1800 Contacts - Lens...,-0.300000,0.838542
1LINE – One Line with One Touch,1LINE – One Line with One Touch1LINE – One Lin...,-0.825000,1.000000
2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif,2018Emoji Keyboard 😂 Emoticons Lite -sticker&g...,-0.800000,1.000000
21-Day Meditation Experience,21-Day Meditation Experience21-Day Meditation ...,-0.265625,0.587500
"2Date Dating App, Love and matching","2Date Dating App, Love and matching2Date Datin...",-0.645833,1.000000
2GIS: directory & navigator,2GIS: directory & navigator2GIS: directory & n...,-0.375000,1.000000
2RedBeans,2RedBeans2RedBeans2RedBeans2RedBeans2RedBeans2...,-0.800000,1.000000
