# Data set from Kaggle 
https://www.kaggle.com/lava18/google-play-store-apps/downloads/google-play-store-apps.zip/6

# Context
While many public datasets (on Kaggle and the like) provide Apple App Store data, there are not many counterpart datasets available for Google Play Store apps anywhere on the web. On digging deeper, I found out that the iTunes App Store page uses a nicely indexed, appendix-like structure that allows for simple and easy web scraping. On the other hand, the Google Play Store uses sophisticated modern-day techniques (like dynamic page loading) built on jQuery, which makes scraping more challenging.
# Content
Each app (row) has values for category, rating, size, and more.

In [29]:
import datetime
import math
import pdb
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import t, sem

In [30]:
# Hide deprecated warnings
import warnings
warnings.filterwarnings('ignore')

In [31]:
# Load both CSV files from a single folder. DATA_DIR is the one place to edit
# when the notebook runs on another machine; the files read are unchanged.
DATA_DIR = Path('/Users/AirMorena/Desktop/final_proj/csv')
data1 = pd.read_csv(DATA_DIR / 'googleplaystore.csv')                # one row per app listing
data2 = pd.read_csv(DATA_DIR / 'googleplaystore_user_reviews.csv')  # one row per user review

In [32]:
# Preview the first rows of the app table to check the columns loaded as expected
data1.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [33]:
# Number of distinct app names in data1 (a missing name, if any, counts once)
data1['App'].nunique(dropna=False)

9660

In [34]:
# Preview the first rows of the user-review table
data2.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [35]:
# How many distinct apps have text reviews in data2?
data2['App'].nunique(dropna=False)

1074

In [36]:
# How many distinct apps are listed in data1 (the app table)?
data1['App'].nunique(dropna=False)

9660

In [37]:
# Collect the distinct app names that appear in the review table (data2)
apps_revi = data2.loc[:, 'App'].unique()

In [38]:
# Display apps_revi: the array of unique app names taken from data2
apps_revi

array(['10 Best Foods for You', '104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室',
       '11st', ..., 'Hotwire Hotel & Car Rental App',
       'Housing-Real Estate & Property', 'Houzz Interior Design Ideas'],
      dtype=object)

In [39]:
# Count of distinct reviewed apps (apps_revi is one-dimensional, so size == length)
apps_revi.size

1074

In [40]:
# Turn the array of reviewed app names into a plain Python list
apps_revi2 = list(apps_revi)

In [41]:
# Keep only the rows of data1 whose App also appears in the review table;
# the result is stored as data3
data3 = data1.loc[data1['App'].isin(apps_revi2)]

In [18]:
# Preview data3: rows of data1 whose App is also present in data2
data3.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
14,3D Color Pixel by Number - Sandbox Art Coloring,ART_AND_DESIGN,4.4,1518,37M,"100,000+",Free,0,Everyone,Art & Design,"August 3, 2018",1.2.3,2.3 and up
18,FlipaClip - Cartoon animation,ART_AND_DESIGN,4.3,194216,39M,"5,000,000+",Free,0,Everyone,Art & Design,"August 3, 2018",2.2.5,4.0.3 and up
21,Boys Photo Editor - Six Pack & Men's Suit,ART_AND_DESIGN,4.1,654,12M,"100,000+",Free,0,Everyone,Art & Design,"March 20, 2018",1.1,4.0.3 and up


In [19]:
# Shape of data3: the data1 rows whose App appears in data2.
# The row count can exceed the number of distinct apps because data1
# lists some apps on more than one row.
data3.shape

(1532, 13)

In [23]:
# Number of distinct app names that survived the isin filter
data3['App'].nunique(dropna=False)

1020

In [20]:
# Export data3 (the filtered app table) to CSV without the index column
data3.to_csv(
    "/Users/AirMorena/Desktop/final_proj/csv/data3.csv",
    encoding='utf-8',
    index=False,
)

In [65]:
# Inner-merge the app table (data1) with the review table (data2) on App,
# keeping only apps present in both.
# NOTE(review): data1 lists some apps on several rows, so each review of such
# an app is repeated once per matching data1 row -- confirm this is intended
# or de-duplicate data1 on App before merging.
data_clean = data1.merge(data2, how='inner', on='App')

In [66]:
#data_clean.head()

In [67]:
# (rows, columns) of the merged frame: 17 variables and 122662 review rows
# in the recorded run
data_clean.shape

(122662, 17)

In [68]:
# Distinct app names in data1
data1['App'].nunique(dropna=False)

9660

In [69]:
# Distinct app names in data2 (text reviews)
data2['App'].nunique(dropna=False)

1074

In [70]:
# Distinct app names in data3, the frame produced by the isin filter
data3['App'].nunique(dropna=False)

1020

In [71]:
# Distinct apps after the inner merge of data1 and data2: apps that have
# reviews in data2 but no row in data1 are dropped (54 in the recorded run)
data_clean['App'].nunique(dropna=False)

1020

In [72]:
# Column dtypes and non-null counts of the merged frame
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122662 entries, 0 to 122661
Data columns (total 17 columns):
App                       122662 non-null object
Category                  122662 non-null object
Rating                    122622 non-null float64
Reviews                   122662 non-null object
Size                      122662 non-null object
Installs                  122662 non-null object
Type                      122662 non-null object
Price                     122662 non-null object
Content Rating            122662 non-null object
Genres                    122662 non-null object
Last Updated              122662 non-null object
Current Ver               122662 non-null object
Android Ver               122662 non-null object
Translated_Review         72605 non-null object
Sentiment                 72615 non-null object
Sentiment_Polarity        72615 non-null float64
Sentiment_Subjectivity    72615 non-null float64
dtypes: float64(3), object(14)
memory usage: 16.8+ MB


In [73]:
# Missing values per column in the merged frame
data_clean.isna().sum()

App                           0
Category                      0
Rating                       40
Reviews                       0
Size                          0
Installs                      0
Type                          0
Price                         0
Content Rating                0
Genres                        0
Last Updated                  0
Current Ver                   0
Android Ver                   0
Translated_Review         50057
Sentiment                 50047
Sentiment_Polarity        50047
Sentiment_Subjectivity    50047
dtype: int64

In [74]:
# Distinct translated review texts present in the merged frame
data_clean.loc[:, 'Translated_Review'].unique()

array(["A kid's excessive ads. The types ads allowed app, let alone kids",
       'It bad >:(', 'like', ...,
       'Recommended, 100% love it, keep good work dev :) :*',
       'Just allow time ...', "It's good best gallery phone"],
      dtype=object)

In [75]:
# How many distinct translated reviews? 26683 in the recorded run; the
# missing-review marker (NaN) is counted as one distinct value
data_clean['Translated_Review'].nunique(dropna=False)

26683

In [76]:
# How many distinct apps remain in the merged frame?
data_clean['App'].nunique(dropna=False)

1020

In [78]:
# Write the merged app + review frame to CSV without the index column
data_clean.to_csv(
    "/Users/AirMorena/Desktop/final_proj/csv/data_clean.csv",
    encoding='utf-8',
    index=False,
)