In [89]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as st
import re
import os
import pandas_profiling
import this

In [2]:
# Versions
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("re version:", re.__version__)

pandas version: 0.25.1
numpy version: 1.17.2
re version: 2.2.1


In [3]:
# Directories & Files
os.listdir()

# Datasets directory
directory = "./app-store-apple-data-set-10k-apps/"

In [5]:
# creating the DataFrames dynamically
# 1st step: store the names and filenames of the files as a key-value pair in a dictionary
datasets = {f"{re.sub('.csv', '', filename.lower())}": filename for filename in os.listdir(directory)}

# 2nd step: for each key in the datasets dictionary, create a DF
for name in datasets:
    print(name)
    globals()[name] = pd.read_csv(directory + datasets[name])

applestore
applestore_description


# Data Cleaning

In [10]:
applestore.head()
applestore.tail()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
7192,11081,1187617475,Kubik,126644224,USD,0.0,142,75,4.5,4.5,1.3,4+,Games,38,5,1,1
7193,11082,1187682390,VR Roller-Coaster,120760320,USD,0.0,30,30,4.5,4.5,0.9,4+,Games,38,0,1,1
7194,11087,1187779532,Bret Michaels Emojis + Lyric Keyboard,111322112,USD,1.99,15,0,4.5,0.0,1.0.2,9+,Utilities,37,1,1,1
7195,11089,1187838770,VR Roller Coaster World - Virtual Reality,97235968,USD,0.0,85,32,4.5,4.5,1.0.15,12+,Games,38,0,2,1
7196,11097,1188375727,Escape the Sweet Shop Series,90898432,USD,0.0,3,3,5.0,5.0,1.0,4+,Games,40,0,2,1


In [13]:
applestore_description.head()
applestore_description.tail()

Unnamed: 0,id,track_name,size_bytes,app_desc
7192,1187617475,Kubik,126644224,Place the falling blocks correctly in order to...
7193,1187682390,VR Roller-Coaster,120760320,A thrilling virtual reality roller coaster exp...
7194,1187779532,Bret Michaels Emojis + Lyric Keyboard,111322112,"Rock star Bret Michaels, winner of Celebrity A..."
7195,1187838770,VR Roller Coaster World - Virtual Reality,97235968,VR Roller Coaster World is an app for Google C...
7196,1188375727,Escape the Sweet Shop Series,90898432,5 previous escape games plus 1 new game in one...


In [17]:
applestore.shape
applestore_description.shape

(7197, 4)

In [19]:
applestore.info()
applestore_description.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7197 entries, 0 to 7196
Data columns (total 17 columns):
Unnamed: 0          7197 non-null int64
id                  7197 non-null int64
track_name          7197 non-null object
size_bytes          7197 non-null int64
currency            7197 non-null object
price               7197 non-null float64
rating_count_tot    7197 non-null int64
rating_count_ver    7197 non-null int64
user_rating         7197 non-null float64
user_rating_ver     7197 non-null float64
ver                 7197 non-null object
cont_rating         7197 non-null object
prime_genre         7197 non-null object
sup_devices.num     7197 non-null int64
ipadSc_urls.num     7197 non-null int64
lang.num            7197 non-null int64
vpp_lic             7197 non-null int64
dtypes: float64(3), int64(9), object(5)
memory usage: 956.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7197 entries, 0 to 7196
Data columns (total 4 columns):
id            7197 non-null int6

In [22]:
applestore.index.is_unique
applestore_description.index.is_unique

True

In [53]:
applestore["currency"].unique()

array(['USD'], dtype=object)

In [59]:
# renaming price to price_usd so that I may drop the currency column
applestore.rename(columns = {"price": "price_usd"}, inplace = True)
applestore

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price_usd,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.00,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.00,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.00,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.00,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7192,11081,1187617475,Kubik,126644224,USD,0.00,142,75,4.5,4.5,1.3,4+,Games,38,5,1,1
7193,11082,1187682390,VR Roller-Coaster,120760320,USD,0.00,30,30,4.5,4.5,0.9,4+,Games,38,0,1,1
7194,11087,1187779532,Bret Michaels Emojis + Lyric Keyboard,111322112,USD,1.99,15,0,4.5,0.0,1.0.2,9+,Utilities,37,1,1,1
7195,11089,1187838770,VR Roller Coaster World - Virtual Reality,97235968,USD,0.00,85,32,4.5,4.5,1.0.15,12+,Games,38,0,2,1


In [63]:
# dropping unnecessary columns 
applestore.drop(labels = ["Unnamed: 0", "currency"], axis = "columns", inplace = True)
applestore

Unnamed: 0,id,track_name,size_bytes,price_usd,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,281656475,PAC-MAN Premium,100788224,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,281796108,Evernote - stay organized,158578688,0.00,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,0.00,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,0.00,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,282935706,Bible,92774400,0.00,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7192,1187617475,Kubik,126644224,0.00,142,75,4.5,4.5,1.3,4+,Games,38,5,1,1
7193,1187682390,VR Roller-Coaster,120760320,0.00,30,30,4.5,4.5,0.9,4+,Games,38,0,1,1
7194,1187779532,Bret Michaels Emojis + Lyric Keyboard,111322112,1.99,15,0,4.5,0.0,1.0.2,9+,Utilities,37,1,1,1
7195,1187838770,VR Roller Coaster World - Virtual Reality,97235968,0.00,85,32,4.5,4.5,1.0.15,12+,Games,38,0,2,1


In [65]:
applestore["id"].is_unique
applestore_description["id"].is_unique

True

In [73]:
# merging both datasets on id.
apps = applestore.merge(applestore_description[["id", "app_desc"]], on = "id", how = "inner", 
                        validate = "one_to_one")

apps.shape
apps.tail()

Unnamed: 0,id,track_name,size_bytes,price_usd,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,app_desc
7192,1187617475,Kubik,126644224,0.0,142,75,4.5,4.5,1.3,4+,Games,38,5,1,1,Place the falling blocks correctly in order to...
7193,1187682390,VR Roller-Coaster,120760320,0.0,30,30,4.5,4.5,0.9,4+,Games,38,0,1,1,A thrilling virtual reality roller coaster exp...
7194,1187779532,Bret Michaels Emojis + Lyric Keyboard,111322112,1.99,15,0,4.5,0.0,1.0.2,9+,Utilities,37,1,1,1,"Rock star Bret Michaels, winner of Celebrity A..."
7195,1187838770,VR Roller Coaster World - Virtual Reality,97235968,0.0,85,32,4.5,4.5,1.0.15,12+,Games,38,0,2,1,VR Roller Coaster World is an app for Google C...
7196,1188375727,Escape the Sweet Shop Series,90898432,0.0,3,3,5.0,5.0,1.0,4+,Games,40,0,2,1,5 previous escape games plus 1 new game in one...


In [77]:
# setting the id as the index
apps.set_index("id", inplace = True)

In [82]:
apps.index.is_unique

True

## Questions:
### 1) Is there a relationship between the price and the user rating?
#### 2) Is there a relationship between genre and price?
#### 3) Can we infer the genre of an app by its price (or vice-versa)?

In [85]:
apps.head()

Unnamed: 0_level_0,track_name,size_bytes,price_usd,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,app_desc,title_len,desc_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
281656475,PAC-MAN Premium,100788224,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1,"SAVE 20%, now only $3.99 for a limited time!\n...",15,1533
281796108,Evernote - stay organized,158578688,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1,Let Evernote change the way you organize your ...,25,3952
281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1,Download the most popular free weather app pow...,47,2090
282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1,The eBay app is the best way to find anything ...,50,3997
282935706,Bible,92774400,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1,On more than 250 million devices around the wo...,5,2998


In [86]:
# saving the new dataset
apps.to_csv(directory + './complete_appstore.csv')

In [88]:
os.listdir("./app-store-apple-data-set-10k-apps/")

['AppleStore.csv', 'complete_appstore.csv', 'appleStore_description.csv']

In [90]:
pandas_profiling.ProfileReport(apps)

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …

