#### DATA SCIENCE PROJECT - @devalissonalves

##### Profitable App Profiles for the App Store and Google Play Markets 

In [42]:
def explore_data(dataset, start, end, rows_and_columns = False):
    
    dataset_slice = dataset[start:end]    
    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [43]:
# OPEN DATASETS (APPSTORE AND GOOGLEPLAY)
from csv import reader

#APPLE DATASET
opened_file_apple = open('./dataset/AppleStore.csv')
read_file_apple = reader(opened_file_apple)
apps_data_apple = list(read_file_apple)
header_apple = apps_data_apple[0]

# GOOGLE DATASET
opened_file_google = open('./dataset/googleplaystore.csv')
read_file_google = reader(opened_file_google)
apps_data_google = list(read_file_google)
header_google = apps_data_google[0]

explore_data(apps_data_apple[1:], 0, 2, True)
explore_data(apps_data_google[1:], 0, 2, True)

print(header_apple)
print(len(header_google))


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
13


In [44]:
# Deleting Wrong Data

for row in apps_data_google[1:]:
    if len(row) != len(header_google):
        print(row)
        print('\n')
        print("Index position is:", apps_data_google.index(row))
        
        # deleting index with missing data
        index = apps_data_google.index(row)
        del apps_data_google[index]
        print(f'Index {index} has been deleted!')


['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Index position is: 10473
Index 10473 has been deleted!


In [45]:
# Removing Duplicate Entries: Part One

# first moment - Identifying some duplicate applications
# In this case, for example, we found 4 pieces of evidence from the Facebook application that were possibly collected at different times as they contain different evaluation numbers.

# Finding the total number of repeated applications.

duplicate_apps = []
unique_apps = []

for app in apps_data_google:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

print('Number of duplicate apps: ', len(duplicate_apps))
print('Examples of duplicate apps: ', duplicate_apps[:10])

Number of duplicate apps:  1181
Examples of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [46]:
# From this point on, the strategy will be to leave in the dataset only the instance of applications that contain the highest number of evaluations.


# PART 1 - Create a dictionary where each key is a unique app name and the corresponding dictionary value is the highest number of reviews of that app

reviews_max = {}

for app in apps_data_google[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews


# PART 2 - Use the dictionary you created above to remove the duplicate rows:

data_google_clean = []
already_added = []

for app in apps_data_google[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        data_google_clean.append(app)
        already_added.append(name)



In [47]:
# Removing Non-English Apps: Part One

def string_analysis(name_app):
    count_word = 0
    for word in name_app:
        if ord(word) > 127:
            count_word += 1

    if(count_word > 3):
        return False
    else:
        return True

english_apps = []
non_english_apps = []
google_apps_english = []

for app in data_google_clean:
    name = app[0]
    if string_analysis(name):
        english_apps.append(name)
        google_apps_english.append(app)
    else:
        non_english_apps.append(name)

print(len(english_apps))
print('\n')
print(len(non_english_apps))
print("\n")
print(len(google_apps_english))

9614


45


9614


In [48]:
# Isolating the Free Apps

free_apps_apple = []
free_apps_google = []

for app in apps_data_apple[1:]:
    if app[4] == '0' or app[4] == '0.0':
        free_apps_apple.append(app)


for app in google_apps_english:
    if app[7] == '0' or app[7] == '0.0':
        free_apps_google.append(app)

print(len(free_apps_apple))
print(len(free_apps_google))

4056
8864


In [49]:
# Most Common Apps by Genre: Part One

# Goal: The idea is to design applications that are well received by both audiences (iOS and Google). Therefore, the strategy is to identify which application genres are most common in both markets.

# It was observed that gender columns from both sets can be observed in the datasets. On Google index 9 and 1 and on Apple index 11

# index 11
for app in header_apple:
    print(app)

print('\n\n')

#index 9 e index 1 - Category and genres
for app in header_google:
    print(app)

id
track_name
size_bytes
currency
price
rating_count_tot
rating_count_ver
user_rating
user_rating_ver
ver
cont_rating
prime_genre
sup_devices.num
ipadSc_urls.num
lang.num
vpp_lic



App
Category
Rating
Reviews
Size
Installs
Type
Price
Content Rating
Genres
Last Updated
Current Ver
Android Ver


In [50]:
# Most Common Apps by Genre: Part Two

def freq_table(dataset, index):
    freq_table_dict = {}
    total = 0

    for app in dataset:
        # Count the total number of applications read in the dataset
        total += 1

        # Structure that creates frequency table logic
        if app[index] in freq_table_dict:
            freq_table_dict[app[index]] += 1
        else:
            freq_table_dict[app[index]] = 1

    # based on the sum of the total number of applications, a new table containing the percentages is generated.
    table_percentages = {}
    for key in freq_table_dict:
        percentage = (freq_table_dict[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

# isplay_table(free_apps_google, 9)
# display_table(free_apps_google, 1)
# display_table(free_apps_apple[1:], 11)

In [51]:
# Most Common Apps by Genre: Part Three

# - Analisando o dataset do Google Play, podemos observar os aplicatos de ferramentas como principal app mais comum. Seguido de aplicativos para entretenimento e educação. 

# - Analisando o dataset da Apple, podemos observar um expressivo percentual (+55%) para aplicativos do gênero Games, sendo seguido pelo gênero de entretenimento. 

# - Dentre os principais 10 aplicativos do Google Play e da Apple Store em comum, podemos descatar uma tendência de utilização para aplicativos de entretenimento, educação, estilo de vida, finaças e esportes. 



# Analise a tabela de frequência gerada para a coluna Categorye Genresdo conjunto de dados do Google Play

# - Podemos observar que os gêneros mais comuns são aplicativos voltados para a Família, Games, Ferramentas, Negócios e Estilo de vida
# -  Dentre os principais 10 aplicativos do Google Play e da Apple Store em comum, podemos descatar uma tendência de utilização para aplicativos de entretenimento, educação, estilo de vida, finaças e esportes. 

# - Para saber a quantidade de usuários por gênero precisamos fazer novas análises.


In [52]:
display_table(free_apps_google, 1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [53]:
display_table(free_apps_google, 9)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

In [54]:
# Most Popular Apps by Genre on the App Store

prime_genre_dict = freq_table(free_apps_apple, -5) 

for genre in prime_genre_dict:
    total = 0
    len_genre = 0
    for app in free_apps_apple:
        genre_app = app[-5]
        if genre_app == genre:
            n_ratings = float(app[5])
            total += n_ratings
            len_genre += 1
    avg_n_ratings = total / len_genre
    print(genre, ':', avg_n_ratings)
    

Social Networking : 53078.195804195806
Photo & Video : 27249.892215568863
Games : 18924.68896765618
Music : 56482.02985074627
Reference : 67447.9
Health & Fitness : 19952.315789473683
Weather : 47220.93548387097
Utilities : 14010.100917431193
Travel : 20216.01785714286
Shopping : 18746.677685950413
News : 15892.724137931034
Navigation : 25972.05
Lifestyle : 8978.308510638299
Entertainment : 10822.961077844311
Food & Drink : 20179.093023255813
Sports : 20128.974683544304
Book : 8498.333333333334
Finance : 13522.261904761905
Education : 6266.333333333333
Productivity : 19053.887096774193
Business : 6367.8
Catalogs : 1779.5555555555557
Medical : 459.75


In [55]:
# Most Popular Apps by Genre on Google Play

display_table(free_apps_google, 5)
categories_android = freq_table(data_google_clean, 1)

for category in categories_android:
    total = 0
    len_category = 0
    for app in data_google_clean:
        category_app = app[1]
        if category_app == category:            
            n_installs = app[5]
            n_installs = n_installs.replace(',', '')
            n_installs = n_installs.replace('+', '')
            total += float(n_installs)
            len_category += 1
    avg_n_installs = total / len_category
    print(category, ':', avg_n_installs)


1,000,000+ : 15.726534296028879
100,000+ : 11.552346570397113
10,000,000+ : 10.548285198555957
10,000+ : 10.198555956678701
1,000+ : 8.393501805054152
100+ : 6.915613718411552
5,000,000+ : 6.825361010830325
500,000+ : 5.561823104693141
50,000+ : 4.7721119133574
5,000+ : 4.512635379061372
10+ : 3.5424187725631766
500+ : 3.2490974729241873
50,000,000+ : 2.3014440433213
100,000,000+ : 2.1322202166064983
50+ : 1.917870036101083
5+ : 0.78971119133574
1+ : 0.5076714801444043
500,000,000+ : 0.2707581227436823
1,000,000,000+ : 0.22563176895306858
0+ : 0.04512635379061372
0 : 0.01128158844765343
ART_AND_DESIGN : 1856362.2950819673
AUTO_AND_VEHICLES : 625061.305882353
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 7504367.459459459
BUSINESS : 1659916.3452380951
COMICS : 803234.8214285715
COMMUNICATION : 35042146.82857143
DATING : 828971.2176470588
EDUCATION : 1775252.3364485982
ENTERTAINMENT : 11375402.298850575
EVENTS : 249580.640625
FINANCE : 1319851.4028985507
FOOD_AND_DRINK : 1891060.2767