# Profitable App Profiles - App Store and Google Play Markets
*Objective* : Analyse the popularity of mobile apps by genre as an exercise to showcase Python knowledge.

# Step 1 (import): Read in Google Play Store and ios App data into Jupyter

In [16]:
from csv import reader

# Load both CSV exports fully into memory as lists of rows (row 0 is the header).
# The `with` blocks close each file handle once its rows have been read.
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks parse correctly; the explicit encoding avoids depending on the
# platform default.
with open('/users/caseyhahn/googleplaystore.csv', encoding='utf8', newline='') as android_file:
    read_android_file = reader(android_file)
    android_data = list(read_android_file)

with open('/users/caseyhahn/AppleStore.csv', encoding='utf8', newline='') as ios_file:
    read_ios_file = reader(ios_file)
    ios_data = list(read_ios_file)


In [23]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its overall dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence of values).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (measured on the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # print's own newline plus '\n' leaves two blank lines after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if len(dataset) else 0)

# Step 2 (prep): print out column headers to determine which columns will be useful for purpose of the analysis

In [27]:
# Show the header row and the first data row of each store's dataset.
for store_rows in (android_data, ios_data):
    explore_data(store_rows, 0, 2)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['', 'id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['1', '281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']




useful columns appear to be as follows: 

| Android columns | iOS columns |
| --------------- | ----------- |
| Category | price |
| Rating | rating_count_tot |
| Reviews | rating_count_ver |
| Installs | user_rating |
| Price | prime_genre |
| Content Rating | |
| Genres | |


# Step 3 (data cleanup): According to the Kaggle discussion section, the row at index 10473 is malformed: its 'Category' field is missing, so the remaining values are shifted one column to the left.
https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015

In [35]:
# Print the header next to the row flagged in the Kaggle discussion so the
# two can be compared field by field.
for row_index in (0, 10473):
    print(android_data[row_index])

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [38]:
# Remove the malformed row reported on Kaggle. The length check makes this cell
# safe to re-run: once the bad row is gone, index 10473 holds a well-formed row
# (same number of fields as the header) and must not be deleted.
if len(android_data) > 10473 and len(android_data[10473]) != len(android_data[0]):
    del android_data[10473]

# Step 4 (data cleanup): look for duplicate app names in the google play store data set

In [59]:
# Split app names into first occurrences and repeats.
unique_apps = []
duplicate_apps = []
seen_names = set()  # O(1) membership test; unique_apps keeps first-seen order

for app in android_data[1:]:  # skip the header row
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('The number of duplicates is: ', len(duplicate_apps))
print(duplicate_apps[0:5])

Join_me_apps = []  # NOTE(review): never filled or read in the visible cells

print(android_data[0])

# Print every row of one duplicated app to see how its copies differ.
for apps in android_data:
    if apps[0] == 'Quick PDF Scanner + OCR FREE':
        print(apps)
        

The number of duplicates is:  1180
['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']


# Step 5 (remove duplicate records): As shown in the last code cell output, there are 1,180 duplicate records in the Google Play Store data. The duplicate records vary only by the number of reviews. For example, the app "Quick PDF Scanner + OCR FREE" appears in 3 separate rows and the only difference between the rows is the review count. The code below iterates through the Android data and, for any app with duplicates, retains only the record with the most reviews.


In [67]:
# Pass 1: record the highest review count seen for each app name.
app_max_review = {}

for apps in android_data[1:]:  # skip the header row
    app_name = apps[0]
    app_review_count = float(apps[3])
    if app_name not in app_max_review or app_review_count > app_max_review[app_name]:
        app_max_review[app_name] = app_review_count

print(len(app_max_review))

# Pass 2: keep exactly one row per app -- the first row whose review count
# equals that app's maximum. Checking the name as well matters because
# several rows of the same app can share the maximum review count.
android_data_clean = []
android_already_added = []
added_names = set()  # O(1) membership test alongside the ordered list

for app in android_data[1:]:
    name = app[0]
    if name in added_names:
        continue
    if float(app[3]) == app_max_review[name]:
        android_data_clean.append(app)
        android_already_added.append(name)
        added_names.add(name)

print(len(android_data_clean))
print(android_data_clean[0:4])
        


9659
9659
[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']]


# Step 6(remove non-english apps) 

In [80]:
def check_english_name(input_name):
    """Return True when ``input_name`` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    Tolerating up to three of them lets names that contain a few symbols
    or emoji still pass.
    """
    non_ascii_total = sum(1 for symbol in input_name if ord(symbol) > 127)
    return non_ascii_total <= 3

# Sanity checks for check_english_name. Expected results, in order:
#   True  - every character is ASCII
#   False - more than three characters have a code point above 127
#   True  - only one character (the emoji) is above 127
# As bare expressions, only the last call's value appears in the cell output.
check_english_name('Instagram')
check_english_name('爱奇艺PPS -《欢乐颂2》电视剧热播')
check_english_name('Instachat 😜')


True

In [91]:
# Keep only the de-duplicated apps whose name (column 0) passes the
# English-name check.
android_data_clean_english = [
    record for record in android_data_clean if check_english_name(record[0])
]

print(android_data_clean_english[0:4])
len(android_data_clean_english)
        


[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']]


9614

Results from step 6: removed 45 non-English mobile apps from the data => 9659 - 9614 = 45.

In [99]:
# Partition the English apps by the 'Type' column (index 6): rows marked
# 'Free' go to one list, every other row to the other.
android_data_clean_english_free = []
android_data_clean_english_notFree = []

print(android_data_clean_english[0][6])

for record in android_data_clean_english:
    if record[6] == 'Free':
        bucket = android_data_clean_english_free
    else:
        bucket = android_data_clean_english_notFree
    bucket.append(record)

print(len(android_data_clean_english_free))
print(len(android_data_clean_english_notFree))

    

Free
8863
751


# Step 7: begin to analyze frequency tables for popular app genres for both ios and android.

In [141]:
def freq_table(data_set, index):
    """Build a percentage frequency table for one column of ``data_set``.

    Each distinct value found at position ``index`` of a row is mapped to the
    share of rows holding it, expressed as a percentage of ``len(data_set)``
    and rounded to two decimals. Every row passed in is counted, so callers
    should exclude any header row beforehand.
    """
    tallies = {}
    for record in data_set:
        cell = record[index]
        tallies[cell] = tallies.get(cell, 0) + 1

    row_total = len(data_set)
    return {
        cell: round(tally / row_total * 100, 2)
        for cell, tally in tallies.items()
    }

def display_table(dataset, index):
    """Print the frequency table of column ``index``, largest share first.

    Each line has the form ``value : percentage``; the percentages come
    from ``freq_table(dataset, index)``.
    """
    shares = freq_table(dataset, index)
    # (percentage, value) pairs sort by percentage first, with ties broken
    # by the value itself.
    ranked = [(shares[label], label) for label in shares]
    ranked.sort(reverse=True)
    for share, label in ranked:
        print(label, ':', share)

display_table(android_data_clean_english_free, 9) #Genre

# Genres counted toward each group. Their shares are looked up in the
# frequency table rather than typed in by hand, so the totals stay correct
# if the data or the cleaning steps change.
FUN_GENRES = ('Entertainment', 'Sports', 'Photography', 'Social', 'Dating',
              'Arcade', 'Video Players & Editors', 'Casual', 'Puzzle',
              'Racing', 'Role Playing')
PRACTICAL_GENRES = ('Tools', 'Education', 'Business', 'Productivity',
                    'Lifestyle', 'Medical', 'Finance', 'Health & Fitness')

genre_share = freq_table(android_data_clean_english_free, 9)
print("The percentage of fun related apps are:")
# .get(..., 0) tolerates a genre that is absent from the data; rounding hides
# floating-point noise from adding two-decimal percentages.
print(round(sum(genre_share.get(genre, 0) for genre in FUN_GENRES), 2))
print("The percentage of practical related apps are:")
print(round(sum(genre_share.get(genre, 0) for genre in PRACTICAL_GENRES), 2))

Tools : 8.45
Entertainment : 6.07
Education : 5.35
Business : 4.59
Productivity : 3.89
Lifestyle : 3.89
Finance : 3.7
Medical : 3.53
Sports : 3.46
Personalization : 3.32
Communication : 3.24
Action : 3.1
Health & Fitness : 3.08
Photography : 2.94
News & Magazines : 2.8
Social : 2.66
Travel & Local : 2.32
Shopping : 2.25
Books & Reference : 2.14
Simulation : 2.04
Dating : 1.86
Arcade : 1.85
Video Players & Editors : 1.77
Casual : 1.76
Maps & Navigation : 1.4
Food & Drink : 1.24
Puzzle : 1.13
Racing : 0.99
Role Playing : 0.94
Libraries & Demo : 0.94
Auto & Vehicles : 0.93
Strategy : 0.9
House & Home : 0.82
Weather : 0.8
Events : 0.71
Adventure : 0.68
Comics : 0.61
Beauty : 0.6
Art & Design : 0.6
Parenting : 0.5
Card : 0.45
Casino : 0.43
Trivia : 0.42
Educational;Education : 0.39
Board : 0.38
Educational : 0.37
Education;Education : 0.34
Word : 0.26
Casual;Pretend Play : 0.24
Music : 0.2
Racing;Action & Adventure : 0.17
Puzzle;Brain Games : 0.17
Entertainment;Music & Video : 0.17
Casual;B

In [108]:
# Frequency table of the Category column (index 1) for the free English Android apps.
display_table(android_data_clean_english_free, 1) #Category



FAMILY : 18.9
GAME : 9.73
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6


## Step 8: Analysis of Google Play data 
 
 (Genre)
- The most common genre is "Tools" with 8.45% proportion of all free,english Android apps in the app market
- Social network apps make up only 2.66% of all Android apps 
- roughly 25% of apps are designed for fun (games, entertainment, photo and video, social networking, sports, music)
- roughly 40% of apps are designed for practical purposes( productivity, tools, business, education)

(Category)
- The most common Category is "Family" with 18.9% proportion of all free, English Android apps in the app market
- Social network apps make up only 2.66% of all Android apps 
- roughly 25% of apps are designed for fun (games, entertainment, photo and video, social networking, sports, music)
- roughly 40% of apps are designed for practical purposes( productivity, tools, business, education)

**Based on the raw frequency of apps in the Google Play market, it is recommended to develop an app designed for productivity usage, such as a tool. However, consider looking at the number of users in addition to raw frequency to build a better story.**

In [138]:
# Keep iOS apps that are free (price column, index 5, equals 0) and whose
# name (track_name, index 2) passes the English-name check.
ios_data_free_english = [
    app for app in ios_data[1:]  # skip the header row
    if float(app[5]) == 0 and check_english_name(app[2])
]

display_table(ios_data_free_english,12) #prime_genre

# Genres counted as "fun". Their shares are read from the frequency table
# rather than typed in by hand, so the total follows the data.
IOS_FUN_GENRES = ('Games', 'Entertainment', 'Photo & Video',
                  'Social Networking', 'Sports', 'Music')
ios_genre_share = freq_table(ios_data_free_english, 12)
print('Total percent of apps related to fun: ')
# Rounding hides floating-point noise from adding two-decimal percentages.
print(round(sum(ios_genre_share.get(genre, 0) for genre in IOS_FUN_GENRES), 2))


Games : 58.16
Entertainment : 7.88
Photo & Video : 4.97
Education : 3.66
Social Networking : 3.29
Shopping : 2.61
Utilities : 2.51
Sports : 2.14
Music : 2.05
Health & Fitness : 2.02
Productivity : 1.74
Lifestyle : 1.58
News : 1.33
Travel : 1.24
Finance : 1.12
Weather : 0.87
Food & Drink : 0.81
Reference : 0.56
Business : 0.53
Book : 0.43
Navigation : 0.19
Medical : 0.19
Catalogs : 0.12
Total percent of apps related to fun: 
78.49




# Step 8: Analysis of ios Data prime-genre
- The most common genre is "Games" with 58.16% proportion of all free,english ios apps in the app market
- Social network apps make up only 3.29% of all ios apps 
- 78.49% of apps are designed for fun (games, entertainment, photo and video, social networking, sports, music) 

**Based on raw frequency of apps in the ios market it is recommended to develop an app designed for fun/recreation usage such as a game. However, consider looking at the number of users in addition to raw frequency to build a better story.** 

# Step 9: Analysis of average installs / user reviews for ios and Android markets: 
- ios data: use "rating_count_tot" (index = 6)
- android data: use "installs" (index = 5) 

In [159]:
ios_rating_ft = freq_table(ios_data_free_english,12)

# Accumulate the per-genre rating totals and app counts in one pass over the
# rows, instead of rescanning the whole dataset once per genre.
genre_rating_totals = {}
genre_app_counts = {}
for each_row in ios_data_free_english:
    genre = each_row[12]  # prime_genre
    rating_count = float(each_row[6])  # rating_count_tot
    genre_rating_totals[genre] = genre_rating_totals.get(genre, 0) + rating_count
    genre_app_counts[genre] = genre_app_counts.get(genre, 0) + 1

# Average number of ratings per app, keyed by genre (same key order as the
# frequency table).
prime_genre_average_reviews = {
    genre: genre_rating_totals[genre] / genre_app_counts[genre]
    for genre in ios_rating_ft
}

table = prime_genre_average_reviews

# (average, genre) pairs so that sorting orders by the average first.
table_display = [(table[key], key) for key in table]

table_sorted = sorted(table_display, reverse = True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])


Navigation : 86090.33333333333
Reference : 74942.11111111111
Social Networking : 71548.34905660378
Music : 57326.530303030304
Weather : 52279.892857142855
Book : 39758.5
Food & Drink : 33333.92307692308
Finance : 31467.944444444445
Photo & Video : 28441.54375
Travel : 28243.8
Shopping : 26919.690476190477
Health & Fitness : 23298.015384615384
Sports : 23008.898550724636
Games : 22788.6696905016
News : 21248.023255813954
Productivity : 21028.410714285714
Utilities : 18684.456790123455
Lifestyle : 16485.764705882353
Entertainment : 14029.830708661417
Business : 7491.117647058823
Education : 7003.983050847458
Catalogs : 4004.0
Medical : 612.0


# Step 9: ios Data average number of reviews by Prime_genre
- the most reviewed category is Navigation with 86,090 reviews per app
- the second most reviewed category is Reference with 74,942 reviews per app
- the third  most reviewed category is Social Networking with 71,548 reviews per app
- Games are the most common app based on raw frequency but average reviews are only 22,788

Based on the average number of reviews, we recommend building a Social Networking or Reference app.

In [153]:
# Percentage of free English Android apps in each install bracket (column 5).
Android_rating_ft = freq_table(android_data_clean_english_free,5) #installs

# Bare expression: the notebook displays the resulting dictionary.
Android_rating_ft

{'10,000+': 10.2,
 '5,000,000+': 6.83,
 '50,000,000+': 2.3,
 '100,000+': 11.55,
 '50,000+': 4.77,
 '1,000,000+': 15.73,
 '10,000,000+': 10.55,
 '5,000+': 4.51,
 '500,000+': 5.56,
 '1,000,000,000+': 0.23,
 '100,000,000+': 2.13,
 '1,000+': 8.39,
 '500,000,000+': 0.27,
 '500+': 3.25,
 '100+': 6.92,
 '50+': 1.92,
 '10+': 3.54,
 '1+': 0.51,
 '5+': 0.79,
 '0+': 0.05}

In [161]:
Android_rating_ft = freq_table(android_data_clean_english_free,9) #genre

# Accumulate the per-genre install totals and app counts in one pass over the
# rows, instead of rescanning the whole dataset once per genre.
genre_install_totals = {}
genre_app_counts = {}
for each_row in android_data_clean_english_free:
    genre = each_row[9]
    # Installs are bracket strings such as '1,000,000+'; dropping '+' and ','
    # leaves the bracket's lower bound as a number.
    installs = float(each_row[5].replace('+', '').replace(',', ''))
    genre_install_totals[genre] = genre_install_totals.get(genre, 0) + installs
    genre_app_counts[genre] = genre_app_counts.get(genre, 0) + 1

# Average installs per app, keyed by genre. The variable name still says
# "reviews" so existing references keep working.
prime_genre_average_reviews_android = {
    genre: genre_install_totals[genre] / genre_app_counts[genre]
    for genre in Android_rating_ft
}

table = prime_genre_average_reviews_android

# (average, genre) pairs so that sorting orders by the average first.
table_display = [(table[key], key) for key in table]

table_sorted = sorted(table_display, reverse = True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])

# List the apps in the 'Communication' genre.
for each_row in android_data_clean_english_free:
    if each_row[9] == 'Communication':
        print(each_row[0])

        
        

Communication : 38456119.167247385
Adventure;Action & Adventure : 35333333.333333336
Video Players & Editors : 24947335.796178345
Social : 23253652.127118643
Arcade : 22888365.48780488
Casual : 19569221.602564104
Puzzle;Action & Adventure : 18366666.666666668
Photography : 17840110.40229885
Educational;Action & Adventure : 17016666.666666668
Productivity : 16787331.344927534
Racing : 15910645.681818182
Travel & Local : 14051476.145631067
Casual;Action & Adventure : 12916666.666666666
Action : 12603588.872727273
Strategy : 11339901.3125
Tools : 10802461.246995995
Tools;Education : 10000000.0
Role Playing;Brain Games : 10000000.0
Lifestyle;Pretend Play : 10000000.0
Casual;Music & Video : 10000000.0
Card;Action & Adventure : 10000000.0
Adventure;Education : 10000000.0
News & Magazines : 9549178.467741935
Music : 9445583.333333334
Educational;Pretend Play : 9375000.0
Puzzle;Brain Games : 9280666.666666666
Word : 9094458.695652174
Racing;Action & Adventure : 8816666.666666666
Books & Refere

# step 9: Android average number of installs by Genre

- The genre with the highest average installs is Communication, with 38,456,119 installs per app. Gmail and WhatsApp are included in this genre, each with over 1,000,000,000 installs.
- "Adventure;Action & Adventure" has the second-highest average, with 35,333,333 installs per app.

Based on the average number of installs, we recommend developing a Communication app or an action/adventure game app.

# Step 10: Overall recommendation for App development

In order to determine which mobile apps will likely be successful on both the Android and iOS marketplaces, we summarize the app saturation and popularity statistics from above.

Summary of ios App: 

| Ranking - app saturation | Ranking - number of reviews |
| ----- | -----| 
| Games : 58.16 |Navigation : 86,090 |
| Entertainment : 7.88 | Reference : 74,942|
| Photo & Video : 4.97 | Social Networking : 71,548 |
| Education : 3.66 | Music : 57,326 |
| Social Networking : 3.29 | Weather : 52,279 |

Summary of Android App: 

| Ranking - app saturation | Ranking - number of installs |
| ----- | -----| 
| FAMILY : 18.9 | Communication : 38,456,119 |
| GAME : 9.73 | Adventure;Action & Adventure : 35,333,333 |
| TOOLS : 8.46 | Video Players & Editors : 24,947,335 |
| BUSINESS : 4.59 | Social : 23,253,652 |
| LIFESTYLE : 3.9 | Arcade : 22,888,365 |


>**Recommendation: Action/Adventure Game would be the ideal app to develop for both android and ios. Games are the #1 and #2 most saturated app on the ios and Android marketplaces as well as 3 of the top 5 most installed on the Android market. Action/Adventure is the second most installed app on the Android market.**







