
## Profitable App Profiles for the App Store and Play Store

- The project is about analyzing free apps in the App Store and Play Store
- Find out which kind of apps bring most of the revenue for in-app ads



In [46]:
from csv import reader

### The Google Play data set ###
opened_file = open('googleplaystore.csv')
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]

### The App Store data set ###
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]


In [47]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [48]:
print(android_header)
print('n')
explore_data(android, 0, 4, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
n
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10841
Number of columns: 13


In [49]:
print(android[10472])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [50]:
print(len(android))
del android[10472]
print(len(android))

10841
10840


## The Google Play data set has duplicate entries

> There some entries in the which are duplicated in the Play Store data set. Take a look at the duplicate entries for Twitter and Snapchat.

In [51]:
for app in android:
    name = app[0]
    if name == 'Twitter':
        print(app)
    elif name == 'Snapchat':
        print(app)

['Snapchat', 'SOCIAL', '4.0', '17014787', 'Varies with device', '500,000,000+', 'Free', '0', 'Teen', 'Social', 'July 30, 2018', 'Varies with device', 'Varies with device']
['Snapchat', 'SOCIAL', '4.0', '17014705', 'Varies with device', '500,000,000+', 'Free', '0', 'Teen', 'Social', 'July 30, 2018', 'Varies with device', 'Varies with device']
['Snapchat', 'SOCIAL', '4.0', '17015352', 'Varies with device', '500,000,000+', 'Free', '0', 'Teen', 'Social', 'July 30, 2018', 'Varies with device', 'Varies with device']
['Twitter', 'NEWS_AND_MAGAZINES', '4.3', '11667403', 'Varies with device', '500,000,000+', 'Free', '0', 'Mature 17+', 'News & Magazines', 'August 6, 2018', 'Varies with device', 'Varies with device']
['Twitter', 'NEWS_AND_MAGAZINES', '4.3', '11667403', 'Varies with device', '500,000,000+', 'Free', '0', 'Mature 17+', 'News & Magazines', 'August 6, 2018', 'Varies with device', 'Varies with device']
['Snapchat', 'SOCIAL', '4.0', '17000166', 'Varies with device', '500,000,000+', 'Fre

## Here we see the duplicate and unique entries

In [52]:
unique_entries = []
duplicate_entries = []

for app in android:
    name = app[0]
    if name in unique_entries:
        duplicate_entries.append(name)
    else:
        unique_entries.append(name)
        
print('unique_entries: {}'.format(len(unique_entries)))
print('duplicate_entries: {}'.format(len(duplicate_entries)))

unique_entries: 9659
duplicate_entries: 1181


## Criterion for removing the duplicates: 
> The higher the number of reviews, the more recent the data should be. Rather than removing duplicates randomly, we'll only keep the row with the highest number of reviews and remove the other entries for any given app.

In [53]:
reviews_max = {}
for app in android:
    name = app[0]
    n_reviews = float(app[2])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
        
print('9.659 entries expected and we have: {}'.format(len(reviews_max)))

9.659 entries expected and we have: 9659


* Create two lists to store clean data set and name of duplicate entries
* Iterate through Play Store data set and implement logic to add values to the lists

In [54]:
android_clean = []
already_added = []

for app in android:
    name = app[0]
    n_reviews = float(app[2])
    
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name) # make sure this is inside the if block

In [55]:
explore_data(android_clean, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 8196
Number of columns: 13


### Check for non english apps

爱奇艺PPS -《欢乐颂2》电视剧热播

【脱出ゲーム】絶対に最後までプレイしないで 〜謎解き＆ブロックパズル〜

中国語 AQリスニング

لعبة تقدر تربح DZ

We're not interested in keeping these kind of apps, so we'll remove them. One way to go about this is to remove each app whose name contains a symbol that is not commonly used in English text — English text usually includes letters from the English alphabet, numbers composed of digits from 0 to 9, punctuation marks (., !, ?, ;, etc.), and other symbols (+, *, /, etc.).

All these characters that are specific to English texts are encoded using the ASCII standard. Each ASCII character has a corresponding number between 0 and 127 associated with it, and we can take advantage of that to build a function that checks an app name and tells us whether it contains non-ASCII characters.

To minimize the impact of data loss, we'll only remove an app if its name has more than three characters with corresponding numbers falling outside the ASCII range. This means all English apps with up to three emoji or other special characters will still be labeled as English.

In [56]:
def check_for_non_english_characters(input):
    count_non_english_characters = 0
    for character in input:
        if ord(character) > 127:
            count_non_english_characters += 1
    if count_non_english_characters > 3:
        return False
    return True

In [57]:
check_for_non_english_characters('Instagram')

True

In [58]:
check_for_non_english_characters('爱奇艺PPS -《欢乐颂2》电视剧热播')

False

In [59]:
check_for_non_english_characters('Docs To Go™ Free Office Suite')

True

In [60]:
check_for_non_english_characters('Instachat 😜')

True

Use the new function to filter out non-English apps from both data sets. 
Loop through each data set. If an app name is identified as English, append the whole row to a separate list.
Explore the data sets and see how many rows you have remaining for each data set.

### English apps for Android

In [61]:
english_android_apps = []

for app in android:
    name = app[0]
    if check_for_non_english_characters(name):
        english_android_apps.append(app)

explore_data(english_android_apps, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10795
Number of columns: 13


### English apps for iOS

In [62]:
english_ios_apps = []

for app in ios:
    name = app[0]
    if check_for_non_english_characters(name):
        english_ios_apps.append(app)

explore_data(english_ios_apps, 0, 3, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


We only build apps that are free to download and install, and our main source of revenue consists of in-app ads. Our data sets contain both free and non-free apps; we'll need to isolate only the free apps for our analysis.

### Free Android apps

In [63]:
free_android_apps = []

for app in english_android_apps:
    price = app[7]
    if price == '0':
        free_android_apps.append(app)
        
explore_data(free_android_apps, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 9999
Number of columns: 13


### Free iOS apps

In [64]:
free_ios_apps = []

for app in english_ios_apps:
    price = app[4]
    if price == '0.0':
        free_ios_apps.append(app)
        
explore_data(free_ios_apps, 0, 3, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 4056
Number of columns: 16


The need for an app for both markets iOS and Android. Because we are looking for increasing user numbers. Let's begin the analysis by getting a sense of what are the most common genres for each market. For this, we'll need to build frequency tables for a few columns in our data sets.

In [65]:
def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages
    

In [66]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [67]:
display_table(free_ios_apps, -5)

Games : 55.64595660749507
Entertainment : 8.234714003944774
Photo & Video : 4.117357001972387
Social Networking : 3.5256410256410255
Education : 3.2544378698224854
Shopping : 2.983234714003945
Utilities : 2.687376725838264
Lifestyle : 2.3175542406311638
Finance : 2.0710059171597637
Sports : 1.947731755424063
Health & Fitness : 1.8737672583826428
Music : 1.6518737672583828
Book : 1.6272189349112427
Productivity : 1.5285996055226825
News : 1.4299802761341223
Travel : 1.3806706114398422
Food & Drink : 1.0601577909270217
Weather : 0.7642998027613412
Reference : 0.4930966469428008
Navigation : 0.4930966469428008
Business : 0.4930966469428008
Catalogs : 0.22189349112426035
Medical : 0.19723865877712032


In [68]:
display_table(free_android_apps, 1) # Category

FAMILY : 17.67176717671767
GAME : 10.591059105910592
TOOLS : 7.640764076407641
BUSINESS : 4.45044504450445
PRODUCTIVITY : 3.95039503950395
SPORTS : 3.6003600360036003
LIFESTYLE : 3.5903590359035906
COMMUNICATION : 3.5903590359035906
MEDICAL : 3.5403540354035403
FINANCE : 3.49034903490349
HEALTH_AND_FITNESS : 3.2503250325032504
PHOTOGRAPHY : 3.1203120312031203
PERSONALIZATION : 3.08030803080308
SOCIAL : 2.9202920292029204
NEWS_AND_MAGAZINES : 2.7702770277027704
SHOPPING : 2.5702570257025705
TRAVEL_AND_LOCAL : 2.4602460246024602
DATING : 2.2702270227022705
BOOKS_AND_REFERENCE : 1.9901990199019903
VIDEO_PLAYERS : 1.7001700170017002
EDUCATION : 1.5101510151015103
ENTERTAINMENT : 1.4701470147014701
MAPS_AND_NAVIGATION : 1.3001300130013
FOOD_AND_DRINK : 1.25012501250125
HOUSE_AND_HOME : 0.88008800880088
LIBRARIES_AND_DEMO : 0.8400840084008401
AUTO_AND_VEHICLES : 0.8200820082008201
WEATHER : 0.7400740074007401
EVENTS : 0.6300630063006301
ART_AND_DESIGN : 0.6100610061006101
COMICS : 0.59005900