## Analyzing iOS and Android English free mobile app data to see which free mobile apps attract user most.

In [1]:
from csv import reader

### IOS Datasets

In [2]:
opened_file_ios = open('datasets/AppleStore.csv', encoding='utf8')
read_file = reader(opened_file_ios)
ios_datasets = list(read_file)

### Android Datasets

In [3]:
opened_file_android = open('datasets/googleplaystore.csv', encoding='utf8')
read_file = reader(opened_file_android)
android_datasets = list(read_file)

### Functions

In [100]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
        
    if rows_and_columns:
        print('Number of rows: ', len(dataset))
        print('Number of columns: ', len(dataset[0]))
        
def find_duplicates_and_unique(dataset, index_num):
    duplicate_list = []
    unique_list = []
    
    for data in dataset:
        if data[index_num] in unique_list:
            duplicate_list.append(data[index_num])
        else:
            unique_list.append(data[index_num])
            
    return duplicate_list, unique_list

def reviews_max(dataset, name_index, review_count_index):
    reviews_max = {}

    for data in dataset:
        name = data[name_index]
        n_reviews = float(data[review_count_index])

        if name in reviews_max and reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews
        elif name not in reviews_max:
            reviews_max[name] = n_reviews

    return reviews_max

def clean_dataset(dataset, reviews_max, name_index, review_counts_index):
    clean_list = []
    already_added = []

    for data in dataset:
        name = data[name_index]
        n_reviews = float(data[review_counts_index])

        if (n_reviews == reviews_max[name]) and (name not in already_added):
            clean_list.append(data)
            already_added.append(name)
            
    return clean_list

def check_english_name(name):
    count = 0
    for char in name:
        if ord(char) > 127:
            count += 1
        
        if count > 3:
            return False
    return True

def filter_english_app(dataset, name_index):
    english_apps = []
    non_english_apps =[]
    
    for data in dataset:
        name = data[name_index]
        if check_english_name(name):
            english_apps.append(data)
        else:
            non_english_apps.append(data)
            
    return english_apps, non_english_apps

def freq_table(dataset, index):
    freq_dict = {}
    total = 0
    
    for data in dataset:
        total += 1
        if data[index] in freq_dict:
            freq_dict[data[index]] += 1
        else:
            freq_dict[data[index]] = 1
            
    freq_percentage = {}
    for freq in freq_dict:
        percentage = (freq_dict[freq] / total) * 100
        freq_percentage[freq] = percentage
        
    return freq_percentage

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

### iOS

In [5]:
explore_data(ios_datasets, 0, 3, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows:  7198
Number of columns:  16


#### Check if dataset has any duplicates record

In [6]:
ios_dups_uniq_list = find_duplicates_and_unique(ios_datasets, 1)
duplicate_ios_apps = ios_dups_uniq_list[0]
unique_ios_apps = ios_dups_uniq_list[1]

print('Duplicate iOS apps: ', len(duplicate_ios_apps))
print('Unique iOS apps: ', len(unique_ios_apps))
duplicate_ios_apps

Duplicate iOS apps:  2
Unique iOS apps:  7196


['Mannequin Challenge', 'VR Roller Coaster']

#### Removing duplicates (records with the highest reviews counts will remain)

In [20]:
expected_ios_length = len(ios_datasets[1:]) - len(duplicate_ios_apps)
expected_ios_length

7195

In [22]:
ios_reviews_max = reviews_max(ios_datasets[1:], 1, 5)
len(ios_reviews_max)

7195

In [59]:
clean_ios_dataset = clean_dataset(ios_datasets[1:], ios_reviews_max, 1, 5)
len(clean_ios_dataset)

7195

In [64]:
filtered_ios_app = filter_english_app(clean_ios_dataset, 1)
ios_eng_app = filtered_ios_app[0]
non_ios_eng_app = filtered_ios_app[1]

In [66]:
len(ios_eng_app)

6181

In [69]:
len(non_ios_eng_app)

1014

In [90]:
ios_eng_app[0:5]

[['284882215',
  'Facebook',
  '389879808',
  'USD',
  '0.0',
  '2974676',
  '212',
  '3.5',
  '3.5',
  '95.0',
  '4+',
  'Social Networking',
  '37',
  '1',
  '29',
  '1'],
 ['389801252',
  'Instagram',
  '113954816',
  'USD',
  '0.0',
  '2161558',
  '1289',
  '4.5',
  '4.0',
  '10.23',
  '12+',
  'Photo & Video',
  '37',
  '0',
  '29',
  '1'],
 ['529479190',
  'Clash of Clans',
  '116476928',
  'USD',
  '0.0',
  '2130805',
  '579',
  '4.5',
  '4.5',
  '9.24.12',
  '9+',
  'Games',
  '38',
  '5',
  '18',
  '1'],
 ['420009108',
  'Temple Run',
  '65921024',
  'USD',
  '0.0',
  '1724546',
  '3842',
  '4.5',
  '4.0',
  '1.6.2',
  '9+',
  'Games',
  '40',
  '5',
  '1',
  '1'],
 ['284035177',
  'Pandora - Music & Radio',
  '130242560',
  'USD',
  '0.0',
  '1126879',
  '3594',
  '4.0',
  '4.5',
  '8.4.1',
  '12+',
  'Music',
  '37',
  '4',
  '1',
  '1']]

In [88]:
ios_free_eng_apps = []
    
for data in ios_eng_app:
    if float(data[4]) == 0.0:
        ios_free_eng_apps.append(data)

len(ios_free_eng_apps)

3220

In [102]:
ios_prine_genre = display_table(ios_free_eng_apps, 11)

Games : 58.13664596273293
Entertainment : 7.888198757763975
Photo & Video : 4.968944099378882
Education : 3.6645962732919255
Social Networking : 3.291925465838509
Shopping : 2.608695652173913
Utilities : 2.515527950310559
Sports : 2.142857142857143
Music : 2.049689440993789
Health & Fitness : 2.018633540372671
Productivity : 1.7391304347826086
Lifestyle : 1.5838509316770186
News : 1.3354037267080745
Travel : 1.2422360248447204
Finance : 1.1180124223602486
Weather : 0.8695652173913043
Food & Drink : 0.8074534161490683
Reference : 0.5590062111801243
Business : 0.5279503105590062
Book : 0.43478260869565216
Navigation : 0.18633540372670807
Medical : 0.18633540372670807
Catalogs : 0.12422360248447205


### Android

In [7]:
explore_data(android_datasets, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows:  10842
Number of columns:  13


#### Delete data

In [8]:
# android_datasets[10473].insert(1, 'Test')
android_datasets[10473][1]

'Test'

#### Check if dataset has any duplicates record

In [9]:
android_dups_uniq_list = find_duplicates_and_unique(android_datasets, 0)
duplicate_android_apps = android_dups_uniq_list[0]
unique_android_apps = android_dups_uniq_list[1]

print('Duplicate Android apps: ', len(duplicate_android_apps))
print('Unique Android apps: ', len(unique_android_apps))
duplicate_android_apps

Duplicate Android apps:  1181
Unique Android apps:  9661


['Quick PDF Scanner + OCR FREE',
 'Box',
 'Google My Business',
 'ZOOM Cloud Meetings',
 'join.me - Simple Meetings',
 'Box',
 'Zenefits',
 'Google Ads',
 'Google My Business',
 'Slack',
 'FreshBooks Classic',
 'Insightly CRM',
 'QuickBooks Accounting: Invoicing & Expenses',
 'HipChat - Chat Built for Teams',
 'Xero Accounting Software',
 'MailChimp - Email, Marketing Automation',
 'Crew - Free Messaging and Scheduling',
 'Asana: organize team projects',
 'Google Analytics',
 'AdWords Express',
 'Accounting App - Zoho Books',
 'Invoice & Time Tracking - Zoho',
 'join.me - Simple Meetings',
 'Invoice 2go — Professional Invoices and Estimates',
 'SignEasy | Sign and Fill PDF and other Documents',
 'Quick PDF Scanner + OCR FREE',
 'Genius Scan - PDF Scanner',
 'Tiny Scanner - PDF Scanner App',
 'Fast Scanner : Free PDF Scan',
 'Mobile Doc Scanner (MDScan) Lite',
 'TurboScan: scan documents and receipts in PDF',
 'Tiny Scanner Pro: PDF Doc Scan',
 'Docs To Go™ Free Office Suite',
 'OfficeS

#### Removing duplicates (records with highest reviews counts will remain)

In [10]:
for data in android_datasets:
    name = data[0]
    if name == 'Instagram':
        print(data)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [14]:
expected_android_length = len(android_datasets[1:]) - len(duplicate_android_apps)
expected_android_length

9660

In [25]:
android_reviews_max = reviews_max(android_datasets[1:], 0, 3)
android_reviews_max

{'Photo Editor & Candy Camera & Grid & ScrapBook': 159.0,
 'Coloring book moana': 974.0,
 'U Launcher Lite – FREE Live Cool Themes, Hide Apps': 87510.0,
 'Sketch - Draw & Paint': 215644.0,
 'Pixel Draw - Number Art Coloring Book': 967.0,
 'Paper flowers instructions': 167.0,
 'Smoke Effect Photo Maker - Smoke Editor': 178.0,
 'Infinite Painter': 36815.0,
 'Garden Coloring Book': 13791.0,
 'Kids Paint Free - Drawing Fun': 121.0,
 'Text on Photo - Fonteee': 13880.0,
 'Name Art Photo Editor - Focus n Filters': 8788.0,
 'Tattoo Name On My Photo Editor': 44829.0,
 'Mandala Coloring Book': 4326.0,
 '3D Color Pixel by Number - Sandbox Art Coloring': 1518.0,
 'Learn To Draw Kawaii Characters': 55.0,
 'Photo Designer - Write your name with shapes': 3632.0,
 '350 Diy Room Decor Ideas': 27.0,
 'FlipaClip - Cartoon animation': 194216.0,
 'ibis Paint X': 224399.0,
 'Logo Maker - Small Business': 450.0,
 "Boys Photo Editor - Six Pack & Men's Suit": 654.0,
 'Superheroes Wallpapers | 4K Backgrounds': 

In [37]:
clean_android_dataset = clean_dataset(android_datasets[1:], android_reviews_max, 0, 3)
len(clean_android_dataset)

9660

In [56]:
filtered_android_app = filter_english_app(clean_android_dataset, 0)
android_eng_app = filtered_android_app[0]
non_android_eng_app = filtered_android_app[1]

In [62]:
len(android_eng_app)

9615

In [70]:
len(non_android_eng_app)

45

In [89]:
android_eng_app[0:5]

[['Photo Editor & Candy Camera & Grid & ScrapBook',
  'ART_AND_DESIGN',
  '4.1',
  '159',
  '19M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'January 7, 2018',
  '1.0.0',
  '4.0.3 and up'],
 ['U Launcher Lite – FREE Live Cool Themes, Hide Apps',
  'ART_AND_DESIGN',
  '4.7',
  '87510',
  '8.7M',
  '5,000,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'August 1, 2018',
  '1.2.4',
  '4.0.3 and up'],
 ['Sketch - Draw & Paint',
  'ART_AND_DESIGN',
  '4.5',
  '215644',
  '25M',
  '50,000,000+',
  'Free',
  '0',
  'Teen',
  'Art & Design',
  'June 8, 2018',
  'Varies with device',
  '4.2 and up'],
 ['Pixel Draw - Number Art Coloring Book',
  'ART_AND_DESIGN',
  '4.3',
  '967',
  '2.8M',
  '100,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design;Creativity',
  'June 20, 2018',
  '1.1',
  '4.4 and up'],
 ['Paper flowers instructions',
  'ART_AND_DESIGN',
  '4.4',
  '167',
  '5.6M',
  '50,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'March 26, 2017

In [87]:
android_free_eng_apps = []
    
for data in android_eng_app:
    if data[6] == 'Free':
        android_free_eng_apps.append(data)

len(android_free_eng_apps)

8864

#### our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets.

In [103]:
android_genres = display_table(android_free_eng_apps, 9)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

In [104]:
android_category = display_table(android_free_eng_apps, 1)

FAMILY : 18.896660649819495
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 