# Analysis of Mobile Apps


As of September 2018, there were approximately 2 million iOS apps available on the App Store, and 2.1 million Android apps on Google Play. The goal is to analyze the application markets to see what apps would be good to build based on in-app ads. The most important indicator of app revenue is the number of users that use the app. 

The data will be a sample of the following:
- 10,000 Android apps from Google Play; the data was collected in August 2018
- 7,000 iOS apps from the App Store; the data was collected in July 2017

In [1]:
def open_dataset(file_name='AppleStore.csv', header = True):
    opened_file = open(file_name)
    from csv import reader
    read_file = reader(opened_file)
    data = list(read_file)
    if header:
        return data
    return data[1:]

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line between rows
        
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [3]:
ios = open_dataset()
ios[0:2]

[['id',
  'track_name',
  'size_bytes',
  'currency',
  'price',
  'rating_count_tot',
  'rating_count_ver',
  'user_rating',
  'user_rating_ver',
  'ver',
  'cont_rating',
  'prime_genre',
  'sup_devices.num',
  'ipadSc_urls.num',
  'lang.num',
  'vpp_lic'],
 ['284882215',
  'Facebook',
  '389879808',
  'USD',
  '0.0',
  '2974676',
  '212',
  '3.5',
  '3.5',
  '95.0',
  '4+',
  'Social Networking',
  '37',
  '1',
  '29',
  '1']]

In [4]:
android = open_dataset(file_name='googleplaystore.csv', header = True)
android[0:2]

[['App',
  'Category',
  'Rating',
  'Reviews',
  'Size',
  'Installs',
  'Type',
  'Price',
  'Content Rating',
  'Genres',
  'Last Updated',
  'Current Ver',
  'Android Ver'],
 ['Photo Editor & Candy Camera & Grid & ScrapBook',
  'ART_AND_DESIGN',
  '4.1',
  '159',
  '19M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'January 7, 2018',
  '1.0.0',
  '4.0.3 and up']]

In [5]:
explore_data(ios, 1, 5)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']




In [6]:
explore_data(android, 1, 5)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']




## Removing Duplicate and Erroneous Data

1. Remove inaccurate data
2. Remove duplicate app entries
3. Removed non-English apps
4. Isolated the free apps

## 1. Remove inaccurate data

Go through the data and see if there are any rows that are any columns with inaccurate data. The Category column of the Android data set is comprised of String data if any column contains integers remove it the data.

In [10]:
d_cat = {}
d_app = {}
i = 1

for row in android[1:]:
    cat = row[1]
    app = row[0]
    if cat not in d_cat:
        d_cat[cat] = 1
    else:
        d_cat[cat] += 1
    if app not in d_app:
        d_app[app] = 1
    else:
        d_app[app] += 1

In [11]:
d_cat

{'ART_AND_DESIGN': 65,
 'AUTO_AND_VEHICLES': 85,
 'BEAUTY': 53,
 'BOOKS_AND_REFERENCE': 231,
 'BUSINESS': 460,
 'COMICS': 60,
 'COMMUNICATION': 387,
 'DATING': 234,
 'EDUCATION': 156,
 'ENTERTAINMENT': 149,
 'EVENTS': 64,
 'FINANCE': 366,
 'FOOD_AND_DRINK': 127,
 'HEALTH_AND_FITNESS': 341,
 'HOUSE_AND_HOME': 88,
 'LIBRARIES_AND_DEMO': 85,
 'LIFESTYLE': 382,
 'GAME': 1144,
 'FAMILY': 1972,
 'MEDICAL': 463,
 'SOCIAL': 295,
 'SHOPPING': 260,
 'PHOTOGRAPHY': 335,
 'SPORTS': 384,
 'TRAVEL_AND_LOCAL': 258,
 'TOOLS': 843,
 'PERSONALIZATION': 392,
 'PRODUCTIVITY': 424,
 'PARENTING': 60,
 'WEATHER': 82,
 'VIDEO_PLAYERS': 175,
 'NEWS_AND_MAGAZINES': 283,
 'MAPS_AND_NAVIGATION': 137,
 '1.9': 1}

In [13]:
i = 1
d_cat_check = {}
for row in android[1:]:
    try:
        if float(row[1]):
            d_cat_check[i] = row[1]
    except ValueError:
        pass
    i += 1
d_cat_check

{10473: '1.9'}

In [15]:
del android[10473]  # don't run this more than once

## 2. Remove duplicate app entries

The data contains duplicate rows of data as well as rows where there are slight differences in the number of reviews. 

In [20]:
i = 1
d_app_list = []
d_app_check = {}
d_app_dupe = {}
for row in android[1:]:
    app = row[0]
    if app not in d_app_list:
        d_app_list.append(app)
        d_app_check[i] = app
    else:
        d_app_dupe[i] = app
    i += 1
d_app_check

{1: 'Photo Editor & Candy Camera & Grid & ScrapBook',
 2: 'Coloring book moana',
 3: 'U Launcher Lite – FREE Live Cool Themes, Hide Apps',
 4: 'Sketch - Draw & Paint',
 5: 'Pixel Draw - Number Art Coloring Book',
 6: 'Paper flowers instructions',
 7: 'Smoke Effect Photo Maker - Smoke Editor',
 8: 'Infinite Painter',
 9: 'Garden Coloring Book',
 10: 'Kids Paint Free - Drawing Fun',
 11: 'Text on Photo - Fonteee',
 12: 'Name Art Photo Editor - Focus n Filters',
 13: 'Tattoo Name On My Photo Editor',
 14: 'Mandala Coloring Book',
 15: '3D Color Pixel by Number - Sandbox Art Coloring',
 16: 'Learn To Draw Kawaii Characters',
 17: 'Photo Designer - Write your name with shapes',
 18: '350 Diy Room Decor Ideas',
 19: 'FlipaClip - Cartoon animation',
 20: 'ibis Paint X',
 21: 'Logo Maker - Small Business',
 22: "Boys Photo Editor - Six Pack & Men's Suit",
 23: 'Superheroes Wallpapers | 4K Backgrounds',
 24: 'Mcqueen Coloring pages',
 25: 'HD Mickey Minnie Wallpapers',
 26: 'Harley Quinn wallpa

In [21]:
d_app_dupe

{230: 'Quick PDF Scanner + OCR FREE',
 237: 'Box',
 240: 'Google My Business',
 257: 'ZOOM Cloud Meetings',
 262: 'join.me - Simple Meetings',
 266: 'Box',
 267: 'Zenefits',
 268: 'Google Ads',
 269: 'Google My Business',
 270: 'Slack',
 271: 'FreshBooks Classic',
 272: 'Insightly CRM',
 273: 'QuickBooks Accounting: Invoicing & Expenses',
 274: 'HipChat - Chat Built for Teams',
 275: 'Xero Accounting Software',
 276: 'MailChimp - Email, Marketing Automation',
 277: 'Crew - Free Messaging and Scheduling',
 278: 'Asana: organize team projects',
 279: 'Google Analytics',
 280: 'AdWords Express',
 281: 'Accounting App - Zoho Books',
 282: 'Invoice & Time Tracking - Zoho',
 283: 'join.me - Simple Meetings',
 284: 'Invoice 2go — Professional Invoices and Estimates',
 285: 'SignEasy | Sign and Fill PDF and other Documents',
 286: 'Quick PDF Scanner + OCR FREE',
 287: 'Genius Scan - PDF Scanner',
 288: 'Tiny Scanner - PDF Scanner App',
 289: 'Fast Scanner : Free PDF Scan',
 290: 'Mobile Doc Sc

In [22]:
d_hash = {}
lst = []
dupe_android = []
i = 1
for row in android[1:]:
    arr_string = ''.join(row)
    hasher = hash(arr_string)
    if hasher not in lst:
        lst.append(hasher)
        dupe_android.append(row)    

In [24]:
explore_data(dupe_android,0,5)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']




In [25]:
reviews_max = {}

for app in dupe_android:
    name = app[0]
    n_reviews = float(app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
        
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

In [26]:
android_clean = []
already_added = []

for app in dupe_android:
    name = app[0]
    n_reviews = float(app[3])
    
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name) # make sure this is inside the if block

3. Removed non-English apps



In [27]:
def is_english(string):
    non_ascii = 0
    
    for character in string:
        if ord(character) > 127:
            non_ascii += 1
    
    if non_ascii > 3:
        return False
    else:
        return True

In [28]:
android_english = []
ios_english = []

for app in android_clean:
    name = app[0]
    if is_english(name):
        android_english.append(app)
        
for app in ios:
    name = app[1]
    if is_english(name):
        ios_english.append(app)
        
explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagr