# Analyzing App Store Data
We will analyze App Store and Google Play data to see which types of apps attract more users.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: A sequence of rows (for example a list of lists).
        start: Index of the first row to print (inclusive).
        end: Index at which to stop printing (exclusive).
        rows_and_columns: When True, also print the total number of rows and
            the number of columns, taken from the first row.

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # visual spacing after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row; report 0 columns instead of
        # raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

In [2]:
def open_list(dataset):
    """Read a CSV file and return its contents as a list of rows.

    Args:
        dataset: Path to a UTF-8 encoded CSV file.

    Returns:
        A list of lists of strings, one inner list per CSV row. The header
        row, if the file has one, is the first element.
    """
    from csv import reader

    # The context manager closes the file even if parsing fails.
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends.
    with open(dataset, encoding='utf8', newline='') as opened_file:
        return list(reader(opened_file))

# Load both stores' raw CSV exports; each result is a list of rows.
apple_data = open_list(dataset='AppleStore.csv')
google_data = open_list(dataset='googleplaystore.csv')

In [3]:
# Preview the Google Play rows that are already loaded instead of re-reading
# and re-parsing the CSV file a second time.
explore_data(google_data, 0, 3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


In [4]:
# Preview the App Store rows that are already loaded instead of re-reading
# and re-parsing the CSV file a second time.
explore_data(apple_data, 0, 3, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16


Row 10473 of the Google Play data is malformed: its `Category` value is missing, so every later field is shifted one column to the left.

In [11]:
# Inspect the suspect row at index 10473 (index 0 is the header row).
print(google_data[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [12]:
# The bad row has fewer fields than the header. Delete it only while it is
# still malformed, so re-running this cell does not remove a valid row.
if len(google_data[10473]) != len(google_data[0]):
    del google_data[10473]

In [15]:
# Re-inspect index 10473 to see which row occupies it after the deletion.
print(google_data[10473])

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


The dataset contains duplicate entries. We will remove the duplicates and keep only the entry with the highest number of reviews for each app, since that is likely the most recent entry.

In [19]:
# Collect app names that occur more than once in the Google Play data.
duplicate_apps = []
unique_apps = []
# Set used only for fast membership tests; `unique_apps` keeps first-seen
# order. Checking `name in unique_apps` on a list rescans it for every row.
seen_names = set()

for app in google_data:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps: ', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])

Number of duplicate apps:  1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [24]:
# Derive the duplicate count from `duplicate_apps` rather than a hard-coded
# number, so the figure stays correct if the data changes.
# Note: len(google_data) still includes the header row here.
print('Expected length', len(google_data) - len(duplicate_apps))

Expected length 9660


For each app name, find the highest number of reviews among its entries and store it in a dictionary that maps the app name to that maximum.

In [25]:
# Map each app name to the largest review count found among its rows.
reviews_max = {}
for app in google_data[1:]:  # skip the header row
    name, n_reviews = app[0], float(app[3])
    # Store the count for a name seen for the first time, or replace the
    # stored count when this row has strictly more reviews.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

In [None]:
The number of entries in `reviews_max` should equal the number of rows in the data set, minus the header row and the duplicate entries.

In [26]:
# Compute the expected size from `duplicate_apps` rather than a hard-coded
# count, and enforce the check so a mismatch fails instead of passing unnoticed.
expected_length = len(google_data[1:]) - len(duplicate_apps)
print('Expected Length: ', expected_length)
print('Actual Length: ', len(reviews_max))
assert len(reviews_max) == expected_length, 'reviews_max should hold one entry per unique app name'

Expected Length:  9659
Actual Length:  9659


In [None]:
Create two new lists. The first, `android_clean`, holds one row per app: the row
with the highest number of reviews. The second, `already_added`, tracks the names
of the apps we have already added, so that no duplicates end up in the clean list.

In [27]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in `reviews_max`.
android_clean = []
already_added = []
# Set used only for fast membership tests; `already_added` keeps insertion
# order. Checking `name not in already_added` on a list rescans it per row.
added_names = set()
for app in google_data[1:]:  # skip the header row
    name = app[0]
    n_reviews = float(app[3])
    # The name check prevents adding two rows that tie on the maximum count.
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

Confirming the clean list matches what we expected.

In [31]:
print(len(android_clean))
# Enforce the confirmation: one cleaned row per unique app name.
assert len(android_clean) == len(reviews_max), 'android_clean should hold exactly one row per unique app name'

9659


In [39]:
def englishCheck(string, max_non_ascii=3):
    """Return True when ``string`` looks like an English name.

    A character counts as non-English when its code point is above 127,
    i.e. outside the ASCII range. A few such characters are tolerated so that
    names containing symbols or emoji are not rejected.

    Args:
        string: The text to check (for example an app name).
        max_non_ascii: Highest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        False if ``string`` has more than ``max_non_ascii`` characters with a
        code point above 127, True otherwise.
    """
    n_non_english_char = sum(1 for char in string if ord(char) > 127)
    return n_non_english_char <= max_non_ascii

Defined a function that treats an app name as non-English when it contains more than three non-ASCII characters. Used it to build `android_english`, the list of apps with English names.

In [48]:
# Split the de-duplicated Android rows by whether the app name passes englishCheck.
android_english, android_non_english = [], []
for app in android_clean:
    name = app[0]
    # Pick the destination list first, then append once.
    target = android_english if englishCheck(name) else android_non_english
    target.append(app)

print('English Apps Length: ', len(android_english))
print('\n')
print('Non English Length: ', len(android_non_english))

English Apps Length:  9614


Non English Length:  45


In [55]:
# Split the English apps into free and paid using the Price column (index 7).
free_english, paid_english = [], []
for app in android_english:
    price = app[7]
    # A price string of exactly '0' marks a free app; anything else is paid.
    (free_english if price == '0' else paid_english).append(app)

In [57]:
# Show the free count, then the paid count, one per line.
print(len(free_english), len(paid_english), sep='\n')

8864
750
