### DATA SCIENCE PROJECT - @devalissonalves

#### Profitable App Profiles for the App Store and Google Play Markets 

### INTRODUCTION AND GOALS OF PROJECT

In [25]:
def explore_data(dataset, start, end, rows_and_columns = False):
    """Print a slice of rows from a dataset and, optionally, its dimensions.

    Parameters:
        dataset: sequence of rows (a list of lists in this notebook); it must
            support slicing and len().
        start: index of the first row to print (inclusive).
        end: index at which printing stops (exclusive).
        rows_and_columns: when True, also print the number of rows and the
            number of columns (the length of the first row).

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n') # print() appends its own newline, so two blank lines follow each row

    if rows_and_columns:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure: report 0 columns
        # instead of raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if n_rows > 0 else 0
        print('Number of rows:', n_rows)
        print('Number of columns:', n_columns)

In [26]:
# OPEN DATASETS (APPSTORE AND GOOGLEPLAY)
from csv import reader

# Each file is opened in a `with` block so it is closed as soon as its rows are
# in memory. newline='' is what the csv module asks for so that quoted fields
# containing line breaks are parsed correctly. The explicit encoding keeps the
# result independent of the platform default.
# NOTE(review): assumes both CSV files are UTF-8 encoded - confirm.

#APPLE DATASET
with open('./dataset/AppleStore.csv', encoding='utf8', newline='') as opened_file_apple:
    read_file_apple = reader(opened_file_apple)
    apps_data_apple = list(read_file_apple)
header_apple = apps_data_apple[0]  # first row holds the column names

# GOOGLE DATASET
with open('./dataset/googleplaystore.csv', encoding='utf8', newline='') as opened_file_google:
    read_file_google = reader(opened_file_google)
    apps_data_google = list(read_file_google)
header_google = apps_data_google[0]  # first row holds the column names

# Preview the first two data rows of each dataset (header skipped) and their dimensions.
explore_data(apps_data_apple[1:], 0, 2, True)
explore_data(apps_data_google[1:], 0, 2, True)

print(header_apple)
print(len(header_google))


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
13


In [27]:
# Deleting Wrong Data
# A data row whose number of fields differs from the header's is treated as
# malformed and removed from apps_data_google.

expected_columns = len(header_google)
removed_rows = 0  # rows deleted so far; every later row has moved up by this many positions

# The slice is a copy, so deleting from apps_data_google while looping is safe.
# enumerate() supplies each row's position directly, which avoids searching the
# whole list with list.index() for every malformed row.
for position, row in enumerate(apps_data_google[1:], start=1):
    if len(row) == expected_columns:
        continue

    # Current position of the row inside apps_data_google after earlier deletions.
    index = position - removed_rows
    print(row)
    print('\n')
    print("Index position is:", index)

    # deleting index with missing data
    del apps_data_google[index]
    removed_rows += 1
    print(f'Index {index} has been deleted!')


['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Index position is: 10473
Index 10473 has been deleted!


In [28]:
# Removing Duplicate Entries: Part One

# First step - identifying some duplicate applications.
# For example, the Instagram application appears in several rows. The rows differ
# in their number of reviews, which suggests they were collected at different times.

# The header row (index 0) is skipped so that it is not treated as an app.
for app in apps_data_google[1:]:
    name = app[0]
    if name == 'Instagram':
        print(app)

print('\n\n')

# Finding the total number of repeated applications.

duplicate_apps = []
unique_apps = []
seen_names = set()  # mirrors unique_apps; gives O(1) membership tests instead of scanning the list

for app in apps_data_google[1:]:
    name = app[0]
    if name in seen_names:
        # Every occurrence after the first counts as one duplicate entry.
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps: ', len(duplicate_apps))
print('Examples of duplicate apps: ', duplicate_apps[:10])

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']



Number of duplicate apps:  1181
Examples of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [29]:
# From this point on, the strategy is to keep, for each application, only the row with the highest number of reviews.


# PART 1 - Create a dictionary where each key is a unique app name and the corresponding value is the highest number of reviews of that app

reviews_max = {}

for app in apps_data_google[1:]:
    name = app[0]
    n_reviews = float(app[3])
    # First sighting of the name, or a larger review count than the one recorded so far.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews


# PART 2 - Use the dictionary created above to remove the duplicate rows:

data_google_clean = []
already_added = []
added_names = set()  # mirrors already_added; gives O(1) membership tests instead of scanning the list

for app in apps_data_google[1:]:
    name = app[0]
    n_reviews = float(app[3])
    # The membership check matters when several rows of the same app share the
    # maximum review count: only the first of them is kept.
    if n_reviews == reviews_max[name] and name not in added_names:
        data_google_clean.append(app)
        already_added.append(name)
        added_names.add(name)



In [30]:
# Removing Non-English Apps: Part One

def string_analysis(name_app):
    """Tell whether an app name has at most three non-ASCII characters.

    Parameters:
        name_app: the app name, as a string.

    Returns:
        True when three or fewer characters have a code point above 127,
        False when there are more than three.
    """
    # Code points 0-127 form the ASCII range; anything above is counted.
    non_ascii_chars = sum(1 for char in name_app if ord(char) > 127)
    return non_ascii_chars <= 3

# Sort the names of the de-duplicated Google Play apps into two lists according
# to string_analysis: names accepted as English and names rejected.
english_apps = []
non_english_apps = []

for app in data_google_clean:
    name = app[0]
    # Pick the destination list first, then append the name to it.
    target_list = english_apps if string_analysis(name) else non_english_apps
    target_list.append(name)

print(len(english_apps))
print('\n')
print(len(non_english_apps))

9614


45


In [31]:
# Isolating the Free Apps

# A row counts as free when its price field is exactly one of these strings.
free_price_values = ('0', '0.0')

# App Store: the price is at index 4; the header row is skipped.
free_apps_apple = [app for app in apps_data_apple[1:] if app[4] in free_price_values]

# Google Play: the price is at index 7 of the de-duplicated rows.
# NOTE(review): the rows come from data_google_clean, so the English /
# non-English split computed elsewhere in this notebook is not applied here -
# confirm whether non-English apps should be excluded.
free_apps_google = [app for app in data_google_clean if app[7] in free_price_values]

print(len(free_apps_apple))
print(len(free_apps_google))

4056
8905


In [32]:
# Most Common Apps by Genre: Part One

# Goal: design applications that are well received by both audiences (iOS and Google Play). The strategy is therefore to identify which application genres are most common in both markets.

# Both datasets have genre columns: indexes 9 and 1 on Google Play, index 11 on the App Store.

# App Store column names - the genre column is at index 11
for column_name in header_apple:
    print(column_name)

print('\n\n')

# Google Play column names - index 1 is Category and index 9 is Genres
for column_name in header_google:
    print(column_name)

id
track_name
size_bytes
currency
price
rating_count_tot
rating_count_ver
user_rating
user_rating_ver
ver
cont_rating
prime_genre
sup_devices.num
ipadSc_urls.num
lang.num
vpp_lic



App
Category
Rating
Reviews
Size
Installs
Type
Price
Content Rating
Genres
Last Updated
Current Ver
Android Ver


In [None]:
# Most Common Apps by Genre: Part Two

def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a dataset.

    Parameters:
        dataset: iterable of rows, each row indexable by position.
        index: position of the column to tabulate.

    Returns:
        A dict mapping each distinct value found at `index` to the percentage
        (0-100) of rows that hold it, in order of first appearance. An empty
        dataset yields an empty dict.
    """
    occurrences = {}
    n_rows = 0

    for row in dataset:
        # Count the rows while iterating, so any iterable works, not only sized ones.
        n_rows += 1
        value = row[index]
        occurrences[value] = occurrences.get(value, 0) + 1

    # Turn the absolute counts into percentages of the total number of rows.
    return {value: (count / n_rows) * 100 for value, count in occurrences.items()}
def display_table(dataset, index):
    """Print the percentage frequency table of one column, highest percentage first.

    Parameters:
        dataset: iterable of rows, passed through to freq_table.
        index: position of the column to tabulate.

    Returns:
        None. One line per distinct value is printed as `value : percentage`.
    """
    percentages = freq_table(dataset, index)

    # Sorting (percentage, value) tuples ranks by percentage first; values with
    # equal percentages are ordered by the value itself.
    ranked = sorted(((pct, value) for value, pct in percentages.items()), reverse = True)

    for pct, value in ranked:
        print(value, ':', pct)

# display_table(free_apps_google, 9)
# display_table(free_apps_google, 1)
# display_table(free_apps_apple[1:], 11)

In [34]:
# Most Common Apps by Genre: Part Three

# - Looking at the Google Play dataset, tools apps are the most common, followed by entertainment and education apps.

# - Looking at the Apple dataset, the Games genre accounts for a substantial share (over 55%) of the apps, followed by the Entertainment genre.

# - Among the 10 most common app genres that Google Play and the App Store have in common, there is a notable trend toward entertainment, education, lifestyle, finance and sports apps.



# Analyze the frequency tables generated for the Category and Genres columns of the Google Play dataset

# - The most common genres are apps aimed at Family, Games, Tools, Business and Lifestyle.
# - Among the 10 most common app genres that Google Play and the App Store have in common, there is a notable trend toward entertainment, education, lifestyle, finance and sports apps.

# - Finding out the number of users per genre requires further analysis.


In [None]:
# Percentage frequency table, highest first, for column index 1 of the free Google Play apps.
display_table(free_apps_google, 1)

In [None]:
# Percentage frequency table, highest first, for column index 9 of the free Google Play apps.
display_table(free_apps_google, 9)

In [None]:
# Most Popular Apps by Genre on the App Store
# For each genre of the free App Store apps, print the average of the value at
# index 5 (converted to float), used here as the number of user ratings.

# Index -5 is the genre column; the keys of this table are the distinct genres
# in order of first appearance.
prime_genre_dict = freq_table(free_apps_apple, -5)

# Accumulate the per-genre totals in a single pass over the apps instead of
# re-scanning the whole dataset once for every genre.
ratings_total_by_genre = {}
apps_count_by_genre = {}

for app in free_apps_apple:
    genre_app = app[-5]
    n_ratings = float(app[5])
    ratings_total_by_genre[genre_app] = ratings_total_by_genre.get(genre_app, 0) + n_ratings
    apps_count_by_genre[genre_app] = apps_count_by_genre.get(genre_app, 0) + 1

for genre in prime_genre_dict:
    # Every genre in prime_genre_dict came from at least one app, so the count is never zero.
    avg_n_ratings = ratings_total_by_genre[genre] / apps_count_by_genre[genre]
    print(genre, ':', avg_n_ratings)
    

In [None]:
# Most Popular Apps by Genre on Google Play
# For each category of the free Google Play apps, print the average of the
# value at index 5, used here as the number of installs.

# Distribution of the raw install buckets (index 5) among the free apps.
display_table(free_apps_google, 5)

# The averages are computed over the free apps only, the same population
# tabulated just above and the counterpart of the App Store analysis, which
# also uses free apps only. Index 1 is the category column.
categories_android = freq_table(free_apps_google, 1)

# Accumulate the per-category totals in a single pass over the apps instead of
# re-scanning the whole dataset once for every category.
installs_total_by_category = {}
apps_count_by_category = {}

for app in free_apps_google:
    category_app = app[1]
    # Install counts are strings such as '1,000,000+': strip the thousands
    # separators and the trailing plus sign before converting.
    n_installs = app[5]
    n_installs = n_installs.replace(',', '')
    n_installs = n_installs.replace('+', '')
    installs_total_by_category[category_app] = installs_total_by_category.get(category_app, 0) + float(n_installs)
    apps_count_by_category[category_app] = apps_count_by_category.get(category_app, 0) + 1

for category in categories_android:
    # Every category in categories_android came from at least one app, so the count is never zero.
    avg_n_installs = installs_total_by_category[category] / apps_count_by_category[category]
    print(category, ':', avg_n_installs)
