In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def open_csv(file_path):
    """
    Opens and reads a CSV file using pandas.

    Parameters:
    file_path (str): The path to the CSV file.

    Returns:
    DataFrame: A pandas DataFrame containing the CSV data.
    """
    try:
        df = pd.read_csv(file_path)
        return df
    except FileNotFoundError:
        print(f"The file at {file_path} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [3]:
#File paths for each CSV
apple_store_path = (r"../data/Apple store/AppleStore_cleaned.csv")
googleplay_store_path = (r"../data/Google Playstore/googleplaystore_cleaned.csv")

In [4]:
#DF for each CSV
apple_df = open_csv(apple_store_path)
google_df = open_csv(googleplay_store_path)

In [5]:
def freq_table(df, column):
    """
    Generates and displays a frequency table for a given column in a DataFrame,
    showing the percentage of each unique value, sorted in descending order.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The name of the column to generate and display the frequency table for.

    Returns:
    pd.DataFrame: A DataFrame with unique values and their corresponding percentages, sorted in descending order.
    """
    # Calculate value counts and convert to percentages
    value_counts = df[column].value_counts(normalize=True) * 100
    
    # Create the frequency table DataFrame
    freq_df = value_counts.reset_index().rename(columns={'index': column, column: 'Percentage'})
    
    # Sort the DataFrame by the 'Percentage' column in descending order
    freq_df_sorted = freq_df.sort_values(by='Percentage', ascending=False)
    
    # Display the sorted frequency table
    print(freq_df_sorted.to_string(index=False))
    
    # Return the sorted frequency table as a DataFrame
    return freq_df_sorted


In [6]:
apple_frequency_table = freq_table(apple_df, 'prime_genre')
print(apple_frequency_table)

      prime_genre  Percentage
            Games   59.171800
    Entertainment    7.529090
    Photo & Video    5.133470
        Education    3.832991
Social Networking    3.114305
         Shopping    2.498289
        Utilities    2.258727
            Music    2.156057
           Sports    2.053388
 Health & Fitness    1.984942
     Productivity    1.711157
        Lifestyle    1.471595
             News    1.334702
           Travel    1.129363
          Finance    1.095140
          Weather    0.889802
     Food & Drink    0.889802
         Business    0.513347
        Reference    0.513347
             Book    0.273785
          Medical    0.205339
       Navigation    0.136893
         Catalogs    0.102669
          prime_genre  Percentage
0               Games   59.171800
1       Entertainment    7.529090
2       Photo & Video    5.133470
3           Education    3.832991
4   Social Networking    3.114305
5            Shopping    2.498289
6           Utilities    2.258727
7       

In [7]:
google_frequency_table = freq_table(google_df, 'Category')
print(google_frequency_table)

           Category  Percentage
             FAMILY   18.793862
               GAME    9.611038
              TOOLS    8.576187
           BUSINESS    4.710360
       PRODUCTIVITY    3.972880
          LIFESTYLE    3.889616
            FINANCE    3.734983
            MEDICAL    3.639824
    PERSONALIZATION    3.306768
             SPORTS    3.259189
      COMMUNICATION    3.223504
 HEALTH_AND_FITNESS    3.128345
        PHOTOGRAPHY    3.009397
 NEWS_AND_MAGAZINES    2.795290
             SOCIAL    2.664446
   TRAVEL_AND_LOCAL    2.307601
           SHOPPING    2.248127
BOOKS_AND_REFERENCE    2.188652
             DATING    1.831807
      VIDEO_PLAYERS    1.760438
MAPS_AND_NAVIGATION    1.356013
     FOOD_AND_DRINK    1.201380
          EDUCATION    1.165695
  AUTO_AND_VEHICLES    0.939693
      ENTERTAINMENT    0.939693
 LIBRARIES_AND_DEMO    0.904009
     HOUSE_AND_HOME    0.808850
            WEATHER    0.796955
             EVENTS    0.713691
     ART_AND_DESIGN    0.666112
        

In [8]:
google_frequency_table = freq_table(google_df, 'Genres')
print(google_frequency_table)

                               Genres  Percentage
                                Tools    8.564292
                        Entertainment    6.090163
                            Education    5.388367
                             Business    4.710360
                         Productivity    3.972880
                            Lifestyle    3.877721
                              Finance    3.734983
                              Medical    3.639824
                               Sports    3.330558
                      Personalization    3.306768
                        Communication    3.223504
                     Health & Fitness    3.128345
                               Action    3.116451
                          Photography    3.009397
                     News & Magazines    2.795290
                               Social    2.664446
                       Travel & Local    2.307601
                             Shopping    2.248127
                    Books & Reference    2.188652


In [9]:
def average_installs_by_category(android_df):
    """
    Calculates the average number of installs for each category in the Android dataset.

    Parameters:
    android_df (pd.DataFrame): The DataFrame containing Android app data.

    Returns:
    pd.DataFrame: A DataFrame containing categories and their corresponding average installs, sorted by highest.
    """
    # Clean the 'Installs' column to remove commas and plus signs, then convert to float
    android_df['Installs'] = android_df['Installs'].str.replace(',', '').str.replace('+', '').astype(float)
    
    # Group by 'Category' and calculate the mean of installs
    avg_installs = android_df.groupby('Category')['Installs'].mean().reset_index()
    avg_installs = avg_installs.rename(columns={'Installs': 'Average Installs'})
    
    # Sort the DataFrame by 'Average Installs' in descending order
    avg_installs = avg_installs.sort_values(by='Average Installs', ascending=False)
    
    # Print the results
    for index, row in avg_installs.iterrows():
        print(row['Category'], ':', row['Average Installs'])
    
    return avg_installs


In [10]:
google_genre_avg = average_installs_by_category(google_df)

COMMUNICATION : 36106662.328413285
VIDEO_PLAYERS : 25234606.216216218
SOCIAL : 24441088.17857143
PHOTOGRAPHY : 18099283.85375494
PRODUCTIVITY : 16972497.946107786
GAME : 15434835.816831684
TRAVEL_AND_LOCAL : 14487541.68041237
ENTERTAINMENT : 12346329.11392405
TOOLS : 11084333.292649098
NEWS_AND_MAGAZINES : 10006311.10638298
BOOKS_AND_REFERENCE : 8504745.97826087
SHOPPING : 7307823.2010582015
WEATHER : 5219216.7164179105
PERSONALIZATION : 5027006.791366907
MAPS_AND_NAVIGATION : 4304432.280701755
HEALTH_AND_FITNESS : 4263642.1749049425
SPORTS : 3647640.208029197
FAMILY : 3636007.157594937
FOOD_AND_DRINK : 1974937.1386138613
ART_AND_DESIGN : 1932519.642857143
EDUCATION : 1844897.9591836734
BUSINESS : 1602958.308080808
HOUSE_AND_HOME : 1391211.1911764706
LIFESTYLE : 1375297.3058103975
FINANCE : 1348224.9426751593
COMICS : 880440.625
DATING : 764959.4610389611
LIBRARIES_AND_DEMO : 674917.2368421053
AUTO_AND_VEHICLES : 645317.2278481013
PARENTING : 544745.6363636364
BEAUTY : 513151.886792452

  android_df['Installs'] = android_df['Installs'].str.replace(',', '').str.replace('+', '').astype(float)


In [16]:
def average_ratings_by_genre(ios_df):
    """
    Calculates the average number of ratings for each genre in the iOS dataset.

    Parameters:
    ios_df (pd.DataFrame): The DataFrame containing iOS app data.

    Returns:
    pd.DataFrame: A DataFrame containing genres and their corresponding average ratings, sorted by highest.
    """
    # Group by genre and calculate the mean of ratings
    avg_ratings = ios_df.groupby('prime_genre')['rating_count_tot'].mean().reset_index()
    avg_ratings = avg_ratings.rename(columns={'rating_count_tot': 'Average Ratings'})
    
    # Sort the DataFrame by 'Average Ratings' in descending order
    avg_ratings = avg_ratings.sort_values(by='Average Ratings', ascending=False)
    
    # Print the results
    for index, row in avg_ratings.iterrows():
        print(row['prime_genre'], ':', row['Average Ratings'])
    
    return avg_ratings

In [17]:
apple_genre_avg = average_ratings_by_genre(apple_df)

Navigation : 125037.25
Reference : 89562.6
Social Networking : 78567.30769230769
Music : 55396.01587301587
Weather : 48275.57692307692
Travel : 34115.57575757576
Food & Drink : 33333.92307692308
Photo & Video : 29249.766666666666
Shopping : 28877.575342465752
Finance : 26038.6875
Sports : 25791.666666666668
News : 23382.17948717949
Productivity : 22842.22
Games : 21560.75072296125
Health & Fitness : 19418.620689655174
Lifestyle : 17260.53488372093
Book : 16671.0
Entertainment : 15006.227272727272
Utilities : 11571.69696969697
Business : 6839.6
Education : 6103.464285714285
Catalogs : 5195.0
Medical : 612.0


unfortunately, IOS store doesn´t have a reference for the amount of installs any app has, so we are going to use the amount of total ratings it has as reference instead to aproximate the popularity of an app

In [23]:
def top_downloaded_apps_by_category(ios_df, android_df, top_n=5):
    """
    Retrieves the top N most downloaded apps for each category in both iOS and Android datasets.

    Parameters:
    ios_df (pd.DataFrame): The DataFrame containing iOS app data.
    android_df (pd.DataFrame): The DataFrame containing Android app data.
    top_n (int): The number of top apps to return for each category.

    Returns:
    tuple: Two DataFrames, one for iOS and one for Android, containing the top apps by category.
    """
    
    # For iOS, since we are using rating_count_tot as a proxy for downloads
    ios_top_apps = (ios_df.sort_values(by='rating_count_tot', ascending=False)
                    .groupby('prime_genre')
                    .head(top_n)
                    .reset_index(drop=True))
    
    ios_top_apps = ios_top_apps[['prime_genre', 'track_name', 'rating_count_tot']]
    ios_top_apps = ios_top_apps.rename(columns={'prime_genre': 'Category', 'track_name': 'App Name', 'rating_count_tot': 'Total Ratings'})
    
    print("Top downloaded apps in iOS by category:")
    for category in ios_top_apps['Category'].unique():
        print(f"\nCategory: {category}")
        print(ios_top_apps[ios_top_apps['Category'] == category].to_string(index=False))
    
    # For Android, we are using Reviews to determine the apps with the most engagement, since a lot of apps often come pre-installed in Android devices.
    android_top_apps = (android_df.sort_values(by='Reviews', ascending=False)
                        .groupby('Category')
                        .head(top_n)
                        .reset_index(drop=True))
    
    android_top_apps = android_top_apps[['Category', 'App', 'Reviews']]
    android_top_apps = android_top_apps.rename(columns={'App': 'App Name', 'Reviews': 'Total Ratings'})
    
    print("\nTop downloaded apps in Android by category:")
    for category in android_top_apps['Category'].unique():
        print(f"\nCategory: {category}")
        print(android_top_apps[android_top_apps['Category'] == category].to_string(index=False))
    
    return ios_top_apps, android_top_apps

In [24]:
ios_top_apps, android_top_apps = top_downloaded_apps_by_category(apple_df, google_df)

Top downloaded apps in iOS by category:

Category: Social Networking
         Category         App Name  Total Ratings
Social Networking         Facebook        2974676
Social Networking        Pinterest        1061624
Social Networking Skype for iPhone         373519
Social Networking        Messenger         351466
Social Networking           Tumblr         334293

Category: Photo & Video
     Category                                           App Name  Total Ratings
Photo & Video                                          Instagram        2161558
Photo & Video                                           Snapchat         323905
Photo & Video    YouTube - Watch Videos, Music, and Live Streams         278166
Photo & Video Pic Collage - Picture Editor & Photo Collage Maker         123433
Photo & Video  Funimate video editor: add cool effects to videos         123268

Category: Games
Category         App Name  Total Ratings
   Games   Clash of Clans        2130805
   Games       Temple Run  

**Based on the data for IOS, we can infer some things:**

Social Networking, Gaming, and Photo & Video categories dominate user engagement, making them a solid point to focus on for determining which apps are the most profitable. 
Social Networking apps have the highest total ratings, making it by far the most relevant category in app usage, likely due to continuous engagement from users on apps like Facebook or Pintrest.
Similarly, the Photo & Video category has high user engagement, reflecting the interest in visual content sharing that has been trending for the past couple of years, with apps like Instagram, Snapchat and YouTube as the leading apps.
The Games category also shows significant engagement with apps like Clash of Clans and Temple Run, suggesting that gaming remains a favorite among users, driving high download rates.
In the Music category, apps like Pandora and Spotify are prominent, highlighting the ongoing popularity of music streaming services.

**As for the top downloaded Android apps:**

The most used Android apps tend to be apps that generally come pre-installed in the phone, like all of the Google ecosystem apps, like Google Drive, Google Calendar, Gmail, or Google Maps, which lead their respective categories with over 1 billion installs, considering that the OS itself is managed by Google.
As such, instead of using the amount of installs, we'll also use the total reviews for each app as reference for app engagement.
We can see that social networking, photo and video apps dominate the market in Android as well, with apps like Facebook, Instagram, WhatsApp and Youtube having huge amounts of engagement.
In gaming, we can also see Clash of Clans being the most popular game in the store, followed by Temple Run, Candy Crush, Angry birds and Subway Surfers.

**Overall:**

Both stores have slightly different ways to classify their apps, but we can overall see some similarities in certain fields.
For example, the dominance of Meta apps is consistent across both platforms, with Facebook, Instagram, and WhatsApp leading in both ecosystems.
Both platforms see high engagement with Clash of Clans and Candy Crush Saga, showing that these games are popular across ecosystems. However, PUBG Mobile is a top contender on iOS, whereas Subway Surfers and Clash Royale have a stronger presence on Android.
YouTube and Netflix are the top choices for streaming in both cases, but Disney+ is more prominent on iOS compared to Hotstar on Android.
While Google Drive and Outlook appear on both platforms (strong on iOS and also top-rated on Android), Android shows a stronger focus on utility apps like antivirus and file management, which are less prevalent on iOS.
In general, Android users seem to prefer more utility and tools-oriented apps (e.g., antivirus, file management), while iOS users gravitate toward more premium or specialized services (e.g., Calm, Disney+). Shopping habits differ as well, with Amazon being stronger on iOS and Flipkart being more prominent on Android.
This highlights both the global dominance of certain apps and the subtle platform-specific preferences across various categories.

From this for both Android and iOS platforms, we can make some deductions about the most profitable apps. Also, historically, iOS users tend to spend more on apps and in-app purchases than Android users, while Android dominates in markets with large user bases, offering higher potential for ad revenue.

**Social Media & Communication Apps:**
These apps typically earn revenue through advertising (e.g., Facebook, Instagram), and some might have premium features (e.g., WhatsApp Business or Telegram subscriptions). They have massive user bases and are highly profitable due to advertising. However, apps like Facebook and Instagram are likely the most profitable because they leverage billions of users globally, which makes for a tricky market to find success in.

**Streaming & Entertainment Apps:**
Subscription models drive revenue for streaming apps like Netflix and YouTube Premium, while free versions rely on ad revenue. Streaming apps have recurring subscription revenue, making them highly profitable. However, securing the rights to legally stream IPs, and build a reliable platform might be a lot more complex than other alternatives, considering how monolithic the current streaming platforms are.

**Finance and Payment, and Shopping Apps:**
These apps face an issue similar to streaming and entertainement apps, their barrier of entry might be too high, while they could be highly profitable as well, they could be too complex to set up and market, as well as needing a massive userbase to become profitable.

**Gaming Apps:**
Games are often highly profitable due to in-app purchases, and ads. Successful mobile games like Clash of Clans generate billions through in-app purchases. These games dominate both platforms and have monetization models that include in-app purchases for upgrades, cosmetic items, or lives. It has high potential for profitability, as the userbase is massive and dynamic, but also is highly competitive, so it makes it a high-risk, high-reward category.

**Productivity & Tools Apps:**
Tools and productivity apps tend to generate revenue through freemium models, subscriptions, and one-time purchases. Productivity apps like Google Drive and Microsoft Outlook benefit from enterprise customers, making them strong earners. However, Clean Master may be profitable due to massive Android user adoption and ad revenue. This is a fairly broad category, so it could be a good

**Health & Fitness and Parenting:**
Many health apps use a subscription model or in-app purchases to access premium features like tracking, personal plans, or consultations. This is a smaller niche compared to the other categories, but it has a chance of being a lot more profitable due to that same reason. Parenting apps, especially those focused on early childhood (e.g., pregnancy trackers, baby care apps), have specific audiences with high engagement. 

**Lifestyle:**
These apps are relatively easy to develop and can cater to specific needs or communities. There's also more room to introduce creative or niche ideas. For instance, apps that focus on eco-friendly habits, personal finance, or goal-setting can do well. Freemium or subscription based models could be highly effective for apps in this category.

**Conclusion:**
Health & Fitness, Education, Lifestyle, Tools & Productivity, and Parenting apps seem to offer the best balance of profitability potential and lower competition. These categories allow for niche innovation, clear monetization pathways, and relatively straightforward development without requiring massive user bases to be profitable. Freemium and subscription models tend to be highly effective in these categories, providing recurring revenue with lower barriers to entry compared to competitive sectors like gaming or social media.