In [14]:
from datasets import load_dataset
import pandas as pd

# Load the US App Store metadata
ds = load_dataset("MacPaw/mac-app-store-apps-metadata", "metadata_US")

# Convert to pandas DataFrame
df = ds['train'].to_pandas()

In [15]:
df['releaseDate'] = pd.to_datetime(df['releaseDate'], errors='coerce', utc=True)
df['currentVersionReleaseDate'] = pd.to_datetime(df['currentVersionReleaseDate'], errors='coerce', utc=True)

In [16]:
df['isGameCenterEnabled'] = df['isGameCenterEnabled'].astype(bool)
df['isVppDeviceBasedLicensingEnabled'] = df['isVppDeviceBasedLicensingEnabled'].astype(bool)

In [17]:
rating_cols = [
    'averageUserRating', 'userRatingCount',
    'averageUserRatingForCurrentVersion', 'userRatingCountForCurrentVersion',
    'price', 'fileSizeBytes'
]
for col in rating_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

In [18]:
cat_cols = ['primaryGenreName', 'contentAdvisoryRating', 'trackContentRating', 'currency']
for col in cat_cols:
    df[col] = df[col].astype('category')

In [19]:
df['description'] = df['description'].fillna('').astype(str)
df['releaseNotes'] = df['releaseNotes'].fillna('').astype(str)

df['formattedPrice'] = df['price'].apply(lambda x: f"${x:.2f}" if x > 0 else "Free")


In [20]:
df = df.convert_dtypes()

In [22]:
overview_df = df[[
    'trackId', 'trackName', 'primaryGenreName', 'averageUserRating','formattedPrice',
    'contentAdvisoryRating', 'userRatingCount'
]]
overview_df.to_parquet('C:/Users/nandi/OneDrive/Desktop/Profitable-App-Profiles-for-Mobile-Market/data/overview.parquet', index=False)

In [23]:
df.columns

Index(['id', 'supportedDevices', 'features', 'advisories',
       'isGameCenterEnabled', 'screenshotUrls', 'ipadScreenshotUrls',
       'appletvScreenshotUrls', 'artworkUrl60', 'artworkUrl512',
       'artworkUrl100', 'artistViewUrl', 'kind', 'minimumOsVersion',
       'releaseNotes', 'artistId', 'artistName', 'genres', 'price',
       'description', 'trackId', 'trackName', 'bundleId', 'primaryGenreName',
       'primaryGenreId', 'genreIds', 'releaseDate',
       'isVppDeviceBasedLicensingEnabled', 'sellerName',
       'currentVersionReleaseDate', 'currency', 'trackCensoredName',
       'languageCodesISO2A', 'fileSizeBytes', 'sellerUrl', 'formattedPrice',
       'contentAdvisoryRating', 'averageUserRatingForCurrentVersion',
       'userRatingCountForCurrentVersion', 'averageUserRating', 'trackViewUrl',
       'trackContentRating', 'version', 'wrapperType', 'userRatingCount'],
      dtype='object')

In [24]:
top_apps_df = df[[
    'trackId', 'trackName', 'averageUserRating', 'userRatingCount','formattedPrice',
     'price', 'trackViewUrl', 'contentAdvisoryRating','primaryGenreName'
]]
top_apps_df.to_parquet('C:/Users/nandi/OneDrive/Desktop/Profitable-App-Profiles-for-Mobile-Market/data/top_apps.parquet', index=False)

In [25]:

df = pd.read_parquet('C:/Users/nandi/OneDrive/Desktop/Profitable-App-Profiles-for-Mobile-Market/data/top_apps.parquet')

# Filter for high-rated apps with strong engagement
top = df[
    (df['averageUserRating'] >= 4.5) &
    (df['userRatingCount'] > 1000)  # Filter out low-volume ratings
].sort_values(by='userRatingCount', ascending=False).head(10)

# Add monetization signal
top['is_paid'] = top['price'].apply(lambda x: x != 'Free')

# Display key metrics
print(top[['trackName', 'averageUserRating', 'userRatingCount', 'price', 'is_paid']])

                         trackName  averageUserRating  userRatingCount  price  \
2649            WhatsApp Messenger            4.69163         12894602    0.0   
2652                 Amazon Kindle            4.85231          4426958    0.0   
2650            Amazon Prime Video            4.78128          2977157    0.0   
207   Canva: Design, Photo & Video            4.88332          1818830    0.0   
2661    DuckDuckGo Private Browser            4.85703          1667157    0.0   
905                     TestFlight            4.74403           701021    0.0   
2665  Zappos: Shop shoes & clothes            4.92001           672567    0.0   
442          CapCut - Video Editor            4.76617           639400    0.0   
2659    NordVPN: VPN Fast & Secure            4.65495           451379    0.0   
2662                 Color Widgets            4.61278           419845    0.0   

      is_paid  
2649     True  
2652     True  
2650     True  
207      True  
2661     True  
905      Tru

In [27]:
df.head()

Unnamed: 0,trackId,trackName,averageUserRating,userRatingCount,formattedPrice,price,trackViewUrl,contentAdvisoryRating,primaryGenreName
0,1312926037,stoic. journal & mental health,4.84368,20223,Free,0.0,https://apps.apple.com/us/app/stoic-journal-me...,4+,Health & Fitness
1,922744883,SmartGym: Gym & Home Workouts,4.65102,13826,Free,0.0,https://apps.apple.com/us/app/smartgym-gym-hom...,4+,Health & Fitness
2,1551848949,Migiri: Porn Blocker & More,4.45611,1333,Free,0.0,https://apps.apple.com/us/app/migiri-porn-bloc...,12+,Health & Fitness
3,672401817,Strides: Goal Tracker,4.78518,15762,Free,0.0,https://apps.apple.com/us/app/strides-goal-tra...,4+,Productivity
4,1491286709,Ultrahuman,4.30924,595,Free,0.0,https://apps.apple.com/us/app/ultrahuman/id149...,4+,Health & Fitness


In [26]:

explorer_df = df[[
    'trackId', 'trackName', 'averageUserRating', 'userRatingCount',
     'contentAdvisoryRating', 'trackViewUrl','formattedPrice'
]]
explorer_df.to_parquet('C:/Users/nandi/OneDrive/Desktop/Profitable-App-Profiles-for-Mobile-Market/data/explorer.parquet', index=False)