In [2]:
!pip install pandas




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\nandi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


This cell reads a JSON Lines file of Apple App Store records, where each line is a separate JSON object. Each line is parsed into a Python dictionary and collected in a list. The records are then flattened twice with `pd.json_normalize`: once from the top level (which also expands the nested `"item"` object into `item.*` columns) and once from the `"item"` object alone (which yields the same app details without the prefix). The two DataFrames are concatenated side by side, so the result holds the general metadata plus the app details under both prefixed and unprefixed column names. Finally, a `"platform"` column is added to label the dataset as originating from iOS.


In [3]:
import json
import pandas as pd

# Input file: one JSON object per line
apple_json_path = 'C:/Users/nandi/OneDrive/Desktop/App-Ratings/data/apple_app.json'

# Load each JSON object from the file.
# Blank lines are skipped so that stray empty lines in the file do not make
# json.loads() raise.
data = []
with open(apple_json_path, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))

# Each record is a full app object with 'item' nested inside.
# Flatten from the top level (e.g., app_id, location, etc.). json_normalize
# also expands the nested 'item' dict here, producing the 'item.*' columns.
meta_df = pd.json_normalize(data)

# Flatten the nested 'item' fields on their own (no 'item.' prefix).
# A record without an 'item' key yields an all-NaN row instead of a KeyError,
# which keeps the row positions aligned with meta_df.
item_df = pd.json_normalize([record.get('item') or {} for record in data])

# Combine both into one DataFrame (rows align by position).
# NOTE: the app details therefore appear twice, as 'item.<name>' and '<name>'.
apple_df = pd.concat([meta_df, item_df], axis=1)

# Add platform label
apple_df['platform'] = 'iOS'

# Preview structure
print(apple_df.columns)
print(apple_df.head())

Index(['app_id', 'location', 'language', 'check_url', 'time_update',
       'se_domain', 'item.categories', 'item.languages', 'item.version',
       'item.minimum_os_version', 'item.released_date',
       'item.last_update_date', 'item.update_notes', 'item.images',
       'item.advisories', 'item.type', 'item.rank_group', 'item.rank_absolute',
       'item.position', 'item.app_id', 'item.title', 'item.url', 'item.icon',
       'item.description', 'item.reviews_count', 'item.rating.rating_type',
       'item.rating.value', 'item.rating.votes_count',
       'item.rating.rating_max', 'item.price.current', 'item.price.regular',
       'item.price.max_value', 'item.price.currency',
       'item.price.is_price_range', 'item.price.displayed_price',
       'item.is_free', 'item.main_category', 'item.similar_apps',
       'item.more_apps_by_developer', 'item.size', 'item.developer',
       'item.developer_id', 'item.developer_url', 'categories', 'languages',
       'version', 'minimum_os_versio

In [4]:
import re
import numpy as np

# Clean and normalize description text
def clean_text(text):
    """Return a lower-cased copy of ``text`` with non-word runs collapsed to spaces.

    Every run of non-word characters (punctuation, whitespace, symbols) is
    replaced by a single space and the result is stripped. The pattern is
    Unicode-aware for ``str`` input, so non-Latin letters are kept; digits and
    underscores count as word characters and are kept as well.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is missing.
    """
    # Missing values would otherwise become the literal words "none" / "nan"
    # and end up as tokens in downstream text features.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'\W+', ' ', str(text).lower())  # Collapse non-word runs and lowercase
    return text.strip()



In [5]:
# Normalised description text
apple_df['clean_description'] = apple_df['description'].apply(clean_text)

# For each date column: parse to datetime (unparseable values become NaT),
# then derive the age in whole days relative to the current time
for date_col, age_col in (
    ('released_date', 'days_since_release'),
    ('last_update_date', 'days_since_update'),
):
    apple_df[date_col] = pd.to_datetime(apple_df[date_col], errors='coerce')
    apple_df[age_col] = (pd.Timestamp('now') - apple_df[date_col]).dt.days

# Numeric rating; values that are missing or not numeric become 0
rating_numeric = pd.to_numeric(apple_df['rating.value'], errors='coerce')
apple_df['rating_value'] = rating_numeric.fillna(0)

# Size divided by 1024 * 1024 (bytes -> MB, if the size is numeric)
size_numeric = pd.to_numeric(apple_df['size'], errors='coerce')
apple_df['size_MB'] = size_numeric / (1024 * 1024)

# Preview cleaned data
preview_columns = [
    'title',
    'clean_description',
    'rating_value',
    'days_since_release',
    'days_since_update',
    'size_MB',
]
apple_df[preview_columns].head()

Unnamed: 0,title,clean_description,rating_value,days_since_release,days_since_update,size_MB
0,RFBenchmark,now thanks to rfbenchmark one can make informe...,3.3,3692,1977,18.173828
1,Police Cars Transporter Truck – Cargo Simulator,enjoy ultimate police force vehicles transport...,3.7,2995,2994,181.509766
2,Coral Casino Beach & Cabana,welcome to the coral casino beach cabana club ...,0.0,595,210,144.901367
3,乐借借款-现金贷款分期借钱小额信用借款平台,产品介绍 乐借借款致力于为用户提供方便 快捷 安心的创新金融服务 努力打造成为一流的互联网小...,0.0,198,190,17.729492
4,Mijn Wateralarm,met de app mijnwateralarm ontvangt u actuele g...,0.0,2803,2802,26.716797


In [6]:
!pip install scikit-learn




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\nandi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model limited to 1000 terms, with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Learn the vocabulary and score every cleaned description
tfidf_matrix = vectorizer.fit_transform(apple_df['clean_description'])

# Vocabulary terms (keywords) reported by the fitted vectorizer
keywords = vectorizer.get_feature_names_out()

# Dense table of TF-IDF scores with one column per keyword
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=keywords)

In [8]:
# Function to extract top N keywords per app
def extract_top_keywords(row_vector, feature_names, top_n=5):
    """Return the names of the highest-scoring features in one score row.

    Args:
        row_vector: 1-D array of scores supporting ``argsort()`` (e.g. a NumPy
            array), one score per feature.
        feature_names: Sequence of feature names, indexable by position and
            aligned with ``row_vector``.
        top_n: Maximum number of names to return.

    Returns:
        A list of up to ``top_n`` feature names, ordered from highest to lowest
        score. Only features with a strictly positive score are included, so a
        row with no scored terms yields an empty list.
    """
    sorted_indices = row_vector.argsort()[::-1][:top_n]
    # Without the score check, an all-zero row would report arbitrary
    # zero-score features as its "top" keywords.
    return [feature_names[i] for i in sorted_indices if row_vector[i] > 0]

# Rank each app's TF-IDF row and store the resulting keyword list per app
dense_scores = tfidf_matrix.toarray()
apple_df['top_keywords'] = [
    extract_top_keywords(score_row, keywords) for score_row in dense_scores
]

In [9]:
# Ordered substring rules used to assign a training genre; the first genre
# whose substrings occur in the description wins (you can refine this later)
genre_rules = (
    ('Utilities', ('network', 'benchmark')),
    ('Finance', ('finance', 'money')),
    ('Photography', ('photo', 'camera')),
    ('Gaming', ('game',)),
    ('Health & Fitness', ('health', 'fitness')),
)


def assign_rule_genre(description):
    """Return the first genre whose substrings occur in ``description``, else 'Other'."""
    for genre, substrings in genre_rules:
        if any(substring in description for substring in substrings):
            return genre
    return 'Other'


# Working copy holding only the columns needed for labelling
labeled_df = apple_df[['title', 'clean_description']].copy()
labeled_df['genre'] = labeled_df['clean_description'].apply(assign_rule_genre)

# Keep only rows that matched a rule; 'Other' is excluded from training
is_labeled = labeled_df['genre'] != 'Other'
train_df = labeled_df[is_labeled]

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Hold out 20% of the labelled descriptions for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    train_df['clean_description'],
    train_df['genre'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a multinomial Naive Bayes classifier
pipeline_steps = [
    ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
    ('clf', MultinomialNB()),
]
model = Pipeline(pipeline_steps)

# Fit on the training split
model.fit(X_train, y_train)

# Report per-genre metrics on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

         Finance       1.00      0.29      0.44         7
          Gaming       0.62      1.00      0.77        25
Health & Fitness       0.60      0.38      0.46         8
     Photography       0.54      0.54      0.54        13
       Utilities       1.00      0.30      0.46        10

        accuracy                           0.63        63
       macro avg       0.75      0.50      0.54        63
    weighted avg       0.71      0.63      0.60        63



In [11]:
# Predict a genre for every app with the trained pipeline. The training data
# excluded 'Other', so each app receives one of the genres seen in training.
all_descriptions = apple_df['clean_description']
apple_df['predicted_genre'] = model.predict(all_descriptions)

In [12]:
# Write the cleaned and enriched Apple App Store data to CSV, without the index column
cleaned_csv_path = 'C:/Users/nandi/OneDrive/Desktop/App-Ratings/data/apple_cleaned.csv'
apple_df.to_csv(cleaned_csv_path, index=False)

In [14]:
# Display the first five app titles
apple_df["title"].head()

0                                        RFBenchmark
1    Police Cars Transporter Truck – Cargo Simulator
2                        Coral Casino Beach & Cabana
3                              乐借借款-现金贷款分期借钱小额信用借款平台
4                                    Mijn Wateralarm
Name: title, dtype: object