In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Load the Google Play Store dataset
df = pd.read_csv("C:/Users/M Dimas Prayoga/Downloads/playstore/googleplaystore.csv")

# Basic preprocessing: drop rows that are missing any column used below.
# The explicit .copy() makes df an independent frame, so the column
# assignments that follow never write into a derived view/copy (which raises
# SettingWithCopyWarning on pandas versions without copy-on-write).
df = df.dropna(subset=['Rating', 'Size', 'Price', 'Installs', 'Category']).copy()

# Clean the Price column: map 'Free' to '$0', strip the literal '$' sign,
# then convert to numbers. errors='coerce' turns anything that still is not
# numeric into NaN instead of raising.
df['Price'] = df['Price'].replace('Free', '$0')
df['Price'] = df['Price'].str.replace('$', '', regex=False)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Bersihkan Size
def convert_size(x):
    """Convert a size label such as ``'19M'`` or ``'512k'`` to megabytes.

    Args:
        x: Raw size value. Expected to be a string ending in ``'M'``
            (megabytes) or ``'k'`` (kilobytes); anything else is treated
            as missing.

    Returns:
        The size in megabytes as a float, or ``None`` when the value is not
        a string, has no recognised unit suffix (e.g. ``'Varies with
        device'``), or its numeric part cannot be parsed.
    """
    # Missing cells reach this function as float NaN / None; the substring
    # test used previously raised TypeError on them.
    if not isinstance(x, str):
        return None

    label = x.strip()
    # Match the unit as a suffix only, so a stray 'M' or 'k' elsewhere in
    # the text is not mistaken for a unit.
    if label.endswith('M'):
        kilobytes_per_unit = None
    elif label.endswith('k'):
        kilobytes_per_unit = 1024
    else:
        return None

    try:
        value = float(label[:-1])
    except ValueError:
        # Malformed number: report it as missing rather than aborting the
        # whole column conversion.
        return None

    if kilobytes_per_unit is None:
        return value
    return value / kilobytes_per_unit

# Parse every raw Size label into megabytes; labels convert_size cannot
# interpret come back as missing values.
size_in_mb = df['Size'].apply(convert_size)
df['Size_MB'] = size_in_mb

# Clean Installs: remove '+' and ',' characters, then convert to numbers
# (values that are still non-numeric become NaN).
installs_text = df['Installs'].str.replace('[+,]', '', regex=True)
df['Installs'] = pd.to_numeric(installs_text, errors='coerce')

# Drop rows where any of the cleaned numeric columns is missing
required_columns = ['Size_MB', 'Price', 'Installs']
df = df.dropna(subset=required_columns)

# ===============================
# ✅ FILTER POPULAR CATEGORIES
# ===============================
# Keep only categories that have at least MIN_CATEGORY_ROWS rows
MIN_CATEGORY_ROWS = 100
kategori_counts = df['Category'].value_counts()
kategori_terpilih = kategori_counts[kategori_counts >= MIN_CATEGORY_ROWS].index
# .copy() detaches the filtered rows from the original frame so that adding
# the encoded column below does not raise SettingWithCopyWarning.
df = df[df['Category'].isin(kategori_terpilih)].copy()

# Encode the category names as integer class labels
le = LabelEncoder()
df['Category_encoded'] = le.fit_transform(df['Category'])

# Features and target
X = df[['Rating', 'Price', 'Size_MB', 'Installs']]
y = df['Category_encoded']

# Split first, then standardise: the scaler must learn its mean/std from the
# training rows only, otherwise test-set statistics leak into preprocessing.
# The same random_state and test_size select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisation fitted on the training split and applied to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later code that expects the fully scaled feature matrix still
# works; it now uses the training-set statistics.
X_scaled = scaler.transform(X)

# ====================
# 🔍 Random Forest
# ====================
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print("🎯 Random Forest Results (Filtered)")
print("Akurasi:", accuracy_score(y_test, rf_preds))
# Pass the full list of encoded labels explicitly. Without `labels`,
# classification_report infers the classes from y_test/rf_preds and raises
# ValueError when that count differs from len(target_names), i.e. whenever a
# category happens to be absent from both the test split and the predictions.
rf_report_labels = le.transform(le.classes_)
print(classification_report(y_test, rf_preds, labels=rf_report_labels, target_names=le.classes_, zero_division=0))

# ====================
# 🤖 Naive Bayes
# ====================
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_preds = nb.predict(X_test)

print("\n🤖 Naive Bayes Results (Filtered)")
print("Akurasi:", accuracy_score(y_test, nb_preds))
# Pass the full list of encoded labels explicitly. Without `labels`,
# classification_report infers the classes from y_test/nb_preds and raises
# ValueError when that count differs from len(target_names), i.e. whenever a
# category happens to be absent from both the test split and the predictions.
nb_report_labels = le.transform(le.classes_)
print(classification_report(y_test, nb_preds, labels=nb_report_labels, target_names=le.classes_, zero_division=0))


🎯 Random Forest Results (Filtered)
Akurasi: 0.26837972876516775
                     precision    recall  f1-score   support

BOOKS_AND_REFERENCE       0.15      0.13      0.14        30
           BUSINESS       0.24      0.18      0.21        55
      COMMUNICATION       0.14      0.11      0.12        36
             DATING       0.54      0.43      0.48        46
          EDUCATION       0.28      0.19      0.22        27
             FAMILY       0.31      0.34      0.33       332
            FINANCE       0.08      0.07      0.08        54
               GAME       0.37      0.38      0.37       197
 HEALTH_AND_FITNESS       0.29      0.30      0.29        46
          LIFESTYLE       0.14      0.11      0.12        54
            MEDICAL       0.32      0.26      0.29        69
 NEWS_AND_MAGAZINES       0.15      0.19      0.17        32
    PERSONALIZATION       0.16      0.20      0.18        45
        PHOTOGRAPHY       0.28      0.31      0.29        42
       PRODUCTIVITY 