# In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Read the Google Play Store export and, in the same step, discard every row
# that has a missing value in any column the recommender depends on.
csv_path = "C:/Users/M Dimas Prayoga/Downloads/playstore/googleplaystore.csv"
required_columns = ['App', 'Category', 'Genres', 'Content Rating', 'Rating', 'Size']
df = pd.read_csv(csv_path).dropna(subset=required_columns)

# Clean up the numeric data
def clean_size(size):
    """Convert a Play Store size value to a float number of megabytes.

    Args:
        size: Raw value from the 'Size' column. Strings containing 'M' are
            read as megabytes, strings containing 'k' as kilobytes (divided
            by 1024), and any other string is parsed as a plain number.
            Non-string values are passed through ``float()`` so that running
            the conversion again on an already-converted column is harmless.

    Returns:
        The size as a float, or ``np.nan`` when the value cannot be parsed
        (for example 'Varies with device', '1,000+', ``None`` or NaN).
    """
    if not isinstance(size, str):
        try:
            return float(size)
        except (TypeError, ValueError):
            return np.nan

    if 'M' in size:
        digits, divisor = size.replace('M', '').replace(',', ''), 1.0
    elif 'k' in size:
        digits, divisor = size.replace('k', '').replace(',', ''), 1024.0
    else:
        # Plain strings are parsed as-is; thousands separators are NOT
        # stripped here, so values such as '1,000+' end up as NaN.
        digits, divisor = size, 1.0

    try:
        # float() ignores surrounding whitespace on its own.
        return float(digits) / divisor
    except ValueError:
        return np.nan

# Convert the raw 'Size' values to numbers and drop rows whose size is unknown
df['Size'] = df['Size'].apply(clean_size)
df = df.dropna(subset=['Size'])

# Keep only the relevant columns. The index is reset because the dropna calls
# above leave gaps in the row labels; after the reset, a row's label equals
# its position in the feature and similarity matrices built below.
features = (
    df[['App', 'Category', 'Genres', 'Content Rating', 'Rating', 'Size']]
    .copy()
    .reset_index(drop=True)
)

# Join the categorical columns into a single string per app
features['text_features'] = (
    features['Category'] + ' ' +
    features['Genres'] + ' ' +
    features['Content Rating']
)

# TF-IDF for the text features
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(features['text_features'])

# Scale the numeric features
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(features[['Rating', 'Size']])

# Combine all features (sparse + dense) column-wise
from scipy.sparse import hstack
combined_features = hstack([tfidf_matrix, numeric_features])

# Compute the pairwise similarity; row/column i corresponds to features.iloc[i]
cos_sim = cosine_similarity(combined_features, combined_features)

# Recommendation function
def recommend(app_name, top_n=5):
    """Return the names of the apps most similar to ``app_name``.

    Similarity scores are read from the module-level ``cos_sim`` matrix,
    whose rows and columns follow the row order of the module-level
    ``features`` frame.

    Args:
        app_name: Exact value of the 'App' column to look up.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of up to ``top_n`` distinct app names ordered from most to
        least similar, or a message string when ``app_name`` does not occur
        in ``features['App']``.
    """
    app_names = features['App'].to_numpy()
    matches = np.flatnonzero(app_names == app_name)
    if matches.size == 0:
        return f"Aplikasi '{app_name}' tidak ditemukan."

    # Use the row *position*, not the index label: cos_sim is addressed
    # positionally, and the labels of `features` may have gaps left by
    # earlier dropna calls.
    idx = matches[0]
    scores = np.asarray(cos_sim[idx]).ravel()

    # A stable sort on the negated scores ranks from most to least similar
    # and keeps tied rows in their original order.
    ranked = np.argsort(-scores, kind='stable')

    # Skip the queried app explicitly (including duplicate listings of it)
    # instead of assuming it sorts first, and never repeat a name.
    seen = {app_name}
    recommendations = []
    for position in ranked:
        if len(recommendations) >= top_n:
            break
        name = app_names[position]
        if name in seen:
            continue
        seen.add(name)
        recommendations.append(name)
    return recommendations

# Example usage:
recommend("Sketch - Draw & Paint", top_n=10)


['Tattoo Name On My Photo Editor',
 'I Creative Idea',
 'Install images with music to make video without Net - 2018',
 'Canva: Poster, banner, card maker & graphic design',
 'AJ Styles HD Wallpapers',
 'HD Mickey Minnie Wallpapers',
 'ibis Paint X',
 'Mandala Coloring Book',
 'Text on Photo - Fonteee',
 'Garden Coloring Book']