# 🧠 Manhwa Recommendation System - Final Version

Notebook ini mencakup seluruh proses pembuatan sistem rekomendasi manhwa dengan dua fitur utama:
1. Rekomendasi berdasarkan judul
2. Rekomendasi berdasarkan keyword bebas

Selain itu, notebook ini juga mengambil URL cover manhwa dari API AniList, menambahkannya ke dataset, dan menyimpan hasilnya ke berbagai format (pickle dan Excel).

In [1]:
import pandas as pd
import requests
import numpy as np
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
import gzip
from googletrans import Translator

# Download stopwords for English
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Dataset

In [2]:
# Load the manhwa dataset from the Excel workbook (path is relative to the
# working directory). Later cells read the 'no', 'title', 'synopsis', 'genres'
# and 'authors' columns from this frame.
df = pd.read_excel('Manhwa_Dataset.xlsx')

# Preprocessing Function

In [3]:
# Preprocessing Function
# Holds the English stop-word set so the NLTK corpus is read only once,
# not once per cleaned cell.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalise one scalar text cell into a space-separated token string.

    The value is lower-cased, every character other than ASCII letters,
    digits and whitespace is removed (deleted, not replaced by a space, so
    ``"sci-fi"`` becomes ``"scifi"``), and English stop words are dropped.

    Args:
        text: The cell value. Missing values (``None`` / ``NaN``) are
            accepted; other non-string scalars are converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` for a missing
        value.
    """
    if pd.isna(text):
        return ""
    # Coerce first so numeric or other non-string cells are cleaned instead
    # of failing on the missing ``.lower`` attribute.
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Produce a cleaned companion column ("clean_<name>") for each text column
# that feeds the recommender.
for source_column in ('synopsis', 'genres', 'authors'):
    df['clean_' + source_column] = df[source_column].apply(clean_text)

# Merge the cleaned columns into one "tags" string per row:
# synopsis, then genres, then authors.
separator = ". "
df['tags'] = (
    df['clean_synopsis']
    + separator
    + df['clean_genres']
    + separator
    + df['clean_authors']
)

## 📐 TF-IDF Vectorizer & Similarity Calculation

In [4]:
# Fit a single TF-IDF model on the combined tags. The title-based and the
# keyword-based recommenders both use default vectorizer settings on the same
# column, so fitting a second model would only repeat identical work.
tag_vectorizer = TfidfVectorizer()
tag_vectors = tag_vectorizer.fit_transform(df['tags'])

# Title-based recommendation: pairwise cosine similarity between all rows.
# vectorizer_title / tag_matrix are kept as names for the same fitted objects.
vectorizer_title = tag_vectorizer
tag_matrix = tag_vectors
similarity = cosine_similarity(tag_matrix)

# Keyword-based recommendation scores free-text queries with
# tag_vectorizer.transform(...) against tag_vectors.

## 📥 Load Cover from API AniList

In [5]:
# Default cache shared by all calls that do not pass their own ``cache``.
_COVER_CACHE = {}

# Returned whenever no cover can be resolved for a title.
_NO_COVER_URL = 'https://via.placeholder.com/150?text=No+Image'


def get_cover_url(title, cache=None, timeout=10):
    """Return the AniList cover image URL for ``title``.

    Sends a GraphQL ``Media`` search (type ``MANGA``) to AniList and reads
    ``coverImage.large`` from the response.

    Args:
        title: Title to search for. Also used as the cache key.
        cache: Optional dict mapping title -> cover URL. When omitted, a
            module-level cache shared across calls is used. Only successful
            lookups are stored, so failed titles are retried on the next call.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The cover URL, or a placeholder image URL when the request fails,
        the response has an unexpected shape, or AniList reports no cover.
    """
    if cache is None:
        cache = _COVER_CACHE
    if title in cache:
        return cache[title]

    url = 'https://graphql.anilist.co'
    query = '''
    query ($search: String) {
      Media(search: $search, type: MANGA) {
        coverImage {
          large
        }
      }
    }
    '''
    variables = {'search': title}
    try:
        response = requests.post(
            url,
            json={'query': query, 'variables': variables},
            timeout=timeout,  # without this a stalled connection blocks forever
        )
        response.raise_for_status()
        data = response.json()
        cover = data['data']['Media']['coverImage']['large']
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Network/HTTP errors, invalid JSON, or a missing / null level in the
        # response. KeyboardInterrupt and SystemExit are deliberately not caught.
        return _NO_COVER_URL

    if not cover:
        # A null or empty cover is treated like any other failed lookup.
        return _NO_COVER_URL

    cache[title] = cover
    return cover

# Resolve a cover URL for every row. This calls get_cover_url once per row,
# which sends an AniList request for each title not already in its cache, so
# network access is required; rows whose lookup fails receive the placeholder
# image URL.
df['cover_url'] = df['title'].apply(get_cover_url)


## 💾 Simpan Dataset dan Model

In [6]:
# Persist the dataset and the model artifacts as gzip-compressed pickles.
export_columns = ['no', 'title', 'synopsis', 'genres', 'authors', 'tags', 'cover_url']

# (output file, payload factory) pairs, written in this order. Factories keep
# each payload from being produced until its output file has been opened.
artifacts = [
    ('manhwa_dict_with_cover.pkl.gz', lambda: df[export_columns].to_dict()),
    ('similarity.pkl.gz', lambda: similarity),
    ('tag_vectorizer.pkl.gz', lambda: tag_vectorizer),
    ('tag_vectors.pkl.gz', lambda: tag_vectors),
]

for filename, make_payload in artifacts:
    with gzip.open(filename, 'wb') as out_file:
        pickle.dump(make_payload(), out_file)

print("✅ Semua file berhasil disimpan!")

✅ Semua file berhasil disimpan!


## Fitur Terjemahan

In [7]:
# Single googletrans client reused by every translation call.
translator = Translator()


def translate_to_english(text):
    """Translate ``text`` from Indonesian ('id') to English ('en').

    Returns the ``text`` attribute of the result produced by
    ``translator.translate``.
    """
    return translator.translate(text, src='id', dest='en').text

# Fungsi Rekomendasi Berdasarkan Judul

In [8]:
def recommend_by_title(selected_title):
    """Return up to five manhwa most similar to ``selected_title``.

    Args:
        selected_title: Value to match exactly against the ``title`` column.
            If several rows share the title, the first one is used.

    Returns:
        A list of ``(title, cover_url)`` tuples ordered by descending
        similarity to the selected row. The selected row itself is never
        included.

    Raises:
        IndexError: If no row has the given title.
    """
    # Work with the row *position*: ``similarity`` and ``df.iloc`` are
    # positional, and a position equals the index label only for a default
    # RangeIndex.
    positions = (df['title'] == selected_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError('Title not found in dataset: {!r}'.format(selected_title))
    selected_pos = int(positions[0])

    ranked = sorted(
        enumerate(similarity[selected_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the selected row explicitly instead of assuming it ranks first;
    # a duplicate entry or an all-zero tag vector can put another row on top.
    top_positions = [pos for pos, _ in ranked if pos != selected_pos][:5]

    results = []
    for pos in top_positions:
        row = df.iloc[pos]
        results.append((row['title'], row['cover_url']))
    return results

# Fungsi Rekomendasi Berdasarkan Keyword


In [9]:
def recommend_by_keyword(user_input):
    # Translate user input to English if necessary
    if is_indonesian(user_input):
        user_input = translate_to_english(user_input)
    
    user_vec = tag_vectorizer.transform([user_input])
    scores = cosine_similarity(user_vec, tag_vectors).flatten()
    top_indices = scores.argsort()[::-1][:5]
    
    results = []