# Eksplorasi Data untuk Sistem Rekomendasi Proyek Web3

Notebook ini digunakan untuk mengeksplorasi dan menganalisis data proyek Web3 dari CoinGecko API yang akan digunakan untuk sistem rekomendasi. Kita akan melakukan:

1. Pemuatan (loading) data yang sudah dikumpulkan dan diproses
2. Eksplorasi dan visualisasi data
3. Analisis untuk feature engineering
4. Pembuatan user-item matrix
5. Simulasi rekomendasi

Mari kita mulai!

In [None]:
# Import libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime

# Plot styling: matplotlib's 'ggplot' style sheet, then seaborn's "whitegrid" theme.
# NOTE(review): sns.set() applies seaborn's own rc defaults afterwards, so it
# probably overrides most of the 'ggplot' settings -- confirm which look is intended.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Put the parent of the current working directory on sys.path (presumably the
# project root when the notebook is run from a subfolder -- TODO confirm) so the
# `src` and `config` imports that follow can be resolved. The membership check
# keeps re-runs of this cell from appending duplicates.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import project modules
from src.collectors.coingecko_collector import CoinGeckoCollector
from src.processors.data_processor import DataProcessor
from src.models.matrix_builder import MatrixBuilder
from src.models.collaborative_filtering import CollaborativeFiltering
from src.models.feature_enhanced_cf import FeatureEnhancedCF
from src.utils.data_utils import load_csv_data, load_json_data, convert_json_columns
from config.config import RAW_DATA_PATH, PROCESSED_DATA_PATH

## 1. Loading Data

Mari kita muat data proyek Web3 yang telah dikumpulkan dan diproses sebelumnya.

In [None]:
def find_latest_file(directory, prefix, suffix='.csv'):
    """Return the path of the newest matching file in *directory*.

    "Newest" is the lexicographically greatest file name, so it is the most
    recent file only when the names embed a sortable timestamp after the prefix
    (e.g. ``prefix_YYYYMMDD_HHMMSS.csv``).

    Args:
        directory: Directory to scan (not recursive).
        prefix: Required start of the file name.
        suffix: Required end of the file name; defaults to ``'.csv'``.

    Returns:
        The joined path of the selected file, or ``None`` when the directory
        does not exist or contains no matching regular file.
    """
    try:
        entries = os.listdir(directory)
    except (FileNotFoundError, NotADirectoryError):
        # A missing data directory simply means "nothing collected yet"; returning
        # None lets callers take their fallback path instead of crashing.
        return None

    candidates = [
        entry for entry in entries
        if entry.startswith(prefix)
        and entry.endswith(suffix)
        # Skip sub-directories that happen to match the naming pattern.
        and os.path.isfile(os.path.join(directory, entry))
    ]
    if not candidates:
        return None
    return os.path.join(directory, max(candidates))

# Load the most recent processed projects file; when none exists, fall back to
# whatever DataProcessor can load.
projects_file = find_latest_file(PROCESSED_DATA_PATH, 'processed_projects_')
if projects_file:
    projects_df = pd.read_csv(projects_file)
    print(f"Loaded {len(projects_df)} projects from {os.path.basename(projects_file)}")
else:
    print("No processed projects file found. Loading directly from processor...")
    processor = DataProcessor()
    projects_df, _, _ = processor.load_latest_processed_data()
    if projects_df is None:
        print("Error: Projects data not available. Please run data collection first.")
    else:
        print(f"Loaded {len(projects_df)} projects from processor")

# Columns handed to convert_json_columns for decoding.
json_columns = ['platforms', 'categories']

# Only post-process and preview when something was actually loaded; otherwise
# the error message above would be followed by an unrelated failure on None.
if projects_df is not None:
    projects_df = convert_json_columns(projects_df, json_columns)

    # Inside an `if` the last expression is not auto-rendered by Jupyter, so the
    # preview is shown explicitly with display().
    display(projects_df.head())

In [None]:
# Load the most recent (synthetic) user-interactions file; when none exists, fall
# back to whatever DataProcessor can load.
interactions_file = find_latest_file(PROCESSED_DATA_PATH, 'user_interactions_')
if interactions_file:
    interactions_df = pd.read_csv(interactions_file)
    print(f"Loaded {len(interactions_df)} user interactions from {os.path.basename(interactions_file)}")
else:
    print("No user interactions file found. Loading directly from processor...")
    processor = DataProcessor()
    _, interactions_df, _ = processor.load_latest_processed_data()
    if interactions_df is None:
        print("Error: Interactions data not available.")
    else:
        print(f"Loaded {len(interactions_df)} interactions from processor")

# Preview the interactions. A bare `interactions_df.head()` inside an `if` block
# is evaluated but never rendered by Jupyter, so display() is called explicitly.
if interactions_df is not None:
    display(interactions_df.head())

## 2. Eksplorasi dan Visualisasi Data

Mari kita eksplorasi data untuk memahami karakteristiknya.

In [None]:
# Structural overview of the projects frame: info() prints its own report.
print("=== Project Data Statistics ===")
projects_df.info()

# Descriptive statistics for the numeric columns; leaving the summary as the
# cell's last expression makes the notebook render it as a table.
print("\n=== Descriptive Statistics for Numerical Features ===")
numeric_summary = projects_df.describe()
numeric_summary

In [None]:
# Market cap distribution on logarithmic axes.
# A log axis cannot represent zero, so missing values are dropped instead of
# being filled with 0, and only strictly positive caps are plotted
# (the `> 0` comparison is False for NaN, which removes missing values too).
market_cap_values = projects_df['market_cap']
positive_market_cap = market_cap_values[market_cap_values > 0]

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
# log_scale=True makes seaborn build the bins (and the KDE) in log space;
# binning linearly and only log-scaling the axis afterwards squeezes nearly all
# projects into the first few bars.
sns.histplot(positive_market_cap, bins=50, kde=True, log_scale=True)
plt.title('Market Cap Distribution')
plt.xlabel('Market Cap')

plt.subplot(1, 2, 2)
sns.boxplot(y=positive_market_cap)
plt.title('Market Cap Boxplot')
plt.ylabel('Market Cap')
plt.yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Trading volume distribution on logarithmic axes.
# A log axis cannot represent zero, so missing values are dropped instead of
# being filled with 0, and only strictly positive volumes are plotted
# (the `> 0` comparison is False for NaN, which removes missing values too).
volume_values = projects_df['total_volume']
positive_volume = volume_values[volume_values > 0]

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
# log_scale=True makes seaborn build the bins (and the KDE) in log space;
# binning linearly and only log-scaling the axis afterwards squeezes nearly all
# projects into the first few bars.
sns.histplot(positive_volume, bins=50, kde=True, log_scale=True)
plt.title('Trading Volume Distribution')
plt.xlabel('Trading Volume')

plt.subplot(1, 2, 2)
sns.boxplot(y=positive_volume)
plt.title('Trading Volume Boxplot')
plt.ylabel('Trading Volume')
plt.yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Histograms of price change over 24h / 7d / 30d, one panel per window.
plt.figure(figsize=(15, 5))

# (panel position, column, panel title, must the column exist?)
# The 24h column is always plotted; the 7d and 30d panels are skipped when the
# dataset does not carry those columns.
price_change_panels = [
    (1, 'price_change_percentage_24h', '24h Price Change Distribution', True),
    (2, 'price_change_percentage_7d_in_currency', '7d Price Change Distribution', False),
    (3, 'price_change_percentage_30d_in_currency', '30d Price Change Distribution', False),
]

for slot, column, heading, required in price_change_panels:
    if not required and column not in projects_df.columns:
        continue
    plt.subplot(1, 3, slot)
    sns.histplot(projects_df[column].dropna(), bins=50, kde=True)
    plt.title(heading)
    plt.xlabel('Price Change (%)')

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many projects fall into each primary category.
primary_category_counts = projects_df['primary_category'].value_counts()

plt.figure(figsize=(12, 6))
category_ax = sns.barplot(
    x=primary_category_counts.index,
    y=primary_category_counts.values,
)
category_ax.set_title('Distribution of Primary Categories')
category_ax.set_xlabel('Category')
category_ax.set_ylabel('Count')
# Slanted, right-aligned tick labels keep long category names readable.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many projects live on each blockchain.
chain_counts = projects_df['chain'].value_counts()

plt.figure(figsize=(12, 6))
chain_ax = sns.barplot(
    x=chain_counts.index,
    y=chain_counts.values,
)
chain_ax.set_title('Distribution of Blockchains')
chain_ax.set_xlabel('Blockchain')
chain_ax.set_ylabel('Count')
# Slanted, right-aligned tick labels keep long chain names readable.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap across the numeric project metrics.
# These columns are always included ...
numeric_cols = ['market_cap', 'total_volume', 'price_change_percentage_24h',
                'popularity_score', 'trend_score']

# ... and these are appended, in this order, only when the dataset has them.
optional_numeric_cols = (
    'price_change_percentage_7d_in_currency',
    'reddit_subscribers',
    'twitter_followers',
    'github_stars',
)
numeric_cols.extend(
    column for column in optional_numeric_cols if column in projects_df.columns
)

# Pairwise correlation of the selected columns
corr = projects_df[numeric_cols].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
heatmap_ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

## 3. Analisis untuk Feature Engineering

Mari kita analisis fitur-fitur yang mempengaruhi popularitas dan tren investasi proyek Web3.

In [None]:
# Scatter plot of market cap against trading volume, colored by primary
# category, with both axes on a logarithmic scale.
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=projects_df,
    x='market_cap',
    y='total_volume',
    hue='primary_category',
    palette='viridis',
    alpha=0.7,
)
scatter_ax.set_title('Market Cap vs Trading Volume by Category')
scatter_ax.set_xlabel('Market Cap')
scatter_ax.set_ylabel('Trading Volume')
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')
# Anchor the legend outside the plotting area so it does not cover the points.
scatter_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Regression plots of popularity score against candidate drivers, one panel each.
plt.figure(figsize=(15, 5))

# Shared styling: faint points, red regression line.
regplot_style = {'scatter_kws': {'alpha': 0.3}, 'line_kws': {'color': 'red'}}

# (panel position, column, axis label, must the column exist?)
# Market cap and volume are always plotted; the Twitter panel is skipped when
# the dataset has no follower counts.
popularity_panels = [
    (1, 'market_cap', 'Market Cap', True),
    (2, 'total_volume', 'Trading Volume', True),
    (3, 'twitter_followers', 'Twitter Followers', False),
]

for slot, column, label, required in popularity_panels:
    if not required and column not in projects_df.columns:
        continue
    plt.subplot(1, 3, slot)
    sns.regplot(x=column, y='popularity_score', data=projects_df, **regplot_style)
    plt.title(f'{label} vs Popularity Score')
    plt.xlabel(label)
    plt.ylabel('Popularity Score')
    plt.xscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Trend score against price changes (regression panels) and per category (boxplot).
plt.figure(figsize=(15, 5))

# Shared styling: faint points, red regression line.
regplot_style = {'scatter_kws': {'alpha': 0.3}, 'line_kws': {'color': 'red'}}

# (panel position, column, window label, must the column exist?)
# The 24h panel is always drawn; the 7d panel only when the column is present.
trend_panels = [
    (1, 'price_change_percentage_24h', '24h', True),
    (2, 'price_change_percentage_7d_in_currency', '7d', False),
]

for slot, column, window, required in trend_panels:
    if not required and column not in projects_df.columns:
        continue
    plt.subplot(1, 3, slot)
    sns.regplot(x=column, y='trend_score', data=projects_df, **regplot_style)
    plt.title(f'{window} Price Change vs Trend Score')
    plt.xlabel(f'{window} Price Change (%)')
    plt.ylabel('Trend Score')

# Third panel: spread of trend score within each primary category.
category_ax = plt.subplot(1, 3, 3)
sns.boxplot(x='primary_category', y='trend_score', data=projects_df, ax=category_ax)
category_ax.set_title('Trend Score by Category')
category_ax.set_xlabel('Category')
category_ax.set_ylabel('Trend Score')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Mean popularity score per primary category (top panel) and per blockchain,
# limited to the ten highest averages (bottom panel).
plt.figure(figsize=(15, 10))

mean_popularity_by_category = projects_df.groupby('primary_category')['popularity_score'].mean()
category_popularity = mean_popularity_by_category.sort_values(ascending=False)

top_ax = plt.subplot(2, 1, 1)
sns.barplot(x=category_popularity.index, y=category_popularity.values, ax=top_ax)
top_ax.set_title('Average Popularity Score by Category')
top_ax.set_xlabel('Category')
top_ax.set_ylabel('Average Popularity Score')
plt.xticks(rotation=45, ha='right')

mean_popularity_by_chain = projects_df.groupby('chain')['popularity_score'].mean()
chain_popularity = mean_popularity_by_chain.sort_values(ascending=False).head(10)

bottom_ax = plt.subplot(2, 1, 2)
sns.barplot(x=chain_popularity.index, y=chain_popularity.values, ax=bottom_ax)
bottom_ax.set_title('Average Popularity Score by Blockchain (Top 10)')
bottom_ax.set_xlabel('Blockchain')
bottom_ax.set_ylabel('Average Popularity Score')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Top-10 projects for each social metric that the dataset actually contains.
social_cols = ['reddit_subscribers', 'twitter_followers', 'github_stars']
available_social_cols = [metric for metric in social_cols if metric in projects_df.columns]

panel_count = len(available_social_cols)
if panel_count > 0:
    # One horizontal-bar panel per metric, stacked vertically (5 inches each).
    plt.figure(figsize=(15, 5 * panel_count))

    for position, metric in enumerate(available_social_cols, start=1):
        # Ten projects with the highest value for this metric
        ranked = projects_df.sort_values(metric, ascending=False)
        top_social = ranked.head(10)[['name', 'symbol', metric]]

        metric_ax = plt.subplot(panel_count, 1, position)
        sns.barplot(x=metric, y='name', data=top_social, ax=metric_ax)
        metric_ax.set_title(f'Top 10 Projects by {metric}')
        metric_ax.set_xlabel(metric)
        metric_ax.set_ylabel('Project')

    plt.tight_layout()
    plt.show()

## 4. Pembuatan User-Item Matrix

Mari kita buat dan visualisasikan user-item matrix dari data interaksi.

In [None]:
# Build the user-item matrix from the interactions and chart its contents.
if interactions_df is None:
    print("User interactions data not available. Cannot build user-item matrix.")
else:
    matrix_builder = MatrixBuilder()

    # The builder returns the matrix plus the user and item index mappings.
    user_item_df, user_indices, item_indices = matrix_builder.build_user_item_matrix(interactions_df)

    print(f"User-item matrix shape: {user_item_df.shape}")

    # Preview the top-left 5x5 corner
    print("\nSample of user-item matrix:")
    display(user_item_df.iloc[:5, :5])

    # Distribution of the interaction weights; cells that are not > 0 are treated
    # as "no interaction" and left out.
    plt.figure(figsize=(10, 6))

    all_cells = user_item_df.values.flatten()
    ratings = all_cells[all_cells > 0]

    sns.histplot(ratings, bins=5, kde=True)
    plt.title('Distribution of User Interaction Weights')
    plt.xlabel('Weight')
    plt.ylabel('Frequency')
    plt.xticks(range(1, 6))
    plt.show()

    # Sparsity = share of cells that hold no positive weight
    n_users, n_items = user_item_df.shape
    total_cells = n_users * n_items
    filled_cells = (user_item_df > 0).sum().sum()
    sparsity = 1 - (filled_cells / total_cells)

    print(f"\nMatrix sparsity: {sparsity:.4f} ({filled_cells} filled cells out of {total_cells} total cells)")

    empty_cells = total_cells - filled_cells
    plt.figure(figsize=(8, 4))
    plt.bar(['Filled Cells', 'Empty Cells'], [filled_cells, empty_cells])
    plt.title('User-Item Matrix Sparsity')
    plt.ylabel('Number of Cells')
    plt.show()

In [None]:
# Build the item-item similarity matrix and chart its off-diagonal values.
# Runs only when the user-item matrix was created by the previous cell.
if 'user_item_df' in locals():
    item_similarity_df = matrix_builder.build_item_similarity_matrix(user_item_df)

    print(f"Item similarity matrix shape: {item_similarity_df.shape}")

    # Preview the top-left 5x5 corner
    print("\nSample of item similarity matrix:")
    display(item_similarity_df.iloc[:5, :5])

    plt.figure(figsize=(10, 6))

    # Drop self-similarity by POSITION (the diagonal), not by value: a `< 1`
    # filter also discards distinct items whose similarity is exactly 1 and can
    # keep diagonal entries that floating-point rounding pushed off 1.0.
    similarity_values = item_similarity_df.values
    off_diagonal = ~np.eye(*similarity_values.shape, dtype=bool)
    similarities = similarity_values[off_diagonal]
    # Undefined similarities cannot be binned; leave them out of the histogram.
    similarities = similarities[~pd.isna(similarities)]

    sns.histplot(similarities, bins=50, kde=True)
    plt.title('Distribution of Item Similarities')
    plt.xlabel('Similarity')
    plt.ylabel('Frequency')
    plt.show()

## 5. Simulasi Rekomendasi

Mari kita simulasikan rekomendasi untuk beberapa user contoh.

In [None]:
# Instantiate the two recommenders used by the simulation cells that follow.
# NOTE(review): both are constructed without arguments; how (or whether) they
# load their own data is not visible in this notebook -- confirm before relying
# on their output.
cf = CollaborativeFiltering()
feature_cf = FeatureEnhancedCF()

In [None]:
def display_recommendations(recommendations, title):
    """Print up to ten recommendations under an underlined title.

    Args:
        recommendations: Sequence of recommendations, or ``None`` (treated as
            empty). Each entry is either a dict of project fields or an
            ``(item, score)`` pair.
        title: Heading printed above the list.

    For dict entries the score is read from the first of
    ``'recommendation_score'``, ``'similarity_score'`` or ``'score'`` that holds
    a non-None value, defaulting to 0. The ``'score'`` key matters because this
    notebook builds its CF result dicts with that key; without it those scores
    would always print as 0.0000.
    """
    print(f"\n{title}:")
    print("-" * len(title))

    if recommendations is None:
        # Nothing to list; still print the header so the section is visible.
        recommendations = []

    score_keys = ('recommendation_score', 'similarity_score', 'score')

    for i, rec in enumerate(recommendations[:10], 1):
        if isinstance(rec, dict):
            name = rec.get('name', rec.get('id', 'Unknown'))
            symbol = rec.get('symbol', '')
            # First score key holding a real value wins; a key that is present
            # but None is skipped so the float formatting below cannot fail on it.
            score = next(
                (rec[key] for key in score_keys if rec.get(key) is not None),
                0,
            )
            category = rec.get('primary_category', '')
            chain = rec.get('chain', '')

            print(f"{i}. {name} ({symbol}) - Score: {score:.4f}")
            print(f"   Category: {category}, Chain: {chain}")
        else:
            # Plain (item, score) pair
            print(f"{i}. {rec[0]} - Score: {rec[1]:.4f}")

In [None]:
# Simulate recommendations for a few sample users when every input is available.
# item_similarity_df is part of the guard because the feature-enhanced
# recommender call in the loop needs it; without the check a missing similarity
# matrix surfaced as a NameError halfway through the output.
if ('user_item_df' in locals() and 'projects_df' in locals()
        and 'item_similarity_df' in locals()):
    # Up to three sample users (slicing already copes with fewer than three rows).
    sample_users = user_item_df.index[:3].tolist()

    # One row per project id, indexed by id, so metadata lookups do not rescan
    # the whole frame for every item. Keeping the first duplicate matches the
    # "first matching row" choice of a boolean-mask lookup.
    project_lookup = projects_df.drop_duplicates('id').set_index('id')

    def _with_project_details(scored_items):
        """Turn (item_id, score) pairs into display dicts; unknown ids are skipped."""
        enriched = []
        for item, score in scored_items:
            if item not in project_lookup.index:
                continue
            details = project_lookup.loc[item].to_dict()
            enriched.append({
                **details,
                "id": item,
                "score": score,
                # display_recommendations reads the score from
                # 'recommendation_score' / 'similarity_score', so the CF score is
                # exposed under that key as well instead of printing as 0.
                "recommendation_score": score,
            })
        return enriched

    for user_id in sample_users:
        print(f"\n=== Recommendations for User: {user_id} ===")

        # Items this user has already interacted with (positive weight)
        user_ratings = user_item_df.loc[user_id]
        rated_items = user_ratings[user_ratings > 0]

        print(f"User has interacted with {len(rated_items)} items")
        print("\nTop 5 items user has interacted with:")
        for item_id, rating in rated_items.sort_values(ascending=False).head(5).items():
            if item_id in project_lookup.index:
                item_row = project_lookup.loc[item_id]
                print(f"- {item_row['name']} ({item_row['symbol']}) - Interaction weight: {rating}")

        # Generate each type of recommendation
        user_based_recs = cf.user_based_cf(user_id, n=10)
        item_based_recs = cf.item_based_cf(user_id, n=10)
        feature_recs = feature_cf.recommend_projects(user_id, user_item_df, item_similarity_df, projects_df, n=10)
        hybrid_recs = cf.hybrid_recommendations(user_id, n=10)

        # The CF methods are consumed as (item, score) pairs and enriched with
        # project metadata; the other two results are passed through unchanged.
        display_recommendations(_with_project_details(user_based_recs), "User-Based CF Recommendations")
        display_recommendations(_with_project_details(item_based_recs), "Item-Based CF Recommendations")
        display_recommendations(feature_recs, "Feature-Enhanced CF Recommendations")
        display_recommendations(hybrid_recs, "Hybrid Recommendations")
else:
    print("Required data not available. Cannot simulate recommendations.")

In [None]:
# Show the model's trending and popular project lists (10 entries each).
if 'projects_df' in locals():
    # Fetch both lists first, then print them in the same order.
    trending_projects, popular_projects = (
        cf.get_trending_projects(n=10),
        cf.get_popular_projects(n=10),
    )

    for project_list, heading in (
        (trending_projects, "Trending Projects"),
        (popular_projects, "Popular Projects"),
    ):
        display_recommendations(project_list, heading)

In [None]:
# Compare the four recommendation methods for a single user: pairwise overlap of
# the recommended items and the category mix of each list.
# Every name used in this cell is checked up front so that skipping an earlier
# cell leads to the comparison being skipped rather than to a NameError.
if ('user_item_df' in locals() and 'projects_df' in locals()
        and 'item_similarity_df' in locals() and 'sample_users' in locals()
        and len(sample_users) > 0):
    # Use the first sample user
    user_id = sample_users[0]

    # Recommended item ids per method
    user_based_recs = [item for item, _ in cf.user_based_cf(user_id, n=10)]
    item_based_recs = [item for item, _ in cf.item_based_cf(user_id, n=10)]
    feature_recs = [rec['id'] for rec in feature_cf.recommend_projects(user_id, user_item_df, item_similarity_df, projects_df, n=10)]
    hybrid_recs = [rec['id'] for rec in cf.hybrid_recommendations(user_id, n=10)]

    # Every unordered pair of methods, in a fixed order:
    # User/Item, User/Feature, User/Hybrid, Item/Feature, Item/Hybrid, Feature/Hybrid
    recs_by_method = [
        ('User-CF', user_based_recs),
        ('Item-CF', item_based_recs),
        ('Feature-CF', feature_recs),
        ('Hybrid', hybrid_recs),
    ]
    method_pairs = [
        (first, second)
        for position, first in enumerate(recs_by_method)
        for second in recs_by_method[position + 1:]
    ]
    labels = [f'{first[0]} & {second[0]}' for first, second in method_pairs]
    # Overlap = number of item ids recommended by both methods of the pair
    overlaps = [len(set(first[1]) & set(second[1])) for first, second in method_pairs]

    # Overlap bar chart
    plt.figure(figsize=(10, 6))
    sns.barplot(x=labels, y=overlaps)
    plt.title(f'Recommendation Overlap for User {user_id}')
    plt.xlabel('Recommendation Methods')
    plt.ylabel('Number of Common Recommendations')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Category diversity per method.
    # id -> primary category, built once instead of filtering the whole frame for
    # every recommended item; the first row wins when an id is duplicated.
    category_by_id = projects_df.drop_duplicates('id').set_index('id')['primary_category']

    plt.figure(figsize=(15, 10))

    for i, (recs, method) in enumerate([
        (user_based_recs, 'User-Based CF'),
        (item_based_recs, 'Item-Based CF'),
        (feature_recs, 'Feature-Enhanced CF'),
        (hybrid_recs, 'Hybrid')
    ], 1):
        # Categories of the recommended items; ids unknown to projects_df are skipped
        rec_categories = [
            category_by_id.loc[item] for item in recs if item in category_by_id.index
        ]

        plt.subplot(2, 2, i)
        # A method with no resolvable recommendations leaves its panel empty.
        if rec_categories:
            category_counts = pd.Series(rec_categories).value_counts()
            sns.barplot(x=category_counts.index, y=category_counts.values)
            plt.title(f'Category Distribution in {method} Recommendations')
            plt.xlabel('Category')
            plt.ylabel('Count')
            plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

## Kesimpulan

Berdasarkan eksplorasi dan simulasi yang telah dilakukan, kita dapat menyimpulkan beberapa hal penting:

1. **Karakteristik Data**:
   - Distribusi market cap dan volume perdagangan sangat skewed, dengan beberapa proyek yang sangat dominan
   - Perubahan harga menunjukkan volatilitas tinggi, khas pasar crypto
   - Kategori dan blockchain memiliki perbedaan popularitas yang signifikan

2. **Feature Engineering**:
   - Market cap, volume, dan metrik sosial merupakan indikator kuat dari popularitas
   - Perubahan harga jangka pendek dan menengah mempengaruhi tren
   - Kategori tertentu (seperti DeFi, NFT) memiliki karakteristik tren yang berbeda

3. **Rekomendasi**:
   - Sistem rekomendasi berbasis feature-enhanced CF dan hybrid memberikan hasil yang lebih beragam
   - Ada overlap yang signifikan antara metode rekomendasi berbeda, menunjukkan konsistensi
   - Metode hybrid menghasilkan rekomendasi yang lebih seimbang dalam hal kategori

Untuk pengembangan lebih lanjut, fokus dapat diberikan pada:
- Pengumpulan data interaksi nyata dari pengguna
- Penambahan fitur berbasis konten seperti deskripsi proyek dan analisis sentimen
- Implementasi evaluasi online dengan A/B testing
- Penyesuaian pembobotan dalam metode hybrid untuk meningkatkan akurasi