In [1]:
from datasets import load_dataset

def load_wikiart_dataset():
    dataset = load_dataset("huggan/wikiart", streaming=True, split="train")
    return dataset
dataset = load_wikiart_dataset()
dataset.column_names

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

['image', 'artist', 'genre', 'style']

## Tranformar as colunas em texto

In [2]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict

def process_streaming_dataset(dataset, max_items=50000):
    """Process streaming dataset and extract features with better data validation"""
    print(f"Processing streaming dataset (max {max_items} items)...")
    
    text_features = []
    metadata = []
    genre_counts = defaultdict(int)
    artist_counts = defaultdict(int)
    style_counts = defaultdict(int)
    
    for i, item in enumerate(dataset):
        if i >= max_items:
            break
            
        if i < 3:
            print(f"Item {i}: {item}")
            

        features = []
 
        artist = None
        style = None
        genre = None

        for field in ['artist', 'Artist', 'ARTIST']:
            if field in item and item[field]:
                artist = str(item[field]).strip()
                break
                
        for field in ['style', 'Style', 'STYLE']:
            if field in item and item[field]:
                style = str(item[field]).strip()
                break
                
        for field in ['genre', 'Genre', 'GENRE']:
            if field in item and item[field]:
                genre = str(item[field]).strip()
                break
        
        # Build features only if we have valid data
        if artist and artist.lower() not in ['none', 'null', '']:
            clean_artist = artist.replace(' ', '_').replace('-', '_')
            features.append(f"artist_{clean_artist}")
            artist_counts[artist] += 1
            
        if style and style.lower() not in ['none', 'null', '']:
            clean_style = style.replace(' ', '_').replace('-', '_')
            features.append(f"style_{clean_style}")
            style_counts[style] += 1
            
        if genre and genre.lower() not in ['none', 'null', '']:
            clean_genre = genre.replace(' ', '_').replace('-', '_')
            features.append(f"genre_{clean_genre}")
            genre_counts[genre] += 1
        
        # Create text feature - use a more descriptive fallback
        if features:
            text_feature = " ".join(features)
        else:
            # Create synthetic features based on item ID for testing
            text_feature = f"artwork_id_{i} category_general"
            
        text_features.append(text_feature)
        
        # Store metadata
        metadata.append({
            'id': i,
            'artist': artist if artist else 'Unknown',
            'style': style if style else 'Unknown',
            'genre': genre if genre else 'Unknown',
            'likes': 0
        })
        
        # Progress indicator - fix duplicate printing
        if (i + 1) % 1000 == 0:
            print(f"Processed {i + 1} items...")
    
    print(f"\nDataset summary:")
    print(f"Total items processed: {len(text_features)}")
    print(f"Unique genres: {len(genre_counts)}")
    print(f"Unique artists: {len(artist_counts)}")
    print(f"Unique styles: {len(style_counts)}")
    
    # Debug: Print sample text features
    print(f"\nSample text features:")
    for i in range(min(5, len(text_features))):
        print(f"  {i}: '{text_features[i]}'")
    
    return text_features, metadata, {
        'genres': dict(genre_counts),
        'artists': dict(artist_counts),
        'styles': dict(style_counts)
    }

In [3]:

def train_streaming_model(max_items=50000):
    """Train model using streaming dataset with relaxed parameters"""
    # Load streaming dataset
    dataset = load_wikiart_dataset()
    
    # Process streaming data
    text_features, metadata, stats = process_streaming_dataset(dataset, max_items)
    
    # Check if we have valid features
    if not text_features or all(f.strip() == "" for f in text_features):
        raise ValueError("No valid text features found in dataset")
    
    
    print("Training TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(
        max_features=1000,  
        ngram_range=(1, 1),
        min_df=1,
        max_df=0.99,
        lowercase=True,
        token_pattern=r'\b\w+\b'
    )
    
    try:
        tfidf_matrix = vectorizer.fit_transform(text_features)
        print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
        print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
        
        
        test_recommendations_streaming(tfidf_matrix, metadata, test_id=0)
        
        return vectorizer, tfidf_matrix, metadata, stats
        
    except ValueError as e:
        print(f"TF-IDF training failed: {e}")
        print("Creating fallback similarity matrix...")
        
        # Fallback: Create a simple identity matrix for testing
        n_samples = len(text_features)
        fallback_matrix = np.eye(n_samples)
        
        return None, fallback_matrix, metadata, stats

In [4]:
class StreamingRecommender:
    """Recommendation system using pre-trained streaming model"""
    
    def __init__(self, vectorizer=None, tfidf_matrix=None, metadata=None):
        self.vectorizer = vectorizer
        self.tfidf_matrix = tfidf_matrix
        self.metadata = metadata
    
    def get_recommendations(self, artwork_id, user_likes=None, n_recommendations=10):
        """Get recommendations based on artwork similarity and user likes"""
        if self.tfidf_matrix is None:
            return []
        
        # Calculate base similarity
        similarity_scores = cosine_similarity(
            self.tfidf_matrix[artwork_id:artwork_id+1], 
            self.tfidf_matrix
        ).flatten()
        
        # Boost based on user likes
        if user_likes:
            for liked_id in user_likes:
                if liked_id < len(similarity_scores):
                    liked_similarity = cosine_similarity(
                        self.tfidf_matrix[liked_id:liked_id+1],
                        self.tfidf_matrix
                    ).flatten()
                    similarity_scores += 0.3 * liked_similarity
        
        # Get top recommendations
        similar_indices = similarity_scores.argsort()[::-1]
        recommendations = []
        
        for idx in similar_indices:
            if idx != artwork_id and len(recommendations) < n_recommendations:
                artwork_info = self.metadata[idx].copy()
                artwork_info['similarity_score'] = float(similarity_scores[idx])
                recommendations.append(artwork_info)
        
        return recommendations

def test_recommendations_streaming(tfidf_matrix, metadata, test_id=0, n_recs=5):
    """Test recommendations with streaming data"""
    print(f"\nTesting recommendations for artwork ID {test_id}:")
    print(f"Test artwork: {metadata[test_id]}")
    
    # Calculate similarities
    similarities = cosine_similarity(
        tfidf_matrix[test_id:test_id+1], 
        tfidf_matrix
    ).flatten()
    
    # Get top recommendations
    similar_indices = similarities.argsort()[::-1][1:n_recs+1]
    
    print(f"\nTop {n_recs} recommendations:")
    for i, idx in enumerate(similar_indices):
        artwork = metadata[idx]
        print(f"{i+1}. Artist: {artwork['artist']}, Style: {artwork['style']}, "
              f"Genre: {artwork['genre']} (similarity: {similarities[idx]:.3f})")
    
    return similar_indices

# Main execution
if __name__ == "__main__":
    print("Starting streaming dataset training...")
    vectorizer, tfidf_matrix, metadata, stats = train_streaming_model(max_items=10000)
    
    # Create recommender
    recommender = StreamingRecommender(vectorizer, tfidf_matrix, metadata)
    
    # Test with different artworks
    for test_id in [0, 10, 100]:
        recommendations = recommender.get_recommendations(test_id, n_recommendations=3)
        print(f"\nRecommendations for artwork {test_id}:")
        for i, rec in enumerate(recommendations):
            print(f"{i+1}. {rec['artist']} - {rec['style']} (score: {rec['similarity_score']:.3f})")

    # Cell 5: Test recommendation system with user likes
    import random

    def test_recommendations_with_likes():
        """Test how user likes affect recommendations"""
        
        print("=" * 80)
        print("TESTING RECOMMENDATION SYSTEM WITH USER LIKES")
        print("=" * 80)
        
        # Choose a random test artwork
        test_artwork_id = random.randint(0, min(100, len(metadata)-1))
        test_artwork = metadata[test_artwork_id]
        
        print(f"\n🎨 TEST ARTWORK {test_artwork_id}:")
        print(f"   Artist: {test_artwork['artist']}")
        print(f"   Genre: {test_artwork['genre']}")
        print(f"   Style: {test_artwork['style']}")
        
        print(f"\n📊 SCENARIO 1: NO USER LIKES")
        print("-" * 50)
        recommendations_no_likes = recommender.get_recommendations(
            test_artwork_id, 
            user_likes=None, 
            n_recommendations=5
        )
        
        for i, rec in enumerate(recommendations_no_likes):
            print(f"   {i+1}. {rec['artist']} | {rec['genre']} | {rec['style']} "
                  f"(Score: {rec['similarity_score']:.3f})")
        
        # Generate random likes
        random_likes = random.sample(range(0, min(500, len(metadata))), 3)
        print(f"\n💝 SCENARIO 2: WITH USER LIKES {random_likes}")
        print("-" * 50)
        
        # Show what the user "liked"
        print("   User previously liked:")
        for like_id in random_likes:
            liked_artwork = metadata[like_id]
            print(f"     - {liked_artwork['artist']} | {liked_artwork['genre']} | {liked_artwork['style']}")
        
        print("\n   New recommendations:")
        recommendations_with_likes = recommender.get_recommendations(
            test_artwork_id, 
            user_likes=random_likes, 
            n_recommendations=5
        )
        
        for i, rec in enumerate(recommendations_with_likes):
            print(f"   {i+1}. {rec['artist']} | {rec['genre']} | {rec['style']} "
                  f"(Score: {rec['similarity_score']:.3f})")
        
        # Analyze differences
        print(f"\n🔍 ANALYSIS:")
        print("-" * 50)
        
        no_likes_ids = [rec['id'] for rec in recommendations_no_likes]
        with_likes_ids = [rec['id'] for rec in recommendations_with_likes]
        
        common_recs = set(no_likes_ids) & set(with_likes_ids)
        different_recs = len(set(no_likes_ids) ^ set(with_likes_ids))
        
        print(f"   Common recommendations: {len(common_recs)}/5")
        print(f"   Changed recommendations: {different_recs}")
        print(f"   Algorithm impact: {'HIGH' if different_recs >= 3 else 'MEDIUM' if different_recs >= 1 else 'LOW'}")

    # Run the test multiple times
    for test_round in range(3):
        print(f"\n🎯 TEST ROUND {test_round + 1}")
        test_recommendations_with_likes()
        print("\n" + "="*80)

    # Cell 6: Enhanced recommendation system
    class EnhancedRecommendationSystem:
        """Enhanced recommendation system with better like handling"""
        
        def __init__(self, vectorizer, tfidf_matrix, metadata):
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix
            self.metadata = metadata
        
        def get_recommendations(self, artwork_id, user_likes=None, n_recommendations=10, like_boost=0.5):
            """Get recommendations with configurable like boost"""
            if self.tfidf_matrix is None:
                return []
            
            # Calculate base similarity
            base_similarities = cosine_similarity(
                self.tfidf_matrix[artwork_id:artwork_id+1], 
                self.tfidf_matrix
            ).flatten()
            
            # Initialize final scores with base similarities
            final_scores = base_similarities.copy()
            
            # Apply user likes boost
            if user_likes and len(user_likes) > 0:
                print(f"   Applying like boost ({like_boost}) based on {len(user_likes)} liked artworks...")
                
                for liked_id in user_likes:
                    if 0 <= liked_id < len(final_scores):
                        # Calculate similarity to liked artwork
                        liked_similarities = cosine_similarity(
                            self.tfidf_matrix[liked_id:liked_id+1],
                            self.tfidf_matrix
                        ).flatten()
                        
                        # Boost scores based on similarity to liked artwork
                        final_scores += like_boost * liked_similarities
            
            # Get top recommendations (excluding source artwork)
            sorted_indices = final_scores.argsort()[::-1]
            recommendations = []
            
            for idx in sorted_indices:
                if idx != artwork_id and len(recommendations) < n_recommendations:
                    artwork_info = self.metadata[idx].copy()
                    artwork_info['similarity_score'] = float(final_scores[idx])
                    artwork_info['base_similarity'] = float(base_similarities[idx])
                    recommendations.append(artwork_info)
            
            return recommendations
        
        def analyze_recommendation_changes(self, artwork_id, user_likes=None):
            """Analyze how likes change recommendations"""
            
            # Get recommendations without likes
            no_likes_recs = self.get_recommendations(artwork_id, user_likes=None, n_recommendations=10)
            
            # Get recommendations with likes
            with_likes_recs = self.get_recommendations(artwork_id, user_likes=user_likes, n_recommendations=10)
            
            print(f"\n📈 RECOMMENDATION ANALYSIS FOR ARTWORK {artwork_id}")
            print("-" * 60)
            
            print("Without Likes vs With Likes:")
            print(f"{'Rank':<4} {'No Likes':<20} {'With Likes':<20} {'Score Change':<12}")
            print("-" * 60)
            
            for i in range(min(5, len(no_likes_recs), len(with_likes_recs))):
                no_like_rec = no_likes_recs[i]
                with_like_rec = with_likes_recs[i]
                
                score_change = with_like_rec['similarity_score'] - no_like_rec['base_similarity']
                
                print(f"{i+1:<4} {no_like_rec['artist'][:18]:<20} {with_like_rec['artist'][:18]:<20} {score_change:+.3f}")

    # Create enhanced recommender
    enhanced_recommender = EnhancedRecommendationSystem(vectorizer, tfidf_matrix, metadata)

    # Test with specific scenarios
    def test_specific_scenarios():
        """Test specific recommendation scenarios"""
        
        print("🧪 TESTING SPECIFIC SCENARIOS")
        print("=" * 80)
        
        # Scenario 1: Like similar artworks
        test_id = 50
        similar_likes = [51, 52, 53]  # Nearby IDs likely to be similar
        
        print(f"\n📍 SCENARIO: Like similar artworks")
        enhanced_recommender.analyze_recommendation_changes(test_id, similar_likes)
        
        # Scenario 2: Like diverse artworks
        diverse_likes = [10, 200, 500]  # Spread out IDs
        
        print(f"\n📍 SCENARIO: Like diverse artworks")
        enhanced_recommender.analyze_recommendation_changes(test_id, diverse_likes)

    # Run the tests
    test_specific_scenarios()

Starting streaming dataset training...


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Processing streaming dataset (max 10000 items)...
Item 0: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x597 at 0x207C64C0BF0>, 'artist': 22, 'genre': 4, 'style': 21}
Item 1: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1382x1659 at 0x207C6540230>, 'artist': 20, 'genre': 7, 'style': 4}
Item 2: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1382x1673 at 0x207C6540350>, 'artist': 16, 'genre': 6, 'style': 20}
Item 0: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=750x597 at 0x207C64C0BF0>, 'artist': 22, 'genre': 4, 'style': 21}
Item 1: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1382x1659 at 0x207C6540230>, 'artist': 20, 'genre': 7, 'style': 4}
Item 2: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1382x1673 at 0x207C6540350>, 'artist': 16, 'genre': 6, 'style': 20}
Processed 1000 items...
Processed 1000 items...
Processed 2000 items...
Processed 2000 items...
Processed 3000

In [5]:
# Save model
import os
import pickle
import json
from scipy.sparse import save_npz

def save_model_for_django():
    """Save trained model components for Django integration"""
    
    # Create models directory
    models_dir = "../../models"
    os.makedirs(models_dir, exist_ok=True)
    
    print("💾 SAVING MODEL FOR DJANGO...")
    
    # Save vectorizer
    with open(f"{models_dir}/vectorizer.pkl", 'wb') as f:
        pickle.dump(vectorizer, f)
    print("✅ Vectorizer saved")
    
    # Save TF-IDF matrix (sparse format)
    save_npz(f"{models_dir}/tfidf_matrix.npz", tfidf_matrix)
    print("✅ TF-IDF matrix saved")
    
    # Save metadata as JSON
    with open(f"{models_dir}/metadata.json", 'w') as f:
        json.dump(metadata, f, indent=2)
    print("✅ Metadata saved")
    
    # Save model info
    model_info = {
        "n_artworks": len(metadata),
        "n_features": tfidf_matrix.shape[1],
        "vocabulary_size": len(vectorizer.vocabulary_),
        "unique_artists": len(set(m['artist'] for m in metadata)),
        "unique_genres": len(set(m['genre'] for m in metadata)),
        "unique_styles": len(set(m['style'] for m in metadata)),
        "model_version": "1.0"
    }
    
    with open(f"{models_dir}/model_info.json", 'w') as f:
        json.dump(model_info, f, indent=2)
    print("✅ Model info saved")
    
    print(f"\n🎯 Model saved to: {models_dir}")
    print(f"📊 {model_info['n_artworks']} artworks, {model_info['vocabulary_size']} features")
    
    return models_dir

# Execute after training
if 'vectorizer' in locals() and vectorizer is not None:
    save_model_for_django()
    print("✅ MODEL READY FOR DJANGO!")
else:
    print("❌ Train the model first by running previous cells")

💾 SAVING MODEL FOR DJANGO...
✅ Vectorizer saved
✅ TF-IDF matrix saved
✅ Metadata saved
✅ Model info saved

🎯 Model saved to: ../../models
📊 10000 artworks, 48 features
✅ MODEL READY FOR DJANGO!
✅ TF-IDF matrix saved
✅ Metadata saved
✅ Model info saved

🎯 Model saved to: ../../models
📊 10000 artworks, 48 features
✅ MODEL READY FOR DJANGO!


In [6]:
# Cell: Test Django Integration
def test_django_integration():
    """Test that model can be loaded like Django would"""
    
    # Simulate Django loading
    import json
    from scipy.sparse import load_npz
    import pickle
    
    models_dir = "../../models"
    
    # Load like Django would
    with open(f"{models_dir}/vectorizer.pkl", 'rb') as f:
        test_vectorizer = pickle.load(f)
    
    test_matrix = load_npz(f"{models_dir}/tfidf_matrix.npz")
    
    with open(f"{models_dir}/metadata.json", 'r') as f:
        test_metadata = json.load(f)
    
    print("🧪 TESTING DJANGO INTEGRATION")
    print(f"✅ Vectorizer loaded: {len(test_vectorizer.vocabulary_)} features")
    print(f"✅ Matrix loaded: {test_matrix.shape}")
    print(f"✅ Metadata loaded: {len(test_metadata)} artworks")
    
    # Test recommendation
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(test_matrix[0:1], test_matrix).flatten()
    top_3 = similarities.argsort()[::-1][1:4]
    
    print(f"\n🎯 Test recommendation for artwork 0:")
    for i, idx in enumerate(top_3):
        artwork = test_metadata[idx]
        print(f"   {i+1}. {artwork['artist']} | {artwork['genre']} | {artwork['style']}")
    
    print("\n✅ DJANGO INTEGRATION READY!")

# Run the test
test_django_integration()

🧪 TESTING DJANGO INTEGRATION
✅ Vectorizer loaded: 48 features
✅ Matrix loaded: (10000, 48)
✅ Metadata loaded: 10000 artworks

🎯 Test recommendation for artwork 0:
   1. 22 | 4 | 21
   2. 22 | 4 | 21
   3. 22 | 4 | 21

✅ DJANGO INTEGRATION READY!
