In [1]:
pwd


'/Users/spartan/Downloads/kafka_2.13-3.8.0/spotify_realtime/enhanced_spotify_streaming'

In [7]:
# spotify_fm_analysis.py

import mmh3
import numpy as np
from tabulate import tabulate
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

class SpotifyFM:
    def __init__(self, client_id, client_secret):
        """Initialize with Spotify credentials"""
        self.spotify = spotipy.Spotify(
            client_credentials_manager=SpotifyClientCredentials(
                client_id=client_id,
                client_secret=client_secret
            )
        )
        self.fm_estimator = FlajoletMartin(num_estimators=32)
        
    def get_playlist_tracks(self, playlist_id):
        """Get tracks from a Spotify playlist"""
        results = self.spotify.playlist_tracks(playlist_id)
        tracks = []
        
        print("\nFetching playlist data...")
        for item in results['items']:
            if item['track']:
                track_info = {
                    'name': item['track']['name'],
                    'artist': item['track']['artists'][0]['name'],
                    'popularity': item['track']['popularity']
                }
                tracks.append(track_info)
                print(f"Found track: {track_info['name']} by {track_info['artist']}")
        
        return tracks
    
    def analyze_unique_tracks(self, playlist_id):
        """Analyze unique tracks in a playlist using FM algorithm"""
        tracks = self.get_playlist_tracks(playlist_id)
        
        print("\n=== Playlist Analysis ===")
        print(f"Total tracks fetched: {len(tracks)}")
        
        # Get actual unique count
        unique_tracks = set(track['name'] for track in tracks)
        print(f"Actual unique tracks: {len(unique_tracks)}")
        
        # Use FM algorithm
        for track in tracks:
            self.fm_estimator.add(track['name'])
        
        estimate = self.fm_estimator.estimate()
        error_percentage = abs(estimate - len(unique_tracks)) / len(unique_tracks) * 100
        
        print("\nFlajolet-Martin Analysis Results:")
        print(f"Estimated unique tracks: {estimate:.2f}")
        print(f"Error percentage: {error_percentage:.2f}%")
        
        # Show top artists
        artist_counts = {}
        for track in tracks:
            artist = track['artist']
            artist_counts[artist] = artist_counts.get(artist, 0) + 1
        
        print("\nTop Artists in Playlist:")
        sorted_artists = sorted(artist_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        for artist, count in sorted_artists:
            print(f"- {artist}: {count} tracks")

class FlajoletMartin:
    """Flajolet-Martin Algorithm Implementation"""
    def __init__(self, num_estimators=32):
        self.num_estimators = num_estimators
        self.max_zeros = [0] * num_estimators
        self.item_history = {}
    
    def add(self, item):
        if item not in self.item_history:
            self.item_history[item] = []
        
        for i in range(self.num_estimators):
            hash_val = mmh3.hash(str(item), seed=i)
            binary = bin(hash_val)[2:] if hash_val >= 0 else bin(hash_val)[3:]
            trailing_zeros = len(binary) - len(binary.rstrip('0'))
            
            self.item_history[item].append({
                'estimator': i,
                'hash_value': hash_val,
                'trailing_zeros': trailing_zeros
            })
            
            self.max_zeros[i] = max(self.max_zeros[i], trailing_zeros)
    
    def estimate(self):
        avg_zeros = sum(self.max_zeros) / self.num_estimators
        return 2 ** avg_zeros
    
    def explain_estimate(self):
        """Explain the estimation process"""
        print("\n=== Estimation Process Details ===")
        print(f"Number of estimators used: {self.num_estimators}")
        print(f"Average trailing zeros: {sum(self.max_zeros) / self.num_estimators:.2f}")
        print("\nSample of processed items:")
        
        # Show details for first 3 items
        for i, (item, hashes) in enumerate(list(self.item_history.items())[:3]):
            print(f"\nItem {i+1}: {item}")
            data = []
            for h in hashes[:3]:  # Show first 3 hash functions
                data.append([
                    h['estimator'],
                    h['hash_value'],
                    h['trailing_zeros']
                ])
            print(tabulate(data, 
                         headers=['Hash Function', 'Hash Value', 'Trailing Zeros'],
                         tablefmt='grid'))

def main():
    # Your Spotify API credentials
    CLIENT_ID = '95a8aadaabde43e2977306a50410c381'
    CLIENT_SECRET = 'dcb6c9927af244d2a293d52d6634c21d'
    
    # Initialize SpotifyFM analyzer
    spotify_fm = SpotifyFM(CLIENT_ID, CLIENT_SECRET)
    
    # Analyze different playlists
    playlists = [
        ('37i9dQZEVXbMDoHDwVN2tF', 'Global Top 50'),
        ('37i9dQZEVXbLRQDuF5jeBp', 'US Top 50'),
    ]
    
    for playlist_id, name in playlists:
        print(f"\n=== Analyzing Playlist: {name} ===")
        spotify_fm.analyze_unique_tracks(playlist_id)
        spotify_fm.fm_estimator.explain_estimate()
        time.sleep(1)  # Respect API rate limits

if __name__ == "__main__":
    main()


=== Analyzing Playlist: Global Top 50 ===

Fetching playlist data...
Found track: APT. by ROSÉ
Found track: Die With A Smile by Lady Gaga
Found track: luther (with sza) by Kendrick Lamar
Found track: squabble up by Kendrick Lamar
Found track: BIRDS OF A FEATHER by Billie Eilish
Found track: tv off (feat. lefty gunplay) by Kendrick Lamar
Found track: That’s So True by Gracie Abrams
Found track: Who by Jimin
Found track: wacced out murals by Kendrick Lamar
Found track: All I Want for Christmas Is You by Mariah Carey
Found track: Sailor Song by Gigi Perez
Found track: Running Wild by Jin
Found track: Last Christmas - Single Version by Wham!
Found track: Tu Boda by Oscar Maydon
Found track: hey now (feat. dody6) by Kendrick Lamar
Found track: reincarnated by Kendrick Lamar
Found track: man at the garden by Kendrick Lamar
Found track: WILDFLOWER by Billie Eilish
Found track: Espresso by Sabrina Carpenter
Found track: dodger blue (feat. wallie the sensei, siete7x, roddy ricch) by Kendrick L

In [1]:
import mmh3  # for Bloom Filter
import math
import random
from collections import defaultdict
            
class BloomFilter:
    def __init__(self, size, num_hash_functions):
        self.size = size  
        self.num_hash_functions = num_hash_functions
        self.bit_array = [0] * size
    
    def add(self, item):
        for seed in range(self.num_hash_functions):
            index = mmh3.hash(str(item), seed) % self.size
            self.bit_array[index] = 1
    
    def check(self, item):
        for seed in range(self.num_hash_functions):
            index = mmh3.hash(str(item), seed) % self.size   
            if self.bit_array[index] == 0:
                return False
        return True
    
class ReservoirSampling:
    def __init__(self, k):
        self.k = k
        self.reservoir = []
        self.count = 0
        
    def add(self, item):
        self.count += 1
        if len(self.reservoir) < self.k: 
            self.reservoir.append(item)
        else:
            j = random.randrange(self.count)
            if j < self.k:
                self.reservoir[j] = item
     
    def get_sample(self):
        return self.reservoir

def apply_streaming_algorithms(df):
    """Apply streaming algorithms to the data"""
    # Initialize Bloom Filter for popular songs (popularity > 90)
    bloom = BloomFilter(size=1000, num_hash_functions=3)

    # Initialize Reservoir Sampling for random song selection
    reservoir = ReservoirSampling(k=10)

    # Process each song
    for song in df.collect():
        # Add highly popular songs to Bloom Filter
        if song.popularity > 90:
            bloom.add(song.name)

        # Add all songs to reservoir sampling
        reservoir.add({
            'name': song.name,
            'artist': song.artist,
            'popularity': song.popularity
        })
        
    return {
        'bloom_filter': bloom,
        'reservoir_sample': reservoir.get_sample()
    }
    
        

        

In [None]:
# fairness_spotify.py

from kafka import KafkaConsumer
import json
from collections import defaultdict, deque
import math
import numpy as np
from datetime import datetime

class FairnessMetrics:
    def __init__(self, window_size=100):
        # Window size for sliding window analysis
        self.window_size = window_size
        
        # Sliding windows for different attributes
        self.track_window = deque(maxlen=window_size)
        self.artist_stats = defaultdict(int)
        self.genre_stats = defaultdict(int)
        
        # Track statistics
        self.total_tracks = 0
        self.total_artists = set()
        self.total_genres = set()
        
        # Thresholds
        self.popularity_threshold = 80
        self.visibility_threshold = 0.7
        
        # Protected attributes tracking
        self.protected_attributes = {
            'artist_exposure': defaultdict(float),
            'genre_representation': defaultdict(float),
            'popularity_distribution': defaultdict(list)
        }
    
    def add_track(self, track_data):
        """Add a track to the sliding window"""
        try:
            # Add to window
            self.track_window.append(track_data)
            
            # Update statistics
            self.total_tracks += 1
            self.total_artists.add(track_data['artist'])
            
            # Update artist stats
            self.artist_stats[track_data['artist']] += 1
            
            # Update protected attributes
            self._update_protected_attributes(track_data)
            
            return self.calculate_fairness_metrics()
            
        except Exception as e:
            print(f"Error adding track to fairness metrics: {e}")
            return None

    def _update_protected_attributes(self, track_data):
        """Update protected attributes statistics"""
        artist = track_data['artist']
        popularity = float(track_data.get('popularity', 0))
        
        # Update artist exposure
        self.protected_attributes['artist_exposure'][artist] = (
            self.artist_stats[artist] / self.total_tracks
        )
        
        # Update popularity distribution
        self.protected_attributes['popularity_distribution'][artist].append(popularity)
        if len(self.protected_attributes['popularity_distribution'][artist]) > self.window_size:
            self.protected_attributes['popularity_distribution'][artist].pop(0)

    def calculate_fairness_metrics(self):
        """Calculate comprehensive fairness metrics"""
        metrics = {
            'representation_metrics': self._calculate_representation_metrics(),
            'exposure_metrics': self._calculate_exposure_metrics(),
            'popularity_metrics': self._calculate_popularity_metrics(),
            'opportunity_metrics': self._calculate_opportunity_metrics()
        }
        return metrics

    def _calculate_representation_metrics(self):
        """Calculate representation balance using entropy"""
        metrics = {}
        
        # Artist representation entropy
        artist_counts = np.array(list(self.artist_stats.values()))
        total_tracks = sum(artist_counts)
        
        if total_tracks > 0:
            probabilities = artist_counts / total_tracks
            entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))
            max_entropy = np.log2(len(self.artist_stats))
            normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0
            
            metrics['artist_representation'] = {
                'entropy': entropy,
                'normalized_entropy': normalized_entropy,
                'unique_artists': len(self.total_artists)
            }
        
        return metrics

    def _calculate_exposure_metrics(self):
        """Calculate exposure fairness metrics"""
        exposures = list(self.protected_attributes['artist_exposure'].values())
        
        if exposures:
            return {
                'min_exposure': min(exposures),
                'max_exposure': max(exposures),
                'exposure_disparity': max(exposures) - min(exposures),
                'exposure_std': np.std(exposures)
            }
        return {}

    def _calculate_popularity_metrics(self):
        """Calculate popularity-based fairness metrics"""
        metrics = {}
        
        # Calculate popularity disparities across artists
        artist_avg_popularity = {}
        for artist, popularities in self.protected_attributes['popularity_distribution'].items():
            if popularities:
                artist_avg_popularity[artist] = np.mean(popularities)
        
        if artist_avg_popularity:
            popularities = list(artist_avg_popularity.values())
            metrics['popularity_disparity'] = max(popularities) - min(popularities)
            metrics['popularity_std'] = np.std(popularities)
            metrics['popularity_range'] = {
                'min': min(popularities),
                'max': max(popularities)
            }
        
        return metrics

    def _calculate_opportunity_metrics(self):
        """Calculate equal opportunity metrics"""
        metrics = {}
        
        # Calculate opportunity rates (chance of high popularity)
        opportunity_rates = {}
        for artist, popularities in self.protected_attributes['popularity_distribution'].items():
            if popularities:
                high_popularity_rate = sum(1 for p in popularities if p >= self.popularity_threshold)
                opportunity_rates[artist] = high_popularity_rate / len(popularities)
        
        if opportunity_rates:
            rates = list(opportunity_rates.values())
            metrics['opportunity_disparity'] = max(rates) - min(rates)
            metrics['opportunity_std'] = np.std(rates)
            metrics['min_opportunity'] = min(rates)
            metrics['max_opportunity'] = max(rates)
        
        return metrics

    def get_fairness_summary(self):
        """Get a human-readable summary of fairness metrics"""
        metrics = self.calculate_fairness_metrics()
        
        summary = []
        summary.append("\nFairness Metrics Summary")
        summary.append("=" * 50)
        
        # Representation metrics
        if 'representation_metrics' in metrics:
            rep = metrics['representation_metrics']
            summary.append("\nRepresentation Metrics:")
            summary.append(f"- Normalized Artist Entropy: {rep.get('normalized_entropy', 0):.3f}")
            summary.append(f"- Unique Artists: {rep.get('unique_artists', 0)}")
        
        # Exposure metrics
        if 'exposure_metrics' in metrics:
            exp = metrics['exposure_metrics']
            summary.append("\nExposure Metrics:")
            summary.append(f"- Exposure Disparity: {exp.get('exposure_disparity', 0):.3f}")
            summary.append(f"- Exposure Std Dev: {exp.get('exposure_std', 0):.3f}")
        
        # Popularity metrics
        if 'popularity_metrics' in metrics:
            pop = metrics['popularity_metrics']
            summary.append("\nPopularity Metrics:")
            summary.append(f"- Popularity Disparity: {pop.get('popularity_disparity', 0):.3f}")
            summary.append(f"- Popularity Std Dev: {pop.get('popularity_std', 0):.3f}")
        
        # Opportunity metrics
        if 'opportunity_metrics' in metrics:
            opp = metrics['opportunity_metrics']
            summary.append("\nOpportunity Metrics:")
            summary.append(f"- Opportunity Disparity: {opp.get('opportunity_disparity', 0):.3f}")
            summary.append(f"- Min Opportunity: {opp.get('min_opportunity', 0):.3f}")
            summary.append(f"- Max Opportunity: {opp.get('max_opportunity', 0):.3f}")
        
        return "\n".join(summary)

def main():
    print("Starting Spotify Fairness Analysis...")
    
    try:
        # Create Kafka consumer
        consumer = KafkaConsumer(
            'spotify_stream',
            bootstrap_servers=['localhost:9092'],
            auto_offset_reset='latest',
            value_deserializer=lambda x: json.loads(x.decode('utf-8')),
            group_id='fairness_group'
        )
        
        # Initialize fairness metrics
        fairness_analyzer = FairnessMetrics(window_size=100)
        
        print("Waiting for messages...")
        
        # Process messages
        for message in consumer:
            try:
                track_data = message.value
                
                # Add track and get updated metrics
                fairness_analyzer.add_track(track_data)
                
                # Print current fairness summary
                print(f"\nProcessing track: {track_data['name']} by {track_data['artist']}")
                print(fairness_analyzer.get_fairness_summary())
                
            except Exception as e:
                print(f"Error processing message: {e}")
            
    except KeyboardInterrupt:
        print("\nAnalysis stopped by user")
    except Exception as e:
        print(f"Error in main loop: {e}")
    finally:
        if 'consumer' in locals():
            consumer.close()

if __name__ == "__main__":
    main()

Starting Spotify Fairness Analysis...
Waiting for messages...

Processing track: APT. by ROSÉ

Fairness Metrics Summary

Representation Metrics:
- Normalized Artist Entropy: 0.000
- Unique Artists: 0

Exposure Metrics:
- Exposure Disparity: 0.000
- Exposure Std Dev: 0.000

Popularity Metrics:
- Popularity Disparity: 0.000
- Popularity Std Dev: 0.000

Opportunity Metrics:
- Opportunity Disparity: 0.000
- Min Opportunity: 1.000
- Max Opportunity: 1.000

Processing track: Die With A Smile by Lady Gaga

Fairness Metrics Summary

Representation Metrics:
- Normalized Artist Entropy: 0.000
- Unique Artists: 0

Exposure Metrics:
- Exposure Disparity: 0.500
- Exposure Std Dev: 0.250

Popularity Metrics:
- Popularity Disparity: 2.000
- Popularity Std Dev: 1.000

Opportunity Metrics:
- Opportunity Disparity: 0.000
- Min Opportunity: 1.000
- Max Opportunity: 1.000

Processing track: luther (with sza) by Kendrick Lamar

Fairness Metrics Summary

Representation Metrics:
- Normalized Artist Entropy: 

In [None]:
pwd