<a href="https://colab.research.google.com/github/eagning1/DE1-G29/blob/main/DE1_G29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#pip install nltk

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from collections import Counter, defaultdict
import re
import warnings
from timeit import default_timer as timer
warnings.filterwarnings('ignore')

In [3]:
# Download the VADER lexicon only if it is not already installed.
# nltk.data.find() takes a resource path relative to an nltk_data directory
# ("sentiment/vader_lexicon.zip"), not the bare package id: with the bare id the
# lookup always failed and the lexicon was re-downloaded on every run.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\erkoo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Set the style for our plots
# NOTE(review): sns.set() is called after the ggplot style is selected, so it
# presumably overrides at least part of that style with seaborn's defaults —
# confirm which look is intended. font_scale=1.2 enlarges the plot fonts.
plt.style.use('ggplot')
sns.set(font_scale=1.2)

In [5]:
class LyricsAnalyzer:
    def __init__(self, matches_file, train_file):
        """
        Create an analyzer bound to the two dataset files.

        The dataset files are not opened here; ``load_matches`` and
        ``load_lyrics`` read them on demand.

        Parameters:
        -----------
        matches_file : str
            Path to the mxm_779k_matches.txt file
        train_file : str
            Path to the mxm_dataset_train.txt file
        """
        # Where the raw data lives
        self.matches_file, self.train_file = matches_file, train_file

        # State filled in later by the load_* / create_* methods
        self.top_words = []
        self.matches_df = self.lyrics_df = self.word_counts = None

        # VADER scorer used by calculate_sentiment
        self.sentiment_analyzer = SentimentIntensityAnalyzer()
        
    def load_matches(self):
        """Load and parse the matches file"""
        print("Loading matches data...")
        # Define column names based on the file format
        columns = ['msd_track_id', 'msd_artist_name', 'msd_title', 'mxm_track_id', 'mxm_artist_name', 'mxm_title']
        
        # Read the file
        data = []
        with open(self.matches_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('#'):  # Skip comment lines
                    continue
                parts = line.strip().split('<SEP>')
                if len(parts) == 6:  # Ensure we have all fields
                    data.append(parts)
        
        # Create DataFrame
        self.matches_df = pd.DataFrame(data, columns=columns)
        print(f"Loaded {len(self.matches_df)} song matches.")
        return self.matches_df
    
    def load_lyrics(self):
        """Load and parse the lyrics training file"""
        print("Loading lyrics data...")
        # First, extract the top words list
        self.top_words = []
        word_counts_data = []
        
        with open(self.train_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('#'):  # Skip comment lines
                    continue
                elif line.startswith('%'):  # Extract top words
                    self.top_words = line[1:].strip().split(',')
                else:
                    # Parse the word count line
                    parts = line.strip().split(',')
                    if len(parts) >= 2:
                        track_id = parts[0]
                        mxm_track_id = parts[1]
                        
                        # Create a sparse word count dictionary
                        word_counts = {}
                        for item in parts[2:]:
                            if ':' in item:
                                idx, count = item.split(':')
                                # Convert to 0-based index and ensure it's within range
                                word_idx = int(idx) - 1  # 1-based to 0-based
                                if word_idx < len(self.top_words):
                                    word_counts[self.top_words[word_idx]] = int(count)
                        
                        word_counts_data.append({
                            'track_id': track_id,
                            'mxm_track_id': mxm_track_id,
                            'word_counts': word_counts
                        })
        
        # Create DataFrame
        self.lyrics_df = pd.DataFrame(word_counts_data)
        print(f"Loaded lyrics data for {len(self.lyrics_df)} songs with {len(self.top_words)} vocabulary words.")
        return self.lyrics_df
    
    def merge_data(self):
        """Merge the matches and lyrics data"""
        if self.matches_df is None:
            self.load_matches()
        if self.lyrics_df is None:
            self.load_lyrics()
        
        # Merge on mxm_track_id
        merged_df = pd.merge(
            self.lyrics_df,
            self.matches_df,
            left_on='mxm_track_id',
            right_on='mxm_track_id',
            how='inner'
        )
        
        print(f"Merged data contains {len(merged_df)} songs.")
        return merged_df
    
    def create_word_count_matrix(self):
        """Convert sparse word counts to a matrix format"""
        if self.lyrics_df is None:
            self.load_lyrics()
        
        # Create a matrix where rows are songs and columns are words
        word_count_matrix = np.zeros((len(self.lyrics_df), len(self.top_words)))
        
        for i, row in enumerate(self.lyrics_df['word_counts']):
            for word, count in row.items():
                if word in self.top_words:
                    col_idx = self.top_words.index(word)
                    word_count_matrix[i, col_idx] = count
        
        self.word_counts = pd.DataFrame(word_count_matrix, columns=self.top_words)
        return self.word_counts
    
    def calculate_sentiment(self, merged_data=None):
        """Calculate sentiment scores for each song based on its word counts"""
        if merged_data is None:
            merged_data = self.merge_data()
        
        sentiment_scores = []
        
        for _, row in merged_data.iterrows():
            word_counts = row['word_counts']
            
            # Convert word counts to a pseudo-text for VADER
            # Repeat each word by its count to give it proper weight
            pseudo_text = ' '.join([f"{word} " * count for word, count in word_counts.items()])
            
            # Get sentiment scores
            sentiment = self.sentiment_analyzer.polarity_scores(pseudo_text)
            sentiment_scores.append({
                'track_id': row['track_id'],
                'artist': row['msd_artist_name'],
                'title': row['msd_title'],
                'negative': sentiment['neg'],
                'neutral': sentiment['neu'],
                'positive': sentiment['pos'],
                'compound': sentiment['compound']
            })
        
        sentiment_df = pd.DataFrame(sentiment_scores)
        return sentiment_df

    def load_genre_data(self, genre_file):
        """Load genre assignments for tracks"""
        print("Loading genre data...")
        genre_data = {}
        
        with open(genre_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('#'):  # Skip comment lines
                    continue
                parts = line.strip().split('\t')
                if len(parts) == 2:
                    track_id, genre = parts
                    genre_data[track_id] = genre
        
        print(f"Loaded genre data for {len(genre_data)} songs.")
        return genre_data
    
    def merge_genre_data(self, genre_data, merged_data=None):
        """Add genre information to the merged dataset"""
        if merged_data is None:
            merged_data = self.merge_data()
        
        # Create a new column for genre
        merged_data['genre'] = merged_data['track_id'].map(genre_data)
        
        # Filter out tracks without genre information
        merged_data_with_genre = merged_data.dropna(subset=['genre'])
        print(f"Merged data contains {len(merged_data_with_genre)} songs with genre information.")
        
        return merged_data_with_genre
    
    def prepare_features_for_classification(self, data=None, remove_words=None):
        """Build a sparse feature matrix from the per-song word counts.

        Parameters
        ----------
        data : pandas.DataFrame, optional
            Frame with a ``word_counts`` column of dicts. Defaults to
            ``merge_data()``.
        remove_words : list of str, optional
            Words dropped from every song before vectorizing. Defaults to
            ``['i', 'the', 'a', 'to', 'you', 'and', 'me']``.

        Returns
        -------
        tuple
            ``(X, vectorizer)``: the matrix produced by a ``DictVectorizer``
            fitted on the filtered counts, and that fitted vectorizer.
        """
        from sklearn.feature_extraction import DictVectorizer

        if data is None:
            data = self.merge_data()

        excluded = ['i', 'the', 'a', 'to', 'you', 'and', 'me'] if remove_words is None else remove_words

        # One filtered {word: count} dict per song
        features = [
            {
                word: count
                for word, count in row['word_counts'].items()
                if word not in excluded
            }
            for _, row in data.iterrows()
        ]

        vectorizer = DictVectorizer(sparse=True)
        X = vectorizer.fit_transform(features)

        return X, vectorizer
    
    def train_genre_classifier(self, X_train, y_train):
        """Fit and return a scaling + linear-kernel SVM pipeline.

        The pipeline has two named steps: ``'scaler'`` (a ``StandardScaler``
        that does not subtract the mean, for sparse data) and ``'svm'`` (an
        ``SVC`` with a linear kernel, ``C=1.0`` and ``random_state=42``).
        """
        from sklearn.svm import SVC
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        # with_mean=False: don't subtract the mean for sparse data
        scaling_step = ('scaler', StandardScaler(with_mean=False))
        svm_step = ('svm', SVC(kernel='linear', C=1.0, random_state=42))
        pipeline = Pipeline([scaling_step, svm_step])

        print("Training SVM classifier for genre prediction...")
        pipeline.fit(X_train, y_train)

        return pipeline
    
    def evaluate_genre_classifier(self, model, X_test, y_test):
        """Evaluate the genre classifier and draw its confusion matrix.

        Prints the accuracy and the classification report, then draws the
        confusion matrix as a heatmap on a new figure.

        Parameters
        ----------
        model : fitted estimator exposing ``predict``
        X_test : feature matrix accepted by ``model.predict``
        y_test : iterable of true labels

        Returns
        -------
        tuple
            ``(accuracy, report, plt)`` where ``plt`` is the pyplot module
            holding the confusion-matrix figure as its current figure.
        """
        from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
        import seaborn as sns

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {accuracy:.4f}")

        # Classification report
        report = classification_report(y_test, y_pred)
        print("\nClassification Report:")
        print(report)

        # The matrix axes must cover every label seen in EITHER vector. Deriving
        # the tick labels from y_test alone mislabels the axes as soon as the
        # model predicts a class absent from y_test, so fix one label list and
        # use it both for the matrix and for the ticks.
        labels = sorted(set(y_test) | set(y_pred))
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=labels,
                   yticklabels=labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')

        return accuracy, report, plt
    
    def load_test_data(self, test_file):
        """Load and parse the test dataset"""
        print("Loading test data...")
        test_words = []
        test_data = []
        
        with open(test_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('#'):  # Skip comment lines
                    continue
                elif line.startswith('%'):  # Extract words list
                    test_words = line[1:].strip().split(',')
                else:
                    # Parse the word count line
                    parts = line.strip().split(',')
                    if len(parts) >= 2:
                        track_id = parts[0]
                        mxm_track_id = parts[1]
                        
                        # Create a sparse word count dictionary
                        word_counts = {}
                        for item in parts[2:]:
                            if ':' in item:
                                idx, count = item.split(':')
                                # Convert to 0-based index and ensure it's within range
                                word_idx = int(idx) - 1  # 1-based to 0-based
                                if word_idx < len(test_words):
                                    word_counts[test_words[word_idx]] = int(count)
                        
                        test_data.append({
                            'track_id': track_id,
                            'mxm_track_id': mxm_track_id,
                            'word_counts': word_counts
                        })
        
        test_df = pd.DataFrame(test_data)
        print(f"Loaded test data for {len(test_df)} songs with {len(test_words)} vocabulary words.")
        
        # Ensure test_words match training words by appending any missing ones
        if len(test_words) != len(self.top_words) or test_words != self.top_words:
            print("Warning: Test vocabulary differs from training vocabulary.")
            
        return test_df, test_words
    
    def predict_genres(self, model, vectorizer, test_df, genre_data=None, remove_words=None):
        """Predict genres for the test dataset"""
        if remove_words is None:
            remove_words = ['i', 'the', 'a', 'to', 'you', 'and', 'me']
        
        # Prepare test features
        test_features = []
        for _, row in test_df.iterrows():
            word_counts = row['word_counts']
            # Remove specified words
            filtered_counts = {word: count for word, count in word_counts.items() 
                             if word not in remove_words}
            test_features.append(filtered_counts)
        
        # Transform using the same vectorizer as training
        X_test = vectorizer.transform(test_features)
        
        # Make predictions
        print("Predicting genres for test data...")
        predicted_genres = model.predict(X_test)
        
        # Add predictions to test data
        predictions_df = test_df.copy()
        predictions_df['predicted_genre'] = predicted_genres
        
        # If ground truth is available, add it
        if genre_data is not None:
            predictions_df['actual_genre'] = predictions_df['track_id'].map(genre_data)
            
            # Calculate accuracy for test data with known genres
            mask = ~predictions_df['actual_genre'].isna()
            if mask.sum() > 0:
                accuracy = (predictions_df.loc[mask, 'predicted_genre'] == 
                           predictions_df.loc[mask, 'actual_genre']).mean()
                print(f"Test accuracy: {accuracy:.4f} (for {mask.sum()} songs with known genres)")
        
        return predictions_df
    
    def reduce_dimensions(self, method='pca', n_components=2):
        """Project the standardized word-count matrix to ``n_components`` dims.

        Builds the word-count matrix first if needed. ``method`` is matched
        case-insensitively against ``'pca'`` and ``'tsne'`` (the latter with
        ``random_state=42``); any other value raises ``ValueError``.

        Returns the array produced by the reducer's ``fit_transform``.
        """
        if self.word_counts is None:
            self.create_word_count_matrix()

        # Standardize every word column before reducing
        scaled_features = StandardScaler().fit_transform(self.word_counts)

        # Deferred constructors, so only the requested reducer is instantiated
        reducer_factories = {
            'pca': lambda: PCA(n_components=n_components),
            'tsne': lambda: TSNE(n_components=n_components, random_state=42),
        }
        make_reducer = reducer_factories.get(method.lower())
        if make_reducer is None:
            raise ValueError("Method must be either 'pca' or 'tsne'")

        return make_reducer().fit_transform(scaled_features)
    
    def plot_most_common_words(self, top_n=20):
        """Draw a horizontal bar chart of the ``top_n`` most frequent words.

        Builds the word-count matrix first if needed, sums each word over all
        songs and plots the largest totals. Returns the pyplot module with the
        new figure as its current figure.
        """
        if self.word_counts is None:
            self.create_word_count_matrix()

        # Per-word totals, largest first, truncated to the requested number
        leaders = (
            self.word_counts
            .sum()
            .sort_values(ascending=False)
            .head(top_n)
        )

        plt.figure(figsize=(12, 8))
        sns.barplot(x=leaders.values, y=leaders.index)
        plt.title(f'Top {top_n} Most Common Words in Lyrics')
        plt.xlabel('Count')
        plt.ylabel('Word')
        plt.tight_layout()
        return plt
    
    def plot_sentiment_distribution(self, sentiment_df=None):
        """Plot histograms (with KDE) of the four sentiment scores.

        ``sentiment_df`` defaults to ``calculate_sentiment()``. The 2x2 grid
        shows, in row-major order, the negative, neutral, positive and compound
        scores. Returns the ``(fig, axs)`` pair from ``plt.subplots``.
        """
        if sentiment_df is None:
            sentiment_df = self.calculate_sentiment()

        fig, axs = plt.subplots(2, 2, figsize=(14, 10))

        # (column, colour, title) for each panel, in row-major grid order
        panels = [
            ('negative', 'red', 'Negative Sentiment Distribution'),
            ('neutral', 'gray', 'Neutral Sentiment Distribution'),
            ('positive', 'green', 'Positive Sentiment Distribution'),
            ('compound', 'blue', 'Compound Sentiment Distribution'),
        ]
        for panel_ax, (column, colour, heading) in zip(axs.flat, panels):
            sns.histplot(sentiment_df[column], kde=True, ax=panel_ax, color=colour)
            panel_ax.set_title(heading)

        plt.tight_layout()
        return fig, axs
    
    def analyze_artist(self, artist_name):
        """Analyze lyrics patterns for a specific artist"""
        merged_data = self.merge_data()
        
        # Filter for the artist
        artist_data = merged_data[merged_data['msd_artist_name'].str.lower() == artist_name.lower()]
        
        if len(artist_data) == 0:
            print(f"No data found for artist: {artist_name}")
            return None
        
        print(f"Analyzing {len(artist_data)} songs by {artist_name}")
        
        # Calculate sentiment for the artist's songs
        artist_sentiment = self.calculate_sentiment(artist_data)
        
        # Get most common words for this artist
        all_words = Counter()
        for word_counts in artist_data['word_counts']:
            all_words.update(word_counts)
        
        return {
            'artist_data': artist_data,
            'sentiment': artist_sentiment,
            'common_words': all_words
        }
    
    def compare_genres(self, genre_artist_mapping):
        """
        Compare lyrics and sentiment patterns across different genres
        
        Parameters:
        -----------
        genre_artist_mapping : dict
            Dictionary mapping genre names to lists of artists in that genre
        """
        merged_data = self.merge_data()
        genre_stats = {}
        
        for genre, artists in genre_artist_mapping.items():
            # Get data for all artists in this genre
            genre_data = merged_data[merged_data['msd_artist_name'].str.lower().isin([a.lower() for a in artists])]
            
            if len(genre_data) == 0:
                print(f"No data found for genre: {genre}")
                continue
                
            print(f"Analyzing {len(genre_data)} songs in the {genre} genre")
            
            # Calculate sentiment 
            genre_sentiment = self.calculate_sentiment(genre_data)
            
            # Get most common words for this genre
            genre_words = Counter()
            for word_counts in genre_data['word_counts']:
                genre_words.update(word_counts)
            
            genre_stats[genre] = {
                'data': genre_data,
                'sentiment': genre_sentiment,
                'common_words': genre_words
            }
        
        return genre_stats
    
    def plot_sentiment_by_genre(self, genre_stats):
        """Plot comparison of sentiment across genres"""
        if not genre_stats:
            return None
        
        # Prepare data for plotting
        genre_names = list(genre_stats.keys())
        sentiment_means = {
            'negative': [genre_stats[g]['sentiment']['negative'].mean() for g in genre_names],
            'neutral': [genre_stats[g]['sentiment']['neutral'].mean() for g in genre_names],
            'positive': [genre_stats[g]['sentiment']['positive'].mean() for g in genre_names],
            'compound': [genre_stats[g]['sentiment']['compound'].mean() for g in genre_names]
        }
        
        # Create bar plot
        fig, ax = plt.subplots(figsize=(12, 8))
        x = np.arange(len(genre_names))
        width = 0.2
        
        ax.bar(x - width*1.5, sentiment_means['negative'], width, label='Negative', color='red', alpha=0.7)
        ax.bar(x - width*0.5, sentiment_means['neutral'], width, label='Neutral', color='gray', alpha=0.7)
        ax.bar(x + width*0.5, sentiment_means['positive'], width, label='Positive', color='green', alpha=0.7)
        ax.bar(x + width*1.5, sentiment_means['compound'], width, label='Compound', color='blue', alpha=0.7)
        
        ax.set_xticks(x)
        ax.set_xticklabels(genre_names, rotation=45, ha='right')
        ax.set_ylabel('Mean Sentiment Score')
        ax.set_title('Sentiment Analysis by Genre')
        ax.legend()
        
        plt.tight_layout()
        return plt
        
    def run_full_analysis(self, genre_file='genre_assignment.txt', test_file='mxm_dataset_test.txt'):
        """Run a complete analysis pipeline on the dataset with genre prediction.

        Steps: merge the datasets, build the word-count matrix, score
        sentiment, attach genres, train and evaluate the genre classifier on an
        80/20 stratified split, predict genres for the test file and build the
        plots.

        Parameters
        ----------
        genre_file : str
            Path to the tab-separated track-id/genre file.
        test_file : str
            Path to the test lyrics file.

        Returns
        -------
        dict
            Keys: ``merged_data``, ``merged_data_with_genre``, ``sentiment``,
            ``genre_model``, ``vectorizer``, ``validation_accuracy``,
            ``validation_report``, ``test_predictions``, ``plots`` and
            ``timers`` (elapsed seconds for merging, sentiment scoring and
            training, each wrapped in a one-element list).
        """
        from sklearn.model_selection import train_test_split

        # Load and merge data
        start_merge_data = timer()
        merged_data = self.merge_data()
        end_merge_data = timer()

        # Create word count matrix (populates self.word_counts)
        self.create_word_count_matrix()

        # Calculate sentiment
        start_sentiment = timer()
        sentiment_df = self.calculate_sentiment(merged_data)
        end_sentiment = timer()

        # Load genre data and keep only the songs that have a genre
        genre_data = self.load_genre_data(genre_file)
        merged_data_with_genre = self.merge_genre_data(genre_data, merged_data)

        # Prepare features for classification
        remove_words = ['i', 'the', 'a', 'to', 'you', 'and', 'me']
        X, vectorizer = self.prepare_features_for_classification(merged_data_with_genre, remove_words)
        y = merged_data_with_genre['genre']

        # Split data for training and evaluation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Train genre classifier
        start_svm_train = timer()
        model = self.train_genre_classifier(X_train, y_train)
        end_svm_train = timer()

        # Evaluate on validation set
        accuracy, report, confusion_plot = self.evaluate_genre_classifier(model, X_val, y_val)

        # Load test data and predict its genres
        test_df, test_words = self.load_test_data(test_file)
        predictions_df = self.predict_genres(model, vectorizer, test_df, genre_data, remove_words)

        # Generate plots
        plots = {
            'sentiment_distribution': self.plot_sentiment_distribution(sentiment_df),
            'confusion_matrix': confusion_plot
        }
        # plot_genre_visualization is not defined on this class. Calling it
        # unconditionally raised AttributeError only after the expensive training
        # step had finished, discarding every result. Use it only if it exists.
        genre_plotter = getattr(self, 'plot_genre_visualization', None)
        if callable(genre_plotter):
            plots['genre_visualization'] = genre_plotter(merged_data_with_genre)
        else:
            print("Skipping genre visualization: plot_genre_visualization is not defined.")

        timers = {
            'merge_data': [end_merge_data - start_merge_data],
            'sentiment': [end_sentiment - start_sentiment],
            'training': [end_svm_train - start_svm_train]
        }

        return {
            'merged_data': merged_data,
            'merged_data_with_genre': merged_data_with_genre,
            'sentiment': sentiment_df,
            'genre_model': model,
            'vectorizer': vectorizer,
            'validation_accuracy': accuracy,
            'validation_report': report,
            'test_predictions': predictions_df,
            'plots': plots,
            # The timings were computed but never returned before.
            'timers': timers
        }

In [None]:
# Initialize the analyzer with the paths to the matches file and the training lyrics file
analyzer = LyricsAnalyzer('mxm_779k_matches.txt', 'mxm_dataset_train.txt')

# Run the full pipeline with the default genre and test file names
results = analyzer.run_full_analysis()

# Show plots
# 'sentiment_distribution' holds a (fig, axs) tuple, hence the [0] to reach the figure
#results['plots']['common_words'].show()
results['plots']['sentiment_distribution'][0].show()
#results['plots']['clusters'].show()

# Example of comparing genres (disabled here)
# genre_mapping = {
#     'Rock': ['Queen', 'The Beatles', 'Led Zeppelin', 'Pink Floyd', 'Rolling Stones'],
#     'Pop': ['Michael Jackson', 'Madonna', 'Prince'],
#     'Rap': ['Eminem', 'Jay-Z', 'Tupac']
# }

# genre_stats = analyzer.compare_genres(genre_mapping)
# analyzer.plot_sentiment_by_genre(genre_stats).show()

Loading matches data...
Loaded 779056 song matches.
Loading lyrics data...
Loaded lyrics data for 210519 songs with 5000 vocabulary words.
Merged data contains 265353 songs.
Loading genre data...
Loaded genre data for 422714 songs.
Merged data contains 137227 songs with genre information.
Training SVM classifier for genre prediction...


In [None]:
# Elapsed times recorded by run_full_analysis(); .get() avoids a KeyError when
# the results dictionary carries no 'timers' entry.
results.get('timers')

In [None]:
# Display the whole results dictionary (DataFrames, fitted model, vectorizer, plots)
results

In [None]:
# Lyrics rows joined with their metadata from the matches file
results['merged_data']

In [None]:
# Per-song sentiment scores (negative / neutral / positive / compound)
results['sentiment']

In [None]:
# run_full_analysis() returns no 'clusters' entry, so indexing it directly raises
# KeyError; summarise it only when a clustering step has actually provided one.
results['clusters'].describe(include='all') if 'clusters' in results else None

In [None]:
# Example of comparing genres
# Each "genre" here is a single artist, so every key maps to a one-element list
# holding that same artist name.
genre_mapping = {
    artist: [artist]
    for artist in [
        'ABBA',
        'Nirvana',
        'Taylor Swift',
        'Michael Jackson',
        'Eminem',
        'Bob Dylan',
        'Joy Division',
        'Radiohead',
        'The Smiths',
        'Lady Gaga',
        'Linkin Park',
        'Travis Scott',
        'Kendrick Lamar',
    ]
}

genre_stats = analyzer.compare_genres(genre_mapping)
analyzer.plot_sentiment_by_genre(genre_stats).show()

# Problem description

# Workbook Setup

# Pre-processing

# Analysis