In [None]:
import pandas as pd

# Read the dataset CSV into a DataFrame; later cells work from this frame.
df_import = pd.read_csv('data/final_dataset.csv')

# Print summary statistics
def summarize_dataset(df):
    """Print a plain-text summary of library membership for a track dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``in_library`` column. The code counts rows equal to
        1 as "in library" and rows equal to 0 as "not in library".

    Returns
    -------
    None
        Everything is written to stdout.

    Raises
    ------
    KeyError
        If ``df`` has no ``in_library`` column.
    """
    in_library = df['in_library']
    n_in_library = in_library.sum()
    # mean() of an empty column is NaN and would print "nan%"; report 0 instead.
    pct_in_library = in_library.mean() * 100 if len(df) else 0.0

    rule = "=" * 60
    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    print(f"Total tracks in dataset: {len(df)}")
    print(f"Tracks in your library: {n_in_library}")
    print(f"Tracks not in your library: {(in_library == 0).sum()}")
    print(f"Percentage in library: {pct_in_library:.2f}%")
    print(rule)

    # Show sample of data
    print("\nSample of the final dataset:")
    print(df.head(10))
    print("\nColumns:", df.columns.tolist())
    # Only announce the library sample when there are rows to show, so the
    # output never ends in a dangling header.
    if n_in_library > 0:
        print("\nLibrary tracks sample:")
        print(df[in_library == 1].head(5))

# Print the membership summary and data samples for the frame loaded above.
summarize_dataset(df_import)

In [44]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Show the full column index of the imported frame before trimming it.
print(df_import.columns)

# Working frame for the rest of the notebook: everything except 'id' and 'mode'.
df = df_import.drop(['id', 'mode'], axis=1)

# Last expression of the cell, so the per-column dtypes are rendered as output.
df.dtypes

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'in_library'],
      dtype='object')


valence             float64
year                  int64
acousticness        float64
artists              object
danceability        float64
duration_ms           int64
energy              float64
explicit              int64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
name                 object
popularity            int64
release_date         object
speechiness         float64
tempo               float64
in_library            int64
dtype: object

Exploratory Data Analysis

In [None]:

# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# The charts below read from `df`, the frame built in the previous cell
# (df_import with the 'id' and 'mode' columns dropped).

# ============================================================================
# CHART 1: CLASS IMBALANCE - PIE CHART (High Quality)
# ============================================================================

fig, ax = plt.subplots(figsize=(10, 8))

# Get counts in a fixed class order: 0 (not in library) first, 1 (in library)
# second. value_counts() orders by frequency, so without the reindex the
# positional labels/colors/explode below would be attached to the wrong slice
# whenever class 1 is the majority, and looking up an absent class would raise
# KeyError.
counts = df['in_library'].value_counts().reindex([0, 1], fill_value=0)
n_not_in_library = counts.loc[0]
n_in_library = counts.loc[1]
# Guard the ratio so an empty "in library" class does not divide by zero.
imbalance_ratio = n_not_in_library / n_in_library if n_in_library else float('inf')

labels = ['Not in Library', 'In Library']
colors = ['#ff6b6b', '#4ecdc4']
explode = (0, 0.1)  # Explode the "In Library" slice for visibility

# Create pie chart
wedges, texts, autotexts = ax.pie(counts,
                                  labels=labels,
                                  autopct='%1.2f%%',
                                  colors=colors,
                                  startangle=90,
                                  explode=explode,
                                  textprops={'fontsize': 14, 'weight': 'bold'},
                                  pctdistance=0.85)

# Make percentage text larger and white
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(16)
    autotext.set_weight('bold')

# Make label text larger
for text in texts:
    text.set_fontsize(16)
    text.set_weight('bold')

ax.set_title('Class Imbalance', fontsize=20, fontweight='bold', pad=20)

# Add text box with actual counts
textstr = (f'Not in Library: {n_not_in_library:,}\n'
           f'In Library: {n_in_library:,}\n'
           f'Ratio: {imbalance_ratio:.1f}:1')
props = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
ax.text(1.3, 0.5, textstr, transform=ax.transAxes, fontsize=12,
        verticalalignment='center', bbox=props)

plt.tight_layout()
plt.savefig('presentation_chart1_class_imbalance.png', dpi=300, bbox_inches='tight')
plt.show()

# Same numbers as the text box, as plain text; percentages are relative to
# the total number of rows in df.
print("Class distribution:")
print(f"Not in Library: {n_not_in_library:,} ({n_not_in_library/len(df)*100:.2f}%)")
print(f"In Library: {n_in_library:,} ({n_in_library/len(df)*100:.2f}%)")
print(f"Imbalance ratio: {imbalance_ratio:.1f}:1")

# ============================================================================
# CHART 2: CORRELATION HEATMAP (High Quality)
# ============================================================================

# Columns that go into the correlation matrix (the target is included so its
# row/column shows up in the heatmap as well).
numeric_features = [
    'valence', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'duration_ms', 'popularity', 'in_library',
]

# Pairwise correlations between the selected columns
correlation_matrix = df.loc[:, numeric_features].corr()

fig, ax = plt.subplots(figsize=(14, 12))

# Heatmap appearance, collected in one place
heatmap_options = dict(
    annot=True,            # write the coefficient into each cell
    fmt='.2f',             # two decimal places
    cmap='coolwarm',       # diverging color scheme
    center=0,              # colormap midpoint at zero correlation
    square=True,           # square cells
    linewidths=1,          # separator lines between cells
    cbar_kws={"shrink": 0.8, "label": "Correlation Coefficient"},
    annot_kws={"fontsize": 10, "weight": "bold"},
    vmin=-1, vmax=1,       # full correlation range on the color scale
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title('Correlation Matrix of Audio Features',
             fontsize=18, fontweight='bold', pad=20)

# Slanted x labels / horizontal y labels for readability
plt.xticks(rotation=45, ha='right', fontsize=11)
plt.yticks(rotation=0, fontsize=11)

plt.tight_layout()
plt.savefig('presentation_chart2_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Print key correlations
print("\n" + "="*80)
print("KEY CORRELATIONS:")
print("="*80)

# Get correlations with target variable
target_corr = correlation_matrix['in_library'].sort_values(ascending=False)
print("\nCorrelations with 'in_library' (target variable):")
print(target_corr)

# Get strong feature-feature correlations
print("\nStrongest feature-feature correlations:")
# Leave the target out: pairs involving 'in_library' are not feature-feature
# relationships and are already listed above.
feature_corr = correlation_matrix.drop(index='in_library', columns='in_library')
# Keep only the strict upper triangle so each pair appears once and the
# diagonal (always 1.0) is excluded.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
# Rank by absolute value so strong negative correlations are reported too;
# a signed descending sort would push them to the bottom of the list.
# dropna() removes the masked-out cells regardless of how stack() treats NaN.
high_corr = (feature_corr.where(mask)
             .stack()
             .dropna()
             .sort_values(key=lambda s: s.abs(), ascending=False))
print(high_corr.head(10))

# ============================================================================
# CHART 3: VIOLIN PLOTS - AUDIO FEATURES BY TARGET (High Quality)
# ============================================================================

audio_features = [
    'valence', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
]

# 3x3 grid, flattened so each feature gets one axes in reading order
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 14))
axes = axes.flatten()

# Color palette
colors = ['#ff6b6b', '#4ecdc4']  # Red for not liked, teal for liked

# Row selectors for the two classes
is_not_liked = df['in_library'] == 0
is_liked = df['in_library'] == 1

for ax, feature in zip(axes, audio_features):
    # Per-class values of this feature, without missing entries
    df_not_liked = df.loc[is_not_liked, feature].dropna()
    df_liked = df.loc[is_liked, feature].dropna()

    # One violin per class, side by side
    parts = ax.violinplot(
        [df_not_liked, df_liked],
        positions=[1, 2],
        showmeans=True,
        showmedians=True,
        widths=0.7,
    )

    # Fill each violin body with its class color
    for body, color in zip(parts['bodies'], colors):
        body.set_facecolor(color)
        body.set_alpha(0.7)
        body.set_edgecolor('black')
        body.set_linewidth(1.5)

    # Draw whichever of the bar/min/max/median/mean lines are present in black
    line_parts = [parts[key]
                  for key in ('cbars', 'cmins', 'cmaxes', 'cmedians', 'cmeans')
                  if key in parts]
    for vp in line_parts:
        vp.set_edgecolor('black')
        vp.set_linewidth(2)

    # Titles, labels and grid
    feature_label = feature.capitalize()
    ax.set_title(f'{feature_label}: Liked vs Not Liked',
                 fontweight='bold', fontsize=12)
    ax.set_ylabel(feature_label, fontsize=11)
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['Not Liked', 'Liked'], fontsize=11, fontweight='bold')
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    # Class means in a small box in the top-left corner of the panel
    mean_not_liked = df_not_liked.mean()
    mean_liked = df_liked.mean()
    mean_box_text = f'Mean:\nNot Liked: {mean_not_liked:.2f}\nLiked: {mean_liked:.2f}'
    ax.text(0.02, 0.98, mean_box_text,
            transform=ax.transAxes, fontsize=9, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.7))

plt.suptitle('Audio Features by Target Variable',
             fontsize=20, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('presentation_chart3_violin_plots.png', dpi=300, bbox_inches='tight')
plt.show()

# Print summary statistics
print("\n" + "="*80)
print("FEATURE COMPARISON: LIKED vs NOT LIKED")
print("="*80)

# Filter each class once and compute all per-feature means/stds up front
# instead of re-filtering the frame four times per feature.
not_liked_rows = df.loc[df['in_library'] == 0, audio_features]
liked_rows = df.loc[df['in_library'] == 1, audio_features]
not_liked_means, not_liked_stds = not_liked_rows.mean(), not_liked_rows.std()
liked_means, liked_stds = liked_rows.mean(), liked_rows.std()

for feature in audio_features:
    not_liked_mean = not_liked_means[feature]
    not_liked_std = not_liked_stds[feature]
    liked_mean = liked_means[feature]
    liked_std = liked_stds[feature]
    difference = liked_mean - not_liked_mean

    # Relative change is measured against the magnitude of the baseline mean.
    # Dividing by the signed mean flips the sign of the percentage for
    # features whose mean is negative (e.g. a negative loudness mean), and a
    # zero baseline has no meaningful relative change.
    if not_liked_mean != 0:
        relative_change = f"{difference / abs(not_liked_mean) * 100:+.1f}%"
    else:
        relative_change = "n/a"

    print(f"\n{feature.upper()}:")
    print(f"  Not Liked - Mean: {not_liked_mean:.3f}, Std: {not_liked_std:.3f}")
    print(f"  Liked     - Mean: {liked_mean:.3f}, Std: {liked_std:.3f}")
    print(f"  Difference: {difference:.3f} ({relative_change})")

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

random_state = 42

#feature lists for preprocessing

continuous_features = [
    'valence', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'duration_ms']
integer_features = [
    'year', 'key', 'popularity', 'explicit']

num_features = continuous_features + integer_features

#check if there are missing values
# A bare `df.isnull().values.any()` in the middle of a cell is evaluated and
# thrown away, so report the affected columns explicitly instead.
missing_per_column = df.isnull().sum()
if missing_per_column.any():
    print("Warning: missing values found in columns:")
    print(missing_per_column[missing_per_column > 0])

#prepares feature matrix
X = df[num_features]
#prepares target variable
y = df['in_library']

#split dataset into train, validation, and test sets with stratification. 60:20:20 ratio
# First split keeps 60% for training; the remaining 40% is then halved into
# validation and test. Both splits stratify on the target.
X_train, X_other, y_train, y_other = train_test_split(X,y,train_size = 0.6,stratify=y,random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_other,y_other,train_size = 0.5,stratify=y_other,random_state=random_state)

#standardize numerical features
# The scaler is fitted on the training split only and then applied unchanged
# to validation and test.
scaler = StandardScaler()
scaler.fit(X_train)


def _scale_to_frame(fitted_scaler, features):
    """Transform `features` with `fitted_scaler`, keeping column names and row index."""
    return pd.DataFrame(
        fitted_scaler.transform(features),
        columns=features.columns,
        index=features.index
    )


X_train_scaled = _scale_to_frame(scaler, X_train)
X_val_scaled = _scale_to_frame(scaler, X_val)
X_test_scaled = _scale_to_frame(scaler, X_test)
