In [None]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import os 

In [None]:
# Load the extracted features and keep only the feature columns.
df = pd.read_csv('features_20250725.csv', header=0)

# 'tempo' is read as text; keep the first decimal number found in each value.
tempo_text = df['tempo'].str.extract(r'(\d+\.?\d*)', expand=False)
df['tempo'] = tempo_text.astype(float)

# Identifier, timing and label columns that are not features.
excluded_cols = ['audio_file', 'transcript_id', 'start_time', 'end_time',
                 'duration', 'confidence', 'target']
feature_cols = [name for name in df.columns if name not in excluded_cols]

feature_data = df.loc[:, feature_cols].copy()
feature_data.head()

In [None]:
# Correlation heatmap + report for the full feature set.
# NOTE(review): generate_corr_matix is defined in a cell further down this notebook, so on a
# fresh kernel (Restart & Run All) this call raises NameError unless that cell has been run first.
generate_corr_matix(feature_data)

In [None]:
# First pruning pass: columns removed before re-running the correlation report
# (presumably chosen from the previous report's output; confirm).
correlated_features = [
    'spectral_rolloff_85_mean',
    'chroma_var',
    'amplitude_mean',
    'amplitude_std',
    'spectral_rolloff_95_mean',
    'dynamic_range',
    'spectral_rolloff_mean',
]
feature_data_reduced = feature_data.drop(columns=correlated_features).copy()

generate_corr_matix(feature_data_reduced)

In [None]:
def generate_corr_matix(df, threshold=0.8, top_n=10, save_path='feature_correlations.png'):
    """Plot a correlation heatmap for ``df`` and print its most correlated column pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are analysed.
    threshold : float, default 0.8
        A pair is reported when the absolute value of its correlation is strictly
        greater than this value.
    top_n : int, default 10
        Maximum number of pairs (strongest first) included in the printed report.
    save_path : str or None, default 'feature_correlations.png'
        Where the heatmap is written. Pass ``None`` to skip saving, or a different
        path so repeated calls do not overwrite each other's figure.

    Returns
    -------
    None
        Output is the displayed/saved figure and the printed report.
    """
    correlation_matrix = df.corr()

    # Plot heatmap. The mask hides the upper triangle and the diagonal, so each
    # pair is drawn once.
    plt.figure(figsize=(16, 14))
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    sns.heatmap(correlation_matrix, mask=mask, annot=False, cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.5})
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    # Find highly correlated features: visit each unordered pair (i < j) once.
    columns = correlation_matrix.columns
    high_corr = []
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            corr_val = correlation_matrix.iloc[i, j]
            # A NaN correlation compares False here and is skipped.
            if abs(corr_val) > threshold:
                high_corr.append((columns[i], columns[j], corr_val))

    if high_corr:
        high_corr_features = []
        print(f"\nHighly correlated features (|r| > {threshold}):")
        strongest = sorted(high_corr, key=lambda pair: abs(pair[2]), reverse=True)[:top_n]
        for feat1, feat2, corr in strongest:
            # \u2194 is the left-right arrow; written as an escape so the source
            # stays ASCII and cannot be mangled by a wrong file encoding.
            print(f"  {feat1} \u2194 {feat2}: {corr:.3f}")
            # A feature can be the second member of several pairs; list it once
            # so the printed list can be pasted straight into a drop list.
            if feat2 not in high_corr_features:
                high_corr_features.append(feat2)

        print(high_corr_features)

In [None]:
# Second pruning pass. Each column is listed exactly once, matching the
# de-duplicated correlated_features_v2 list used in the model-preparation cell.
correlated_features_v2 = [
    'poly_features_mean',
    'zcr_std',
    'beats_per_second',
    'poly_features_std',
    'amplitude_min',
    'mfcc_2_mean',
    'spectral_flatness_std',
    'rms_std',
]
feature_data_reduced_v2 = feature_data_reduced.drop(columns=correlated_features_v2).copy()

generate_corr_matix(feature_data_reduced_v2)

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, precision_recall_curve, PrecisionRecallDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import xgboost as xgb 
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from joblib import dump, load 
from datetime import datetime


# Build the modelling dataset: load features, drop non-feature and redundant
# columns, down-sample content segments, then make a stratified train/test split.
RANDOM_STATE = 10        # one seed for both the down-sampling and the split
NEGATIVES_PER_AD = 4     # content segments kept per ad segment

print('reading_csv...')
df = pd.read_csv("features_20250725.csv", header=0)
df['is_ad'] = (df['target'] == 'ad').astype(int)  # binary label: 1 = ad, 0 = anything else

correlated_features = ['spectral_rolloff_85_mean', 'chroma_var', 'amplitude_mean', 'amplitude_std', 'spectral_rolloff_95_mean', 'dynamic_range', 'spectral_rolloff_mean']
correlated_features_v2 = ['poly_features_mean', 'zcr_std', 'beats_per_second', 'poly_features_std', 'amplitude_min', 'mfcc_2_mean', 'spectral_flatness_std', 'rms_std']

non_features = ['audio_file','transcript_id','start_time','end_time','duration','confidence','target']
features_to_drop = correlated_features + correlated_features_v2

# Single non-mutating drop instead of two in-place drops.
df = df.drop(columns=non_features + features_to_drop)

# 'tempo' is read as text; keep the first decimal number found in each value.
df['tempo'] = df['tempo'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)

df_targets = df[df['is_ad'] == 1].copy()
df_content = df[df['is_ad'] == 0]
# Cap the sample size: sample(n=...) without replacement raises if n exceeds the
# number of available rows. Seeding makes the down-sampled set reproducible.
n_non_targets = min(len(df_content), len(df_targets) * NEGATIVES_PER_AD)
df_non_targets = df_content.sample(n=n_non_targets, random_state=RANDOM_STATE).copy()

df_model = pd.concat([df_targets, df_non_targets], axis=0)

print('splitting into X and y')
y = df_model['is_ad']

X = df_model.drop(columns=['is_ad'])

print(f"{len(X)} samples in loaded dataset")
print(f"{sum(y)} ads; {len(y) - sum(y)} content segments")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

reading_csv...
splitting into X and y
1815 samples in loaded dataset
363 ads; 1452 content segments


In [32]:
# Find the threshold with maximum recall among those whose precision >= target_precision.
target_precision = 0.85

# NOTE(review): precision / recall / thresholds are computed in a cell not shown here,
# presumably via sklearn's precision_recall_curve. That function returns precision and
# recall arrays one element longer than thresholds (a trailing precision=1, recall=0
# point with no threshold). Only indices that have a threshold are considered, so
# thresholds[best_idx] can never go out of range and the trailing point cannot
# mask the "no threshold qualifies" case.
n_thresholds = len(thresholds)
valid_indices = np.where(precision[:n_thresholds] >= target_precision)[0]

if len(valid_indices) > 0:
    # Get the index with maximum recall among valid precision values
    best_idx = valid_indices[np.argmax(recall[valid_indices])]

    optimal_threshold = thresholds[best_idx]
    best_precision = precision[best_idx]
    best_recall = recall[best_idx]

    print(f"Optimal threshold: {optimal_threshold:.3f}")
    print(f"Precision: {best_precision:.3f}")
    print(f"Recall: {best_recall:.3f}")
    print(f"This means: {best_recall:.1%} of ads will be caught with {best_precision:.1%} precision")

else:
    # Report the configured target rather than a hard-coded percentage.
    print(f"No threshold achieves {target_precision:.0%} precision")
    # Show the maximum precision achievable at an actual threshold
    max_precision = np.max(precision[:n_thresholds]) if n_thresholds else float('nan')
    print(f"Maximum achievable precision: {max_precision:.3f}")

Optimal threshold: 0.210
Precision: 0.853
Recall: 0.795
This means: 79.5% of ads will be caught with 85.3% precision


In [28]:
# Scratch check: where does recall peak among points with precision >= 0.9?
# Note this rebinds valid_indices.
valid_indices = np.nonzero(precision >= 0.9)[0]

# The result is a position *within* valid_indices, not an index into precision/recall.
argmax_idx = recall[valid_indices].argmax()

print(argmax_idx)

0
