<a href="https://colab.research.google.com/github/ganeshgaiy/Robocall-classification/blob/main/robocall_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install librosa



In [3]:
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [4]:
import librosa
import numpy as np

def extract_features(file_path, target_sr=None, n_mfcc=13, fmin=200.0, n_bands=None):
    """Summarise one audio file as a flat vector of time-averaged spectral features.

    The returned vector is the concatenation, in this order, of the
    per-row time means of the MFCC matrix, the spectral-contrast matrix
    and the chroma matrix.

    Parameters
    ----------
    file_path : str or path-like
        Audio file handed to ``librosa.load``.
    target_sr : int or None, default None
        Sample rate passed to ``librosa.load``. ``None`` keeps the file's
        native rate, which is what this function has always done.
    n_mfcc : int, default 13
        Number of MFCC coefficients requested from ``librosa.feature.mfcc``.
    fmin : float, default 200.0
        Starting frequency (Hz) for the first spectral-contrast band.
    n_bands : int or None, default None
        Number of spectral-contrast bands. ``None`` derives it from the
        loaded sample rate as ``floor(log2(nyquist / fmin))``.

    Returns
    -------
    numpy.ndarray
        1-D feature vector.

    Notes
    -----
    With the defaults, ``n_bands`` is computed from each file's own sample
    rate. Files recorded at different rates therefore get different band
    counts, and (librosa documents the contrast output as having
    ``n_bands + 1`` rows) vectors of different lengths whose trailing chroma
    entries no longer line up column-for-column. Pass a fixed ``target_sr``
    and/or ``n_bands`` when the dataset mixes sample rates.
    """
    y, sr = librosa.load(file_path, sr=target_sr)

    # Extract MFCCs and average each coefficient over time
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfccs_mean = np.mean(mfccs, axis=1)

    # Derive the band count only when the caller did not fix it:
    # the number of whole octaves between fmin and the Nyquist frequency.
    if n_bands is None:
        nyquist = sr / 2
        n_bands = int(np.floor(np.log2(nyquist / fmin)))

    # Extract Spectral Contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, fmin=fmin, n_bands=n_bands)
    spectral_contrast_mean = np.mean(spectral_contrast, axis=1)

    # Extract Chroma Features
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)

    # Combine features into a single flat vector
    features = np.hstack([mfccs_mean, spectral_contrast_mean, chroma_mean])

    return features


In [5]:
# Print the kernel's current working directory
print(os.getcwd())


/content


In [None]:
# Directories containing audio files
robocall_dir = '/content/drive/MyDrive/Colab Notebooks/robocall_dataset/dataset/robocall'
normal_call_dir = '/content/drive/MyDrive/Colab Notebooks/robocall_dataset/dataset/normal_call'

# Initialize lists to hold features and labels
features_list = []
labels_list = []

# One pass per class instead of two copy-pasted loops.
# Label convention: 1 = robocall, 0 = normal call.
for class_dir, class_label in ((robocall_dir, 1), (normal_call_dir, 0)):
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the saved CSV identical from run to run.
    for file_name in sorted(os.listdir(class_dir)):
        # Compare the extension case-insensitively so '.WAV' files are not
        # silently left out of the dataset.
        if not file_name.lower().endswith('.wav'):
            continue
        file_path = os.path.join(class_dir, file_name)
        features = extract_features(file_path)
        features_list.append(features)
        labels_list.append(class_label)

# Create DataFrame: one row per audio file, feature columns plus 'label'
df = pd.DataFrame(features_list)
df['label'] = labels_list

# Save to CSV
df.to_csv('/content/drive/MyDrive/Colab Notebooks/robocall_dataset/audio_features.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset: read the saved feature table (feature columns plus
# 'label') back from the CSV on Drive into `df`
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/robocall_dataset/audio_features.csv')



In [None]:
# Count how many rows carry each value of the 'label' column
df['label'].value_counts()

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# printing the whole frame as plain text
df.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# Separate features and labels
X = df.drop(columns=['label'])
y = df['label']
# Shuffle the dataset; rows and labels are permuted together with a fixed seed
X_shuffled, y_shuffled = shuffle(X, y, random_state=42)
# Split the shuffled data (previously the shuffled copies were computed but
# never used). random_state makes the split repeatable across kernel
# restarts; stratify keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X_shuffled,
    y_shuffled,
    test_size=0.2,
    shuffle=True,
    stratify=y_shuffled,
    random_state=42,
)





In [None]:
# Preview the first training labels as the cell's displayed value rather
# than printing the whole Series
y_train.head()

In [None]:
# with pd.option_context('display.max_rows', None):
#     print(y)

In [None]:
def create_pipeline(classifier):
    """Wrap ``classifier`` in a three-step scikit-learn ``Pipeline``.

    The steps run in this order:

    1. ``'scaler'``     - ``StandardScaler()``
    2. ``'imputer'``    - ``SimpleImputer(strategy='mean')``, which fills
       missing values with the mean
    3. ``'classifier'`` - the estimator passed in

    Returns the unfitted pipeline.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', classifier),
    ]
    return Pipeline(steps=steps)


In [None]:
# Estimators under evaluation, keyed by the name shown in the printed report
classifiers = {'Support Vector Machine': SVC(random_state=42)}

# Fit each estimator on the training split and report on the held-out split
for name, classifier in classifiers.items():
    # Pipeline.fit returns the fitted pipeline itself, so build and fit in one step
    pipeline = create_pipeline(classifier).fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(y_pred)
    # One print call, newline-separated: header, accuracy, per-class report, rule
    print(
        f"{name} Performance:",
        f"Accuracy: {accuracy_score(y_test, y_pred):.2f}",
        classification_report(y_test, y_pred),
        "-" * 60,
        sep="\n",
    )

In [None]:
# StratifiedKFold is used below, so it must be imported in this cell;
# previously it was only imported further down the notebook, which raises
# NameError when the notebook is run top to bottom on a fresh kernel.
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Define the StratifiedKFold cross-validator with shuffling
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Score a fresh pipeline per classifier on the full feature table and
# report mean ± standard deviation of the fold scores
for name, classifier in classifiers.items():
    pipeline = create_pipeline(classifier)
    cv_scores = cross_val_score(pipeline, X, y, cv=cv)
    print(f"{name} Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None,
                        shuffle=True, random_state=42):
    """Plot mean training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model (or pipeline) passed to ``learning_curve``.
    title : str
        Figure title.
    X, y : array-like
        Features and labels passed to ``learning_curve``.
    cv : int, cross-validation splitter or None, default None
        Forwarded to ``learning_curve``; ``None`` uses
        ``StratifiedKFold(n_splits=5)``.
    n_jobs : int or None, default None
        Forwarded to ``learning_curve``.
    shuffle : bool, default True
        Forwarded to ``learning_curve``. scikit-learn's own default is
        ``False``; when rows are grouped by class (for example a dataset
        assembled one class after another) the smaller training subsets can
        then end up containing a single class, so shuffling is enabled here.
    random_state : int or None, default 42
        Seed forwarded to ``learning_curve`` so the shuffled subsets are
        repeatable. Not forwarded when ``shuffle`` is False.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=5)
    train_sizes, train_scores, val_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=np.linspace(0.1, 1.0, 5),
        shuffle=shuffle,
        # random_state only has meaning when shuffling is on
        random_state=random_state if shuffle else None,
    )
    # Average over the cross-validation folds (axis 1) for each training size
    train_scores_mean = np.mean(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)

    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, val_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.legend(loc="best")
    plt.show()

# Draw one learning curve per classifier: build a fresh pipeline around it
# and pass cv=10 through to plot_learning_curve
for name, classifier in classifiers.items():
    pipeline = create_pipeline(classifier)
    plot_learning_curve(pipeline, f"{name} Learning Curve", X, y, cv=10)


In [None]:
!pip install opensmile

In [None]:
import opensmile
def ldd_extract_features(file_path, smile=None):
  """Extract openSMILE GeMAPSv01b functionals for one audio file.

  Parameters
  ----------
  file_path : str
      Audio file passed to ``Smile.process_file``.
  smile : opensmile.Smile or None, default None
      Extractor to use. ``None`` builds one configured with the
      ``GeMAPSv01b`` feature set at the ``Functionals`` level; pass an
      existing instance to avoid rebuilding it for every file.

  Returns
  -------
  pandas.Series
      First row of the table returned by ``process_file``, indexed by
      feature name. Previously the function computed the features but had
      no ``return``, so every caller received ``None``.
  """
  if smile is None:
    smile = opensmile.Smile(
      feature_set=opensmile.FeatureSet.GeMAPSv01b,
      feature_level=opensmile.FeatureLevel.Functionals,
    )
  features = smile.process_file(file_path)
  # opensmile documents process_file as returning a pandas DataFrame; at the
  # Functionals level it is expected to hold one row per file (TODO confirm).
  # Returning that row lets a list of results stack into one row per file.
  return features.iloc[0]

In [None]:
# Directories containing audio files
robocall_dir = '/content/drive/MyDrive/Colab Notebooks/robocall_dataset/dataset/robocall'
normal_call_dir = '/content/drive/MyDrive/Colab Notebooks/robocall_dataset/dataset/normal_call'

# Initialize lists to hold features and labels
ldd_features_list = []
ldd_labels_list = []

# One pass per class instead of two copy-pasted loops.
# Label convention: 1 = robocall, 0 = normal call.
for class_dir, class_label in ((robocall_dir, 1), (normal_call_dir, 0)):
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the saved CSV identical from run to run.
    for file_name in sorted(os.listdir(class_dir)):
        # Compare the extension case-insensitively so '.WAV' files are not
        # silently left out of the dataset.
        if not file_name.lower().endswith('.wav'):
            continue
        file_path = os.path.join(class_dir, file_name)
        features = ldd_extract_features(file_path)
        ldd_features_list.append(features)
        ldd_labels_list.append(class_label)

# Create DataFrame: one row per audio file, feature columns plus 'label'
# NOTE: this rebinds `df`, replacing whatever feature table it held before.
df = pd.DataFrame(ldd_features_list)
df['label'] = ldd_labels_list

# Save to CSV
df.to_csv('/content/drive/MyDrive/Colab Notebooks/robocall_dataset/ldd_audio_features.csv', index=False)
