<a href="https://colab.research.google.com/github/ganeshgaiy/Robocall-classification/blob/main/robocall_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install librosa



In [1]:
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
import librosa
import numpy as np

def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)

    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)

    # Calculate suitable fmin and n_bands for spectral contrast
    nyquist = sr / 2
    fmin = 200.0  # Starting frequency for the first band
    n_bands = int(np.floor(np.log2(nyquist / fmin)))  # Number of octaves within Nyquist limit

    # Extract Spectral Contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, fmin=fmin, n_bands=n_bands)
    spectral_contrast_mean = np.mean(spectral_contrast, axis=1)

    # Extract Chroma Features
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)

    # Combine features
    features = np.hstack([mfccs_mean, spectral_contrast_mean, chroma_mean])

    return features


In [3]:
print(os.getcwd())


/content


In [4]:
# Directories containing audio files
robocall_dir = '/content/drive/MyDrive/Colab Notebooks/robocall_dataset/dataset/robocall'
normal_call_dir = '/content/drive/MyDrive/Colab Notebooks/robocall_dataset/dataset/normal_call'

# Initialize lists to hold features and labels
features_list = []
labels_list = []

# Process robocall files
for file_name in os.listdir(robocall_dir):
    if file_name.endswith('.wav'):
        file_path = os.path.join(robocall_dir, file_name)
        features = extract_features(file_path)
        features_list.append(features)
        labels_list.append(1)  # Label for robocall

# Process normal call files
for file_name in os.listdir(normal_call_dir):
    if file_name.endswith('.wav'):
        file_path = os.path.join(normal_call_dir, file_name)
        features = extract_features(file_path)
        features_list.append(features)
        labels_list.append(0)  # Label for normal call

# Create DataFrame
df = pd.DataFrame(features_list)
df['label'] = labels_list

# Save to CSV
df.to_csv('audio_features.csv', index=False)




In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('audio_features.csv')



In [8]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,1343
0,1000


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('audio_features.csv')

# Separate features and labels
X = df.drop(columns=['label'])
y = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)





In [19]:
# Define a pipeline with imputation and the classifier
def create_pipeline(classifier):
    return Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
        ('classifier', classifier)
    ])


In [20]:
# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42)
}

# Train and evaluate each classifier
for name, classifier in classifiers.items():
    pipeline = create_pipeline(classifier)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"{name} Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))
    print("-" * 60)

Logistic Regression Performance:
Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       200
           1       0.99      0.99      0.99       269

    accuracy                           0.99       469
   macro avg       0.98      0.99      0.98       469
weighted avg       0.99      0.99      0.99       469

------------------------------------------------------------
Random Forest Performance:
Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       200
           1       1.00      0.98      0.99       269

    accuracy                           0.99       469
   macro avg       0.99      0.99      0.99       469
weighted avg       0.99      0.99      0.99       469

------------------------------------------------------------
Support Vector Machine Performance:
Accuracy: 1.00
              precision    recall  f1-score   support

           0       0.99     

In [21]:
from sklearn.model_selection import cross_val_score

for name, classifier in classifiers.items():
    pipeline = create_pipeline(classifier)
    cv_scores = cross_val_score(pipeline, X, y, cv=5)
    print(f"{name} Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")


Logistic Regression Cross-Validation Accuracy: 0.99 ± 0.00
Random Forest Cross-Validation Accuracy: 0.99 ± 0.01
Support Vector Machine Cross-Validation Accuracy: 0.99 ± 0.01


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2343 entries, 0 to 2342
Data columns (total 32 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       2343 non-null   float64
 1   1       2343 non-null   float64
 2   2       2343 non-null   float64
 3   3       2343 non-null   float64
 4   4       2343 non-null   float64
 5   5       2343 non-null   float64
 6   6       2343 non-null   float64
 7   7       2343 non-null   float64
 8   8       2343 non-null   float64
 9   9       2343 non-null   float64
 10  10      2343 non-null   float64
 11  11      2343 non-null   float64
 12  12      2343 non-null   float64
 13  13      2343 non-null   float64
 14  14      2343 non-null   float64
 15  15      2343 non-null   float64
 16  16      2343 non-null   float64
 17  17      2343 non-null   float64
 18  18      2343 non-null   float64
 19  19      2343 non-null   float64
 20  20      2343 non-null   float64
 21  21      2343 non-null   float64
 22  

In [24]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,label
count,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,...,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,2343.0,1279.0,2343.0
mean,-234.789806,100.173186,-25.613028,28.093008,-14.510391,-8.463459,-1.983148,-12.551267,-5.329242,-5.44997,...,0.345893,0.359811,0.371592,0.379921,0.389419,0.387463,0.384048,0.372127,0.349298,0.573197
std,77.682714,50.62771,23.758542,19.454938,12.399907,9.219945,11.574683,11.952102,8.143353,6.162696,...,0.093589,0.092506,0.09326,0.092602,0.091742,0.090488,0.090009,0.08693,0.084591,0.494719
min,-604.487244,-2.780348,-104.35656,-21.988764,-63.502537,-36.275547,-34.207588,-39.025547,-26.847109,-21.304554,...,0.099494,0.095631,0.115908,0.118544,0.152466,0.09898,0.096048,0.112608,0.090315,0.0
25%,-265.452866,54.231056,-41.961447,12.321279,-22.533933,-14.429836,-11.480419,-23.186339,-11.253987,-9.587296,...,0.288984,0.307834,0.31723,0.321956,0.328106,0.327557,0.319033,0.312738,0.29982,0.0
50%,-228.813934,96.188423,-26.769035,26.771696,-12.114866,-8.753443,0.853725,-12.79384,-5.070365,-5.583089,...,0.347007,0.366358,0.368353,0.369862,0.382678,0.381133,0.381939,0.37105,0.344698,1.0
75%,-182.849663,152.361923,-6.946549,44.963789,-6.052738,-1.984013,7.167053,-2.339484,-0.000659,-1.468124,...,0.406682,0.414847,0.429742,0.439179,0.442865,0.433707,0.433755,0.418121,0.39268,1.0
max,-21.2302,186.02034,35.916218,79.501732,16.865976,18.743698,21.668756,21.532761,18.681391,15.570399,...,0.729313,0.749485,0.772482,0.807577,1.0,0.845441,0.833203,0.778461,0.795198,1.0
