In [31]:
#%% [code]
# Import required libraries
import os
import warnings
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

# Fixed seed shared by every stochastic step so results are repeatable
RANDOM_STATE = 42

# Silence all warnings so that cell output stays readable
warnings.filterwarnings(action="ignore")


In [33]:
def extract_features(file_path, sr=22050, n_mfcc=13, duration=5):
    """
    Extracts a fixed-length audio feature vector from a file.

    The clip is summarised by the per-coefficient mean and standard deviation
    (taken along axis 1 of each feature matrix) of its MFCCs and of its
    chroma features.

    Parameters:
      file_path (str): Path to the audio file.
      sr (int or None): Sampling rate requested from librosa.load. The rate
          that librosa.load reports back is the one used for feature
          extraction, so sr=None (keep the file's native rate) also works.
      n_mfcc (int): Number of MFCCs to extract.
      duration (float): Duration (in seconds) to load. No padding is applied
          here if the file is shorter.

    Returns:
      np.array: 1-D feature vector laid out as
                [mfcc_mean, mfcc_std, chroma_mean, chroma_std].
                Total features = (n_mfcc*2 + 12*2).
                If the file cannot be processed (or contains no samples), the
                error is printed and a zero vector of the same length is
                returned instead of raising.
    """
    # Number of chroma bins. Passed explicitly to chroma_stft so the fallback
    # vector below can never drift out of sync with the real feature length.
    n_chroma = 12

    try:
        # Load audio file (only a defined duration for consistency).
        # Keep the sampling rate librosa actually used rather than assuming
        # it equals the requested `sr` (it differs when sr is None).
        audio, loaded_sr = librosa.load(file_path, sr=sr, duration=duration)

        # An empty clip carries no information; route it to the fallback
        # instead of relying on how the feature extractors treat empty input.
        if audio.size == 0:
            raise ValueError("audio file contains no samples")

        # Extract MFCC features and compute statistics
        mfccs = librosa.feature.mfcc(y=audio, sr=loaded_sr, n_mfcc=n_mfcc)
        mfccs_mean = np.mean(mfccs, axis=1)
        mfccs_std = np.std(mfccs, axis=1)

        # Extract Chroma features and compute statistics
        chroma = librosa.feature.chroma_stft(y=audio, sr=loaded_sr, n_chroma=n_chroma)
        chroma_mean = np.mean(chroma, axis=1)
        chroma_std = np.std(chroma, axis=1)

        # Combine features into a single vector
        features = np.concatenate((mfccs_mean, mfccs_std, chroma_mean, chroma_std))
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the pipeline
        # running with a placeholder row of the expected length.
        print(f"Error processing {file_path}: {e}")
        features = np.zeros(2 * n_mfcc + 2 * n_chroma)
    return features

# Quick test: Uncomment the following two lines to extract features from a sample file.
# sample_feature = extract_features('audios_train/sample.wav')
# print("Extracted feature vector shape:", sample_feature.shape)


In [35]:
# CSV metadata files (edit the names here if yours differ)
train_csv, test_csv, submission_csv = (
    "train_file.csv",
    "test_file.csv",
    "sample_submission.csv",
)

# Folders that hold the audio clips
audios_train_dir, audios_test_dir = "audios_train", "audios_test"

# Read each CSV, in the order listed, into its own DataFrame
train_df, test_df, submission_df = map(
    pd.read_csv, (train_csv, test_csv, submission_csv)
)

# Report the shape of the train and test tables
print("Train data samples:", train_df.shape)
print("Test data samples:", test_df.shape)


Train data samples: (444, 2)
Test data samples: (195, 1)


In [49]:
# Reserve 20% of the training rows as a hold-out set for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    random_state=RANDOM_STATE,
    test_size=0.2,
)

# Fit a 100-tree random forest on the remaining rows
# (fit() returns the estimator itself, so construction and fitting are chained)
model = RandomForestRegressor(
    random_state=RANDOM_STATE,
    n_estimators=100,
).fit(X_tr, y_tr)

# Score the hold-out set: RMSE is the square root of the mean squared error
y_val_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_val_pred))
print("Validation RMSE:", rmse)


Validation RMSE: 1.05777684505409


In [51]:
# Refit on every training row (no hold-out) to produce the final model;
# fit() returns the estimator itself, so it can be chained onto construction
final_model = RandomForestRegressor(
    random_state=RANDOM_STATE,
    n_estimators=100,
).fit(X_train, y_train)

# Persist the fitted estimator to disk so it can be reloaded later
# (deployment etc.)
joblib.dump(final_model, filename="grammar_scoring_model.joblib")
print("Final model saved as 'grammar_scoring_model.joblib'")


Final model saved as 'grammar_scoring_model.joblib'


In [53]:
# Score the test features with the final model
predictions = final_model.predict(X_test)

# Assumption: the submission file needs one row per test file, holding the
# file name and its predicted score.
submission = pd.DataFrame(
    dict(file_name=test_file_names, score=predictions)
)

# Write without the DataFrame index
submission.to_csv("final_submission.csv", index=False)
print("Submission CSV saved as 'final_submission.csv'")


Submission CSV saved as 'final_submission.csv'
