In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-dataset/dataset/sample_submission.csv
/kaggle/input/shl-dataset/dataset/train.csv
/kaggle/input/shl-dataset/dataset/test.csv
/kaggle/input/shl-dataset/dataset/audios_test/audio_885.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_698.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_1176.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_1215.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_66.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_386.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_1026.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_330.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_72.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_858.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_107.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_820.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_300.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_435.wav
/kaggle/input/sh

In [7]:
!pip install transformers torchaudio pandas numpy xgboost scikit-learn matplotlib



In [8]:
import pandas as pd
import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import xgboost as xgb
import matplotlib.pyplot as plt
import os

In [9]:
!git clone https://huggingface.co/facebook/wav2vec2-base-960h

Cloning into 'wav2vec2-base-960h'...
remote: Enumerating objects: 118, done.[K
remote: Total 118 (delta 0), reused 0 (delta 0), pack-reused 118 (from 1)[K
Receiving objects: 100% (118/118), 15.08 KiB | 7.54 MiB/s, done.
Resolving deltas: 100% (67/67), done.
Filtering content: 100% (3/3), 1.05 GiB | 135.09 MiB/s, done.


In [10]:
MODEL_PATH = "/kaggle/working/wav2vec2-base-960h"

In [11]:
TRAIN_AUDIO_DIR = "/kaggle/input/shl-dataset/dataset/audios_train"
TEST_AUDIO_DIR = "/kaggle/input/shl-dataset/dataset/audios_test"

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
train_df = pd.read_csv('/kaggle/input/shl-dataset/dataset/train.csv')
test_df = pd.read_csv('/kaggle/input/shl-dataset/dataset/test.csv')

In [14]:
print("Train data sample:\n", train_df.head())
print("\nTest data sample:\n", test_df.head())

Train data sample:
          filename  label
0  audio_1261.wav    1.0
1   audio_942.wav    1.5
2  audio_1110.wav    1.5
3  audio_1024.wav    1.5
4   audio_538.wav    2.0

Test data sample:
          filename
0   audio_706.wav
1   audio_800.wav
2    audio_68.wav
3  audio_1267.wav
4   audio_683.wav


In [15]:
try:
    processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)
    model = Wav2Vec2Model.from_pretrained(MODEL_PATH).to(device)
    print("\nModel loaded successfully from local files")
except Exception as e:
    print(f"\nError loading model: {str(e)}")
    raise

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at /kaggle/working/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model loaded successfully from local files


In [16]:
def process_audio(audio_path):
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
            
        speech = waveform.numpy().squeeze()
        inputs = processor(
            speech, 
            sampling_rate=16000, 
            return_tensors="pt", 
            padding=True
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            
        return torch.mean(outputs.last_hidden_state, dim=1).squeeze().cpu().numpy()
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        return None

In [19]:
def extract_features(df, audio_dir):
    features = []
    valid_indices = []
    
    for idx, row in df.iterrows():
        audio_path = os.path.join(audio_dir, row['filename'])
        if not os.path.exists(audio_path):
            print(f"Missing file: {audio_path}")
            continue
            
        emb = process_audio(audio_path)
        if emb is not None and emb.shape == (768,):
            features.append(emb)
            valid_indices.append(idx)
        elif emb is not None:
            print(f"Unexpected shape {emb.shape} for {audio_path}")
    
    return np.array(features), df.iloc[valid_indices]['label'].values


In [20]:
print("Extracting training features...")
X_train, y_train = extract_features(train_df, TRAIN_AUDIO_DIR)
print(f"Retained {len(X_train)}/{len(train_df)} training samples")

Extracting training features...
Unexpected shape (2, 768) for /kaggle/input/shl-dataset/dataset/audios_train/audio_1069.wav
Unexpected shape (2, 768) for /kaggle/input/shl-dataset/dataset/audios_train/audio_142.wav
Unexpected shape (2, 768) for /kaggle/input/shl-dataset/dataset/audios_train/audio_265.wav
Unexpected shape (2, 768) for /kaggle/input/shl-dataset/dataset/audios_train/audio_978.wav
Retained 440/444 training samples


In [29]:
def extract_features1(df, audio_dir, is_test=False):
    features = []
    valid_files = []
    
    for _, row in df.iterrows():
        audio_path = os.path.join(audio_dir, row['filename'])
        if not os.path.exists(audio_path):
            print(f"Missing file: {audio_path}")
            continue
            
        emb = process_audio(audio_path)
        if emb is not None and emb.shape == (768,):
            features.append(emb)
            valid_files.append(row['filename'])
        elif emb is not None:
            print(f"Unexpected shape {emb.shape} for {audio_path}")
    
    if not is_test:
        # For training data: return features and corresponding scores
        valid_df = df[df['filename'].isin(valid_files)]
        return np.array(features), valid_df['label'].values
    else:
        # For test data: return features and valid filenames
        return np.array(features), valid_files

In [30]:
print("\nExtracting test features...")
X_test, _ = extract_features1(test_df, TEST_AUDIO_DIR, is_test = True)
print(f"Retained {len(X_test)}/{len(test_df)} test samples")


Extracting test features...
Unexpected shape (2, 768) for /kaggle/input/shl-dataset/dataset/audios_test/audio_159.wav
Retained 194/195 test samples


In [31]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [32]:
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

In [33]:
model_xgb.fit(
    X_train_split, y_train_split,
    eval_set=[(X_val_split, y_val_split)],
    early_stopping_rounds=20,
    verbose=10
)

[0]	validation_0-rmse:1.13688




[10]	validation_0-rmse:0.98159
[20]	validation_0-rmse:0.91270
[30]	validation_0-rmse:0.86726
[40]	validation_0-rmse:0.84400
[50]	validation_0-rmse:0.83291
[60]	validation_0-rmse:0.82604
[70]	validation_0-rmse:0.82064
[80]	validation_0-rmse:0.81635
[90]	validation_0-rmse:0.81551
[100]	validation_0-rmse:0.81200
[110]	validation_0-rmse:0.81134
[120]	validation_0-rmse:0.81002
[130]	validation_0-rmse:0.80943
[140]	validation_0-rmse:0.80884
[150]	validation_0-rmse:0.80865
[160]	validation_0-rmse:0.80850
[170]	validation_0-rmse:0.80844
[180]	validation_0-rmse:0.80831
[190]	validation_0-rmse:0.80821
[200]	validation_0-rmse:0.80834
[210]	validation_0-rmse:0.80823
[211]	validation_0-rmse:0.80820


In [35]:
y_pred_val = model_xgb.predict(X_val_split)
print("\nValidation Metrics:")
print(f"Pearson Correlation: {pearsonr(y_val_split, y_pred_val)[0]:.3f}")
print(f"MAE: {mean_absolute_error(y_val_split, y_pred_val):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_val_split, y_pred_val)):.3f}")
print(f"R²: {r2_score(y_val_split, y_pred_val):.3f}")


Validation Metrics:
Pearson Correlation: 0.726
MAE: 0.641
RMSE: 0.808
R²: 0.496


In [36]:
model_xgb_full = xgb.XGBRegressor(**model_xgb.get_params())
model_xgb_full.fit(X_train, y_train)
test_pred = model_xgb_full.predict(X_test)

In [114]:
test_pred

array([2.283736 , 3.077507 , 3.8344913, 2.8103576, 2.9851973, 3.1439705,
       2.805294 , 3.0098987, 3.809254 , 2.5333908, 4.0089593, 2.8711233,
       2.2458236, 2.7803092, 2.5848272, 3.8524003, 3.2458344, 2.5956657,
       2.6441472, 2.6666782, 2.9051847, 2.3541162, 2.9529095, 2.6588418,
       3.710727 , 3.7095172, 3.0199506, 2.4289646, 2.540497 , 2.2650635,
       2.7567043, 3.5214188, 3.2258775, 2.6169264, 3.172329 , 2.6633925,
       3.3045886, 2.6677783, 3.5227845, 2.8805273, 3.0557137, 3.1138039,
       3.3101223, 2.7968438, 3.0654714, 2.9360442, 2.840825 , 3.3632696,
       3.6854875, 4.07958  , 4.295764 , 3.7351675, 3.2002401, 2.3688438,
       2.9930165, 3.0834892, 2.9392698, 3.913046 , 2.7459004, 2.9083862,
       3.821058 , 4.2471094, 2.848333 , 4.030991 , 2.8120751, 2.4466026,
       3.8947072, 2.614863 , 2.6794713, 3.3246212, 2.8015692, 3.6482399,
       3.1244862, 3.7203999, 2.407901 , 2.9415426, 3.3674989, 3.182699 ,
       2.658993 , 3.2915146, 3.020752 , 3.1854072, 

In [173]:
submission_df

Unnamed: 0,filename,label
0,audio_706.wav,2.283736
1,audio_800.wav,3.077507
2,audio_68.wav,3.834491
3,audio_1267.wav,2.810358
4,audio_683.wav,2.985197
...,...,...
190,audio_135.wav,4.295243
191,audio_512.wav,4.231444
192,audio_529.wav,3.930158
193,audio_762.wav,4.750873


In [174]:
print(submission_df[submission_df['filename'] == 'audio_159.wav'])

          filename     label
111  audio_159.wav  3.880862


In [182]:
submission_df.to_csv("/kaggle/working/submission8.csv", index=False)

In [127]:
print(sf[sf['filename'] == 'audio_379.wav'])

          filename
194  audio_379.wav


In [128]:
def process_single_audio(audio_path):
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
        
        speech = waveform.numpy().squeeze()
        inputs = processor(
            speech, 
            sampling_rate=16000, 
            return_tensors="pt", 
            padding=True
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze().cpu().numpy()
        return embeddings
    
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        return None

In [134]:
suf = pd.read_csv('/kaggle/input/shl-dataset/dataset/test.csv')

In [135]:
print("Before processing:")
print(suf[suf['filename'] == 'audio_379.wav'])

Before processing:
          filename
194  audio_379.wav


In [136]:
audio_path = os.path.join(TEST_AUDIO_DIR, 'audio_379.wav')

In [137]:
if os.path.exists(audio_path):
    # Get embeddings
    embeddings = process_single_audio(audio_path)
    
    if embeddings is not None and embeddings.shape == (768,):
        # Predict score
        new_score = model_xgb_full.predict([embeddings])[0]
        
        # Update submission
        suf.loc[
            suf['filename'] == 'audio_379.wav',
            'label'
        ] = new_score
        
        print("\nAfter processing:")
        print(suf[suf['filename'] == 'audio_379.wav'])
    else:
        print("\nFailed to process audio_379.wav - using existing score")
else:
    print(f"\nFile not found: {audio_path}")


After processing:
          filename     label
194  audio_379.wav  4.750873


In [138]:
suf

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,filename,label
0,audio_706.wav,
1,audio_800.wav,
2,audio_68.wav,
3,audio_1267.wav,
4,audio_683.wav,
...,...,...
190,audio_135.wav,
191,audio_512.wav,
192,audio_529.wav,
193,audio_762.wav,


In [148]:
submission_df = submission_df.merge(
    pd.read_csv('/kaggle/input/shl-dataset/dataset/sample_submission.csv')[['filename']],
    on='filename',
    how='right'
).fillna(0)

In [180]:
submission_df.loc[submission_df['filename'] == 'audio_379.wav', 'label'] = suf['label'][len(suf) - 1]

In [181]:
submission_df

Unnamed: 0,filename,label
0,audio_706.wav,2.283736
1,audio_800.wav,3.077507
2,audio_68.wav,3.834491
3,audio_1267.wav,2.810358
4,audio_683.wav,2.985197
...,...,...
190,audio_135.wav,4.295243
191,audio_512.wav,4.231444
192,audio_529.wav,3.930158
193,audio_762.wav,4.750873


In [177]:
print(submission_df[submission_df['filename'] == 'audio_159.wav'])

          filename     label
111  audio_159.wav  3.880862


In [178]:
submission_df['label'][111]

3.880862