In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indic-tts-deepfake-challenge/sample.csv


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the IndicTTS deepfake-challenge dataset and pull out its two splits
dataset = load_dataset(path="SherryT997/IndicTTS-Deepfake-Challenge-Data")
train_data, test_data = dataset["train"], dataset["test"]

# Preview: the column schema first, then the first training record
print(train_data.features)
print(train_data[0])

In [3]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from transformers import Trainer, TrainingArguments, AutoProcessor, AutoFeatureExtractor, AutoModelForAudioClassification
from datasets import load_dataset, Audio

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [4]:
# Checkpoint shared by the feature extractor and the classifier
model_id = "microsoft/wavlm-large"

# Feature extractor created with do_normalize and return_attention_mask switched on
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id,
    return_attention_mask=True,
    do_normalize=True,
)

# Audio-classification model configured with two output labels
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=2,
)

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import torch
import torch.nn.functional as F
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
def processing_audio(batch, max_length=16000, sampling_rate=16000):
    """Turn one dataset example into fixed-length model inputs plus an integer label.

    The waveform is zero-padded on the right, or truncated, so that it holds
    exactly ``max_length`` samples, and is then passed through the module-level
    ``feature_extractor``.

    Args:
        batch: A single example providing ``batch["audio"]["array"]`` and
            ``batch["is_tts"]``.
        max_length: Number of samples kept per clip. Defaults to 16000.
        sampling_rate: Sampling rate reported to the feature extractor.
            Defaults to 16000.
            NOTE(review): the example's own sampling rate is not checked here;
            confirm the dataset audio really is at this rate.

    Returns:
        The same mapping with ``input_values`` and ``labels`` added.
    """
    waveform = np.asarray(batch["audio"]["array"])

    # Force every clip to the same length: right-pad short ones with zeros, cut long ones
    missing = max_length - len(waveform)
    if missing > 0:
        waveform = np.pad(waveform, (0, missing), mode="constant")
    else:
        waveform = waveform[:max_length]

    inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    # The extractor returns a batch of one; keep only that row
    batch["input_values"] = inputs.input_values[0]
    # Class index for the two-label classifier, stored as a plain integer
    batch["labels"] = int(batch["is_tts"])

    return batch

# Run the preprocessing over the training split and drop the raw columns afterwards
train_dataset = train_data.map(
    function=processing_audio,
    remove_columns=["audio", "text", "id", "language", "is_tts"],
)

Map:   0%|          | 0/31102 [00:00<?, ? examples/s]

In [7]:
# Serialise the preprocessed training set to a file in the current working directory
torch.save(obj=train_dataset, f='train_dataset_wavlm_large.pt')

In [8]:
# Hold out 10% of the preprocessed examples for evaluation, shuffling with a fixed seed
dataset = train_dataset.train_test_split(
    seed=42,
    shuffle=True,
    test_size=0.1,
)

In [9]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [10]:
from transformers import DataCollatorWithPadding
from evaluate import load
from dataclasses import dataclass
from typing import List, Dict, Union

from typing import Any


@dataclass
class DataCollatorWithPadding:
    """Pad a list of audio examples into one batch and attach integer labels.

    NOTE: this class reuses the name of the ``DataCollatorWithPadding`` imported
    from ``transformers`` in this cell, so it replaces that import in the
    notebook namespace.
    """

    # Object exposing ``pad(features, padding=..., return_tensors=...)``;
    # the notebook passes its ``feature_extractor`` here.
    processor: Any
    # Padding strategy forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: Examples that each provide ``input_values`` and ``labels``.

        Returns:
            Whatever ``processor.pad`` returns for the ``input_values``, with a
            ``labels`` entry added as a ``torch.long`` tensor.
        """
        # Only the waveforms are handed to the padder; labels are handled separately below
        input_features = [{"input_values": feature["input_values"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Labels may arrive as floats (or 0-d tensors); coerce each to an int class index
        batch["labels"] = torch.tensor([int(feature["labels"]) for feature in features], dtype=torch.long)

        return batch

# Collator instance used for batching; it pads through the feature extractor
data_collator = DataCollatorWithPadding(processor=feature_extractor, padding=True)

In [11]:
import numpy as np
from evaluate import load
from scipy.special import softmax

# Load the five metrics by name, in the same order as the variables they are bound to
(
    accuracy_metric,
    precision_metric,
    recall_metric,
    f1_metric,
    roc_auc_metric,
) = (load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1", "roc_auc"))


def compute_metrics(eval_pred):
    """Compute accuracy, binary F1 and ROC-AUC from evaluation logits.

    Args:
        eval_pred: Object exposing ``predictions`` (logits whose last axis
            indexes the classes) and ``label_ids`` (reference labels).

    Returns:
        dict with the keys ``accuracy``, ``f1`` and ``roc_auc``.
    """
    labels = eval_pred.label_ids

    # Probability assigned to class index 1
    positive_probs = softmax(eval_pred.predictions, axis=-1)[:, 1]

    # Hard 0/1 decisions, computed once and as integers. For values in [0, 1],
    # `> 0.5` selects the same rows as numpy's `round()` (which sends 0.5 to 0).
    predicted = (positive_probs > 0.5).astype(np.int64)

    accuracy = accuracy_metric.compute(predictions=predicted, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predicted, references=labels, average="binary")["f1"]
    # ROC-AUC is scored on the probabilities, not on the thresholded decisions
    roc_auc = roc_auc_metric.compute(prediction_scores=positive_probs, references=labels)["roc_auc"]

    return {"accuracy": accuracy, "f1": f1, "roc_auc": roc_auc}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [12]:
# Place the model on the device chosen earlier (CUDA when available, CPU otherwise)
# instead of hard-coding "cuda", which raises on CPU-only sessions.
model.to(device)
# Freeze the feature encoder so its parameters are not updated during fine-tuning
model.freeze_feature_encoder()

In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="data_classify",
    group_by_length=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # `eval_strategy` is the current spelling; `evaluation_strategy` is deprecated
    eval_strategy="steps",
    num_train_epochs=1,
    # Mixed precision needs a CUDA device, so only enable it when one is present
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    # With load_best_model_at_end, save_steps must be a round multiple of eval_steps
    save_steps=1000,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=3,
    load_best_model_at_end=True,
    save_strategy="steps",
    # Disable reporting to external experiment trackers
    report_to="none",
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `dataset` here is the train/test split of the preprocessed training data
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # `processing_class` supersedes the deprecated `tokenizer` argument
    processing_class=feature_extractor,
)
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
500,0.5999,0.432057,0.809065,0.774144,0.956662
1000,0.3113,0.318066,0.91964,0.915825,0.98724
1500,0.2389,0.152352,0.965606,0.965804,0.994087
2000,0.1656,0.049796,0.984571,0.984896,0.998668
2500,0.1267,0.181047,0.956927,0.956522,0.998646
3000,0.079,0.079921,0.981035,0.981311,0.998729


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=3499, training_loss=0.22596901203775854, metrics={'train_runtime': 1745.6619, 'train_samples_per_second': 16.035, 'train_steps_per_second': 2.004, 'total_flos': 8.48358162602304e+17, 'train_loss': 0.22596901203775854, 'epoch': 1.0})

In [14]:
import pandas as pd
from tqdm import tqdm

# Put the model in evaluation mode before scoring the test split
model.eval()
ids = []
probs = []

# NOTE(review): training pads/truncates every waveform to 16000 samples (see
# processing_audio), whereas this loop feeds the full-length audio. Confirm the
# mismatch is intended before relying on these scores.
for sample in tqdm(test_data):
    inputs = feature_extractor(sample["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**inputs).logits

    # Probability of class index 1, reported as `is_tts`; one example per step, so .item() is safe
    prob_synth = F.softmax(logits, dim=1)[:, 1].item()

    # The loop variable is deliberately not called `id`, which would shadow the builtin
    ids.append(sample["id"])
    probs.append(prob_synth)


df = pd.DataFrame({"id": ids, "is_tts": probs})

100%|██████████| 2635/2635 [03:38<00:00, 12.08it/s]


In [15]:
# Write the predictions to the Kaggle working directory, leaving out the DataFrame index
df.to_csv(path_or_buf="/kaggle/working/submission.csv", index=False)