<a href="https://colab.research.google.com/github/crocodile27/AccentClassificationDL/blob/main/Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Download Dataset**

In [None]:
# Install Kaggle API
!pip install kaggle -q

# Download the dataset
!kaggle datasets download -d rtatman/speech-accent-archive

# Unzip the dataset
import zipfile

with zipfile.ZipFile('speech-accent-archive.zip', 'r') as zip_ref:
    zip_ref.extractall('dataset')

Dataset URL: https://www.kaggle.com/datasets/rtatman/speech-accent-archive
License(s): CC-BY-NC-SA-4.0
speech-accent-archive.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  speech-accent-archive.zip
replace reading-passage.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# **Data Exploration**

In [None]:
import os

# Directory that holds the extracted .mp3 recordings
recordings_dir = '/content/dataset/recordings/recordings'

# os.listdir returns entries in arbitrary order; sort them so that positional
# lookups such as recordings[0] pick the same file on every run.
recordings = sorted(os.listdir(recordings_dir))
print(recordings[:10])

['arabic14.mp3', 'english427.mp3', 'turkish22.mp3', 'frisian1.mp3', 'twi5.mp3', 'english436.mp3', 'english439.mp3', 'english71.mp3', 'belarusan1.mp3', 'romanian7.mp3']


In [None]:
# Read the reading-passage.txt file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open('dataset/reading-passage.txt', 'r', encoding='utf-8') as file:
    reading_passage = file.read()

print(reading_passage)

Please call Stella.  Ask her to bring these things with her from the store:  Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.  We also need a small plastic snake and a big toy frog for the kids.  She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.



In [None]:
import pandas as pd

# Load the speaker metadata table and preview its first rows
speakers_csv_path = 'dataset/speakers_all.csv'
speakers_df = pd.read_csv(speakers_csv_path)
speakers_preview = speakers_df.head()
print(speakers_preview)

    age  age_onset              birthplace  filename native_language   sex  \
0  24.0       12.0         koussi, senegal   balanta         balanta  male   
1  18.0       10.0          buea, cameroon  cameroon        cameroon  male   
2  48.0        8.0  hong, adamawa, nigeria  fulfulde        fulfulde  male   
3  42.0       42.0   port-au-prince, haiti   haitian         haitian  male   
4  40.0       35.0   port-au-prince, haiti   haitian         haitian  male   

   speakerid   country  file_missing?  Unnamed: 9  Unnamed: 10 Unnamed: 11  
0        788   senegal           True         NaN          NaN         NaN  
1       1953  cameroon           True         NaN          NaN         NaN  
2       1037   nigeria           True         NaN          NaN         NaN  
3       1165     haiti           True         NaN          NaN         NaN  
4       1166     haiti           True         NaN          NaN         NaN  


In [None]:
from IPython.display import Audio

# Build the full path of the first listed recording and render a player for it
first_recording_name = recordings[0]
recording_file = os.path.join(recordings_dir, first_recording_name)
Audio(recording_file)

### **Filter the data to find the top accents**

In [None]:
# Tally how many speakers list each native language (used here as the accent label)
native_languages = speakers_df['native_language']
accent_counts = native_languages.value_counts()

print(accent_counts)

native_language
english     579
spanish     162
arabic      102
mandarin     65
french       63
           ... 
kalanga       1
kabyle        1
jola          1
irish         1
zulu          1
Name: count, Length: 214, dtype: int64


In [None]:
# Keep only the accents that have strictly more than this many speakers
min_speakers_exclusive = 25
has_enough_samples = accent_counts > min_speakers_exclusive
top_accents = accent_counts[has_enough_samples]
print(top_accents)

native_language
english       579
spanish       162
arabic        102
mandarin       65
french         63
korean         52
portuguese     48
russian        48
dutch          47
turkish        37
german         36
polish         34
italian        33
japanese       27
macedonian     26
Name: count, dtype: int64


In [None]:
# Drop the bookkeeping / empty columns from the speaker table.
# errors='ignore' keeps this cell re-runnable: speakers_df is reassigned below,
# so on a second execution the columns are already gone and a strict drop
# would raise KeyError.
columns_to_drop = ['file_missing?','Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
speakers_df = speakers_df.drop(columns=columns_to_drop, errors='ignore')

# Filter the dataframe to include only the top accents.
# Filtering after the drop gives filtered_speakers_df the same columns on
# every run (first execution and re-execution alike).
filtered_speakers_df = speakers_df[speakers_df['native_language'].isin(top_accents.index)]
print(speakers_df.head())

    age  age_onset              birthplace  filename native_language   sex  \
0  24.0       12.0         koussi, senegal   balanta         balanta  male   
1  18.0       10.0          buea, cameroon  cameroon        cameroon  male   
2  48.0        8.0  hong, adamawa, nigeria  fulfulde        fulfulde  male   
3  42.0       42.0   port-au-prince, haiti   haitian         haitian  male   
4  40.0       35.0   port-au-prince, haiti   haitian         haitian  male   

   speakerid   country  
0        788   senegal  
1       1953  cameroon  
2       1037   nigeria  
3       1165     haiti  
4       1166     haiti  


In [None]:
# Get the list of recordings corresponding to the top accents.
# The 'filename' column holds bare names (e.g. 'arabic1') while the files in
# recordings_dir carry an '.mp3' extension (e.g. 'arabic14.mp3'), so the
# extension has to be appended to obtain a path that exists on disk.
filtered_recordings = filtered_speakers_df['filename'].apply(
    lambda x: os.path.join(recordings_dir, f"{x}.mp3")
)

print(filtered_recordings.head())

70      /content/dataset/recordings/recordings/arabic1
71     /content/dataset/recordings/recordings/arabic10
72    /content/dataset/recordings/recordings/arabic100
73    /content/dataset/recordings/recordings/arabic101
74    /content/dataset/recordings/recordings/arabic102
Name: filename, dtype: object


In [None]:
from IPython.display import Audio


def _with_mp3_extension(path):
    """Return ``path`` unchanged if it already has an extension, else append '.mp3'."""
    return path if os.path.splitext(path)[1] else path + '.mp3'


# Play an audio recording from the filtered list.
# The recording is taken from filtered_recordings itself (not from the
# unfiltered directory listing), and the first path that actually exists on
# disk is used so a metadata row without a file cannot break playback.
recording_file = next(
    (
        candidate
        for candidate in map(_with_mp3_extension, filtered_recordings)
        if os.path.exists(candidate)
    ),
    None,
)

if recording_file is not None:
    display(Audio(recording_file))
else:
    print("No recordings found for the top accents.")

# **Vectorizing Audio files**

In [None]:
pip install -U flash-attn --no-build-isolation

Collecting flash-attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->flash-attn)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->flash-attn)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->flash-attn)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->flash-attn)
  

### MP3 to WAV


In [None]:
!pip install pydub


In [None]:
import os

from pydub import AudioSegment


def convert_mp3_to_wav(mp3_file, wav_file):
    """Decode an MP3 file and write it back out as WAV.

    Args:
        mp3_file: Source MP3, passed straight to ``AudioSegment.from_mp3``.
        wav_file: Destination for the WAV data, passed straight to
            ``AudioSegment.export``.

    Returns:
        None.
    """
    audio = AudioSegment.from_mp3(mp3_file)
    out_handle = audio.export(wav_file, format="wav")
    # export() returns the output file object. When the destination was given
    # as a path, that object was opened on our behalf, so close it here rather
    # than leaving it to the garbage collector. A caller-supplied file object
    # is left open for the caller to manage.
    if isinstance(wav_file, (str, os.PathLike)):
        out_handle.close()


# Original (misnamed) entry point, kept so existing calls continue to work:
# the function has always read MP3 input, not MP4.
convert_mp4_to_wav = convert_mp3_to_wav


In [None]:
from transformers import Wav2Vec2Config, Wav2Vec2Model

# Build a model with random (untrained) weights from a default
# facebook/wav2vec2-base-960h style configuration
model = Wav2Vec2Model(Wav2Vec2Config())

# Keep a handle on the configuration object the model carries
configuration = model.config

### Extract features

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Model

# Checkpoint to load; may be a Hugging Face Hub id or a local directory.
# This is the checkpoint named in the configuration cell's comments.
WAV2VEC2_CHECKPOINT = 'facebook/wav2vec2-base-960h'
# Sampling rate the checkpoint above was trained on (per its model card).
TARGET_SAMPLE_RATE = 16_000

# Load pre-trained wav2vec 2.0 model
wav2vec_model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)


def extract_features(wav_file):
    """Run a WAV file through the pre-trained wav2vec 2.0 model.

    Args:
        wav_file: Path (or file-like object) accepted by ``torchaudio.load``.

    Returns:
        The output object returned by calling ``wav2vec_model`` on the
        waveform; the per-frame embeddings are available on its
        ``last_hidden_state`` attribute.
    """
    waveform, sample_rate = torchaudio.load(wav_file)

    # torchaudio.load yields (channels, frames) while the model treats the
    # first axis as the batch axis, so collapse multi-channel audio to mono
    # to avoid each channel being processed as a separate utterance.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the sampling rate the checkpoint expects.
    if sample_rate != TARGET_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, TARGET_SAMPLE_RATE
        )

    with torch.no_grad():
        features = wav2vec_model(waveform)
    return features


# Modeling

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt

NUM_CLASSES = 10  # size of the replaced classification head (one per MNIST digit)

# Train on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def build_model(use_pretrained_weights=True):
    """Create a ResNet-18 whose final layer outputs NUM_CLASSES logits.

    Args:
        use_pretrained_weights: Start from torchvision's default pre-trained
            weights when True, from random initialisation when False.

    Returns:
        The model, not yet moved to any device.
    """
    weights = torchvision.models.ResNet18_Weights.DEFAULT if use_pretrained_weights else None
    net = torchvision.models.resnet18(weights=weights)
    net.fc = torch.nn.Linear(net.fc.in_features, NUM_CLASSES)
    return net


# MNIST images have one channel but ResNet-18's first convolution takes three,
# so replicate the grey channel before converting to a tensor.
mnist_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
])

# Load the data
train_data = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=mnist_transform)
test_data = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=mnist_transform)

# Create the data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=16, shuffle=False)

# Define the model
model = build_model(use_pretrained_weights=True).to(device)

# Define the loss function and optimizer.
# CrossEntropyLoss takes raw logits of shape (batch, classes) together with
# integer class labels of shape (batch,), which is what this loop provides.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model
epochs = 10
model.train()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        predictions = model(images)
        loss = criterion(predictions, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print the loss
        if i % 100 == 0:
            print('Epoch: {} Loss: {}'.format(epoch, loss.item()))

# Evaluate the model.
# eval() switches batch-norm to inference behaviour; no_grad() skips building
# the autograd graph, which is not needed for scoring.
correct = 0
total = 0
model.eval()
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        # The predicted class is the index of the largest logit
        predictions = model(images).argmax(dim=1)
        correct += (predictions == labels).sum().item()
        total += len(labels)

print('Accuracy: {}'.format(correct / total))

# Save the model
torch.save(model.state_dict(), './model.pth')

# Load the model.
# The architecture must be rebuilt with the same NUM_CLASSES-way head before
# loading, otherwise the saved fc weights do not match the stock 1000-way head.
model = build_model(use_pretrained_weights=False)
model.load_state_dict(torch.load('./model.pth', map_location=device))
model = model.to(device)
model.eval()

# Predict a single image
image = train_data[0][0]
image = image.unsqueeze(0)
with torch.no_grad():
    prediction = model(image.to(device)).argmax(dim=1).item()
print('Prediction: {}'.format(prediction))

# Plot the image and the prediction.
# All three channels are copies of the same grey image, so show the first one.
plt.imshow(image[0, 0], cmap='gray')
plt.title('Prediction: {}'.format(prediction))
plt.show()