# Data preprocessing for the Zindi/GIZ project

# Initialization



In [None]:
from google.colab import files

In [None]:
# Mount Google Drive under /content/drive; every CSV and audio path used in this
# notebook lives below "/content/drive/My Drive/". force_remount=True asks Colab
# to mount again even if a mount is already present.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

## Install and load libraries

In [None]:
!pip install -q fastai fastcore --upgrade # Make sure we have the new version
!pip install -q python_speech_features
!pip install python_speech_features

[K     |████████████████████████████████| 194kB 6.8MB/s 
[K     |████████████████████████████████| 51kB 6.5MB/s 
[?25h  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import IPython.display as ipd
import librosa
import python_speech_features as psf
from matplotlib import pyplot as plt
import numpy as np
from os import listdir
from os.path import isfile, join
import os

## Read the data

In [None]:
# Copy the files in and unzip
# !cp 'drive/My Drive/audio_files.zip' audio_files2.zip
# !unzip -q audio_files2.zip

Files:


*   train_full.csv: List of all audio files with their path ('fn'), some of them labeled



In [None]:
# Load the new "train_full.csv" data
train_full = pd.read_csv("/content/drive/My Drive/ZINDI/train_full.csv")

# Adjust name: rows may reference either 'audio_train/' or 'audio_files/';
# both are pointed at the same audio folder on Drive.
# regex=False makes the replacement a plain substring replacement regardless of
# the pandas version (the default of `regex` changed from True to False in 2.0).
for old_prefix in ('audio_train/', 'audio_files/'):
  train_full['fn'] = train_full['fn'].str.replace(
      old_prefix, '/content/drive/My Drive/ZINDI/data/audio_train/', regex=False)
train_full.head()

Unnamed: 0,fn,label
0,/content/drive/My Drive/ZINDI/data/audio_train...,akawuka
1,/content/drive/My Drive/ZINDI/data/audio_train...,banana
2,/content/drive/My Drive/ZINDI/data/audio_train...,obulwadde
3,/content/drive/My Drive/ZINDI/data/audio_train...,nnyaanya
4,/content/drive/My Drive/ZINDI/data/audio_train...,pampu


Load the old train data

In [None]:
# Read the original training list (columns shown below: fn, label)
train1 = pd.read_csv("/content/drive/My Drive/ZINDI/Train.csv")

# Adjust name: turn the relative 'audio_files/' prefix into the absolute Drive path.
# regex=False makes this a plain substring replacement regardless of the pandas
# version (the default of `regex` changed from True to False in 2.0).
train1['fn'] = train1['fn'].str.replace(
    'audio_files/', '/content/drive/My Drive/ZINDI/data/audio_train/', regex=False)
train1.head()

Unnamed: 0,fn,label
0,/content/drive/My Drive/ZINDI/data/audio_train...,akawuka
1,/content/drive/My Drive/ZINDI/data/audio_train...,banana
2,/content/drive/My Drive/ZINDI/data/audio_train...,obulwadde
3,/content/drive/My Drive/ZINDI/data/audio_train...,nnyaanya
4,/content/drive/My Drive/ZINDI/data/audio_train...,pampu


Add the new train data

In [None]:
# Read the additional labelled clips
train2 = pd.read_csv('/content/drive/My Drive/ZINDI/labels_new.csv')

# Adjust name: turn the relative 'audio_files2/' prefix into the absolute Drive path.
# regex=False makes this a plain substring replacement regardless of the pandas
# version (the default of `regex` changed from True to False in 2.0).
train2['fn'] = train2['fn'].str.replace(
    'audio_files2/', '/content/drive/My Drive/ZINDI/data/audio_train/', regex=False)

# Only the last expression of a cell is displayed, so a single preview suffices.
train2.head()

Unnamed: 0,fn,label
0,/content/drive/My Drive/ZINDI/data/audio_train...,abalimi
1,/content/drive/My Drive/ZINDI/data/audio_train...,abalimi
2,/content/drive/My Drive/ZINDI/data/audio_train...,abalimi
3,/content/drive/My Drive/ZINDI/data/audio_train...,abalimi
4,/content/drive/My Drive/ZINDI/data/audio_train...,abalimi


Combine both into one long list

In [None]:
# Stack the old and the new training lists.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. As with append, the original row labels of both
# frames are kept, so index values can repeat in `train`.
train = pd.concat([train1, train2])
train.head()

Unnamed: 0,fn,label
0,/content/drive/My Drive/ZINDI/data/audio_train...,akawuka
1,/content/drive/My Drive/ZINDI/data/audio_train...,banana
2,/content/drive/My Drive/ZINDI/data/audio_train...,obulwadde
3,/content/drive/My Drive/ZINDI/data/audio_train...,nnyaanya
4,/content/drive/My Drive/ZINDI/data/audio_train...,pampu


Load the test data

In [None]:
# Read the sample submission; its 'fn' column lists the test clips
test = pd.read_csv('/content/drive/My Drive/ZINDI/SampleSubmission.csv')

# Adjust name: turn the relative 'audio_files/' prefix into the absolute Drive path.
# regex=False makes this a plain substring replacement regardless of the pandas
# version (the default of `regex` changed from True to False in 2.0).
test['fn'] = test['fn'].str.replace(
    'audio_files/', '/content/drive/My Drive/ZINDI/data_new_order/audio_test/', regex=False)

# Only the last expression of a cell is displayed, so a single preview suffices.
test.head()

Unnamed: 0,fn,maize streak virus,disease,okukkoola,muwogo,mpeke,mucungwa,greens,garden,mango,bulimi,obuwuka,ebikoola,obulimi,ebisoolisooli,kaamulali,eddagala,beans,omuyembe,leaf,kisaanyi,leaves,butterfly,okuzifuuyira,micungwa,ppaapaali,emboga,kikolo,harvest,olusuku,coffee,super grow,rice,ensujju,okulima,worm,ebbugga,onion,ensigo,plantation,...,ejjobyo,omulimi,okusimba,sweet potatoes,okufuuyira,farming instructor,nnasale beedi,passion fruit,ekitooke,ebisaanyi,ekyeya,enva endiirwa,emisiri,emiyembe,amatooke,ebiwuka,farm,ebinyebwa,amappapaali,ebimera,kassooli,harvesting,emmwanyi,akamonde,obumonde,cabbages,akasaanyi,spread,ebirime,drought,kasaanyi,suckers,insects,fertilizer,nakavundira,ekiwojjolo,akawuka,ddagala,ebiwojjolo,obutungulu
0,/content/drive/My Drive/ZINDI/data_new_order/a...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,/content/drive/My Drive/ZINDI/data_new_order/a...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,/content/drive/My Drive/ZINDI/data_new_order/a...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,/content/drive/My Drive/ZINDI/data_new_order/a...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,/content/drive/My Drive/ZINDI/data_new_order/a...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Load the augmented data

In [None]:
# Read the list of augmented clips
augmented = pd.read_csv('/content/drive/My Drive/ZINDI/train_augmented_data.csv')

# Adjust name: turn the relative 'audio_train/' prefix into the absolute path of
# the augmented-audio folder on Drive.
# regex=False makes this a plain substring replacement regardless of the pandas
# version (the default of `regex` changed from True to False in 2.0).
augmented['fn'] = augmented['fn'].str.replace(
    'audio_train/', '/content/drive/My Drive/ZINDI/data/audio_augmented/', regex=False)

# Only the last expression of a cell is displayed, so a single preview suffices.
augmented.head()


Unnamed: 0.1,Unnamed: 0,fn,label
0,1,/content/drive/My Drive/ZINDI/data/audio_augme...,akawuka
1,2,/content/drive/My Drive/ZINDI/data/audio_augme...,banana
2,3,/content/drive/My Drive/ZINDI/data/audio_augme...,obulwadde
3,4,/content/drive/My Drive/ZINDI/data/audio_augme...,nnyaanya
4,5,/content/drive/My Drive/ZINDI/data/audio_augme...,pampu


# Get spectrograms

In [None]:
# Define the function

def im_from_audio(fn, sample_rate=44100, window_length=0.05, window_step=0.0045, NFFT=2205):
  """Turn one audio file into a normalized magnitude-spectrogram array.

  Steps: load the audio at `sample_rate`, apply pre-emphasis (coeff 0.95),
  cut the signal into overlapping rectangular-window frames, take the
  magnitude spectrum of each frame, rotate so that frames run along the
  columns, drop the first 512 rows, min-max scale, and crop to at most
  512 x 512.

  Args:
    fn: Path of the audio file to load.
    sample_rate: Rate (Hz) the audio is resampled to while loading.
    window_length: Frame length in seconds.
    window_step: Step between the starts of consecutive frames, in seconds.
    NFFT: FFT length passed to `psf.sigproc.magspec`.

  Returns:
    2-D numpy array with at most 512 rows and 512 columns. Values lie in
    [0, 1]; if the spectrogram is constant (e.g. silent audio) the array is
    all zeros instead of NaN.
  """
  # Load the audio into an array (signal) at the specified sample rate.
  # The returned rate equals `sample_rate`, so it is not needed again.
  signal, _ = librosa.load(fn, sr=sample_rate)

  # Pre-emphasis
  signal = psf.sigproc.preemphasis(signal, coeff=0.95)

  # Split into frames; window length/step are converted from seconds to samples.
  frames = psf.sigproc.framesig(signal,
                                window_length*sample_rate,
                                window_step*sample_rate,
                                lambda x: np.ones((x,)))  # rectangular window

  # Magnitude spectrogram. np.rot90 turns (frames, bins) into (bins, frames)
  # with the bin order reversed, so the last (highest) bin ends up in row 0.
  spectrogram = np.rot90(psf.sigproc.magspec(frames, NFFT))

  # Get rid of high frequencies: after the rotation these are the first rows.
  spectrogram = spectrogram[512:, :]

  # Normalize to [0, 1]. Guard the division: a constant spectrogram has a peak
  # of 0 after the shift and would otherwise become all-NaN.
  spectrogram -= spectrogram.min(axis=None)
  peak = spectrogram.max(axis=None)
  if peak > 0:
    spectrogram /= peak

  # Clip to max 512, 512
  spectrogram = spectrogram[:512, :512]

  return spectrogram


1) Apply the function to the `train_full` data

In [None]:
# Quick smoke test: build the spectrogram of one randomly drawn training clip
im_from_audio(train_full['fn'].sample().iloc[0])

In [None]:
# Load all files and transform them into spectrograms
audio_dir = '/content/drive/My Drive/ZINDI/data/audio_train/'
out_dir = '/content/drive/My Drive/ZINDI/data/spectrogram_train/'

# plt.imsave does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for i in train_full.fn:
  # "<audio_dir>/<clip>.wav" -> "<out_dir>/<clip>-spectrogram.png"
  interim1 = i.replace('.wav', '')
  interim2 = interim1.replace(audio_dir, '')
  name = out_dir + interim2 + '-spectrogram.png'
  plt.imsave(fname = name, arr = im_from_audio(i))

Check if it worked:

In [None]:
# Collect the names of all regular files in the train spectrogram folder
mypath = "/content/drive/My Drive/ZINDI/data/spectrogram_train/"
files = list(filter(lambda entry: isfile(join(mypath, entry)), listdir(mypath)))

# Number of spectrograms found (4709 when this notebook was last run)
len(files)

4709

2) Apply the function to the test (validation) data

In [None]:
# Load all files and transform them into spectrograms
audio_dir = '/content/drive/My Drive/ZINDI/data_new_order/audio_test/'
out_dir = '/content/drive/My Drive/ZINDI/data_new_order/spectrogram_test/'

# plt.imsave does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for i in test.fn:
  # "<audio_dir>/<clip>.wav" -> "<out_dir>/<clip>-spectrogram.png"
  interim1 = i.replace('.wav', '')
  interim2 = interim1.replace(audio_dir, '')
  name = out_dir + interim2 + '-spectrogram.png'
  plt.imsave(fname = name, arr = im_from_audio(i))

Check if it worked:


In [None]:
# Collect the names of all regular files in the test spectrogram folder
mypath = "/content/drive/My Drive/ZINDI/data_new_order/spectrogram_test/"
files = list(filter(lambda entry: isfile(join(mypath, entry)), listdir(mypath)))

# Number of spectrograms found (1017 when this notebook was last run)
len(files)

1017

3) Apply the function to the augmented data

In [None]:
# Load all files and transform them into spectrograms
audio_dir = '/content/drive/My Drive/ZINDI/data/audio_augmented/'
out_dir = '/content/drive/My Drive/ZINDI/data/spectrogram_augmented/'

# plt.imsave does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for i in augmented.fn:
  # "<audio_dir>/<clip>.wav" -> "<out_dir>/<clip>-spectrogram.png"
  interim1 = i.replace('.wav', '')
  interim2 = interim1.replace(audio_dir, '')
  name = out_dir + interim2 + '-spectrogram.png'
  plt.imsave(fname = name, arr = im_from_audio(i))

## Validate the number of spectrograms

We initially suspected that some spectrograms were missing for the test data, but none were. The cell below is the code we used to check; note that it points at an earlier folder layout (`My Drive/spectogram-test`) rather than the paths used above.

In [None]:
# Check for the missing names in the validation (test) data set

# Access the file names in "spectogram-test"
mypath = "/content/drive/My Drive/spectogram-test"
files = [f for f in os.listdir(mypath) if isfile(join(mypath, f))]
# len(files) was 1017 for the spectogram-test folder

# Load test data.
# NOTE: this re-reads the CSV and rebinds `test`, so the 'fn' column holds the
# raw 'audio_files/...' names again (not the adjusted Drive paths).
test = pd.read_csv('drive/My Drive/SampleSubmission.csv')

# Check for the names in the validation (test) data set
# 1) Generate interim name where we replace .wav with ''.
#    regex=False: with a regex, the '.' in '.wav' would match ANY character,
#    so e.g. 'xwav' inside a name would be stripped as well.
interim1 = test['fn'].str.replace('.wav', '', regex=False)
# 2) Generate interim where we then replace "audio_files/" with ''
interim2 = interim1.str.replace('audio_files/', '', regex=False)
# Combine interim2 with "-spectogram.png"
# (to make the names comparable to the entries in "files")
name = interim2 + "-spectogram.png"

# Check for length
len(name) # 1017 names in the validation (test) data set