# Getting Started

This notebook installs the required dependencies and prepares the dataset for use with fast.ai.

In [None]:
# Display the version string of the Python interpreter this kernel is running on
import sys
sys.version

In [None]:
# System-level package install; --yes pre-answers apt's confirmation prompt.
# NOTE(review): libsndfile1 is presumably the native library behind the `soundfile` package installed in the next cell - confirm.
!apt-get install --yes libsndfile1

In [None]:
# Install the Python dependencies: fastai, the kaggle client, and the audio packages
# soundfile, torchaudio and librosa.
# %pip (instead of !pip3) installs into the environment of the running kernel, so the
# packages are importable from this notebook even when several Pythons are present.
%pip install fastai kaggle soundfile torchaudio librosa

# Download Dataset

To use Kaggle’s public API, you must first authenticate using an API token. From the site header, click on your user profile picture, then on “My Account” from the dropdown menu. This will take you to your account settings at https://www.kaggle.com/account. Scroll down to the section of the page labelled “API”.

To create a new token, click on the “Create New API Token” button. This will download a fresh authentication token onto your machine.

### Accept the rules

https://www.kaggle.com/competitions/whale-detection-challenge/rules


### Upload your `kaggle.json` to the same folder as this notebook then run the cell below

In [None]:
# Move the uploaded API token into ~/.kaggle/ and restrict it to owner read/write only,
# so other users of the machine cannot read the credentials.
# `&&` stops the chain as soon as one step fails instead of carrying on regardless.
!mkdir -p ~/.kaggle && mv kaggle.json ~/.kaggle/kaggle.json && chmod 600 ~/.kaggle/kaggle.json

### Download Dataset from Kaggle

In [None]:
# Download the competition archive with the kaggle command-line client.
# The extraction cell further down expects it as whale-detection-challenge.zip
# in the current working directory.
!kaggle competitions download -c whale-detection-challenge

### Prepare Dataset for Use

In [None]:
# Install unzip for the extraction cell below. -y pre-answers apt's confirmation prompt,
# which cannot be answered interactively from a notebook cell (same flag as the
# libsndfile1 install earlier in this notebook).
!apt-get install -y unzip

In [None]:
# Unpack the Kaggle archive into two folders:
#   sample_data/  - contents of small_data_sample_revised.zip
#   full_data/    - audio/ (the training clips), labels.csv, and empty whale/ and not_whale/ folders
# Start from a clean slate. data/ is included so that leftovers from an interrupted
# earlier run cannot block the fresh extraction (it is deleted again further down anyway).
!rm -rf data full_data sample_data tmp_data #remove any existing extracted data
!unzip -q whale-detection-challenge.zip -d data/ #unzip main file
!unzip -q data/small_data_sample_revised.zip -d sample_data/ #unzip sample data
!unzip -q data/whale_data.zip -d tmp_data/ #unzip full data
!rm -rf data/ tmp_data/data/test #remove unneeded files. official test data isn't used because we don't have labels
!mkdir -p full_data && mv tmp_data/data/train full_data/audio #move stuff around
!mv tmp_data/data/train.csv full_data/labels.csv #rename labels
!rm -rf tmp_data #remove tmp directory
!mkdir -p full_data/whale full_data/not_whale #create necessary folders

In [None]:
import os

# Absolute locations inside the extracted dataset, all rooted at <current working directory>/full_data.
DATA_ROOT_DIR = os.path.abspath('full_data')

# Label table (CSV) and the folder holding the not-yet-sorted clips.
DATA_META_FILE = os.path.join(DATA_ROOT_DIR, 'labels.csv')
DATA_AUDIO_DIR = os.path.join(DATA_ROOT_DIR, 'audio')

# Per-class target folders.
DATA_WHALE_AUDIO_DIR = os.path.join(DATA_ROOT_DIR, 'whale')
DATA_NOT_WHALE_AUDIO_DIR = os.path.join(DATA_ROOT_DIR, 'not_whale')

In [None]:
import shutil
import pandas as pd

# Label table; the loop below reads its `clip_name` and `label` columns.
# (`os` and the DATA_* constants come from the previous cell.)
df = pd.read_csv(DATA_META_FILE)

# Make sure both class folders exist, so a missing folder is not misreported
# below as an "already moved" clip.
for class_dir in (DATA_WHALE_AUDIO_DIR, DATA_NOT_WHALE_AUDIO_DIR):
    os.makedirs(class_dir, exist_ok=True)

# Sort every clip into its class folder: label 0 -> not_whale/, any other label -> whale/.
for clip_name, label in zip(df['clip_name'], df['label']):
    source_path = os.path.join(DATA_AUDIO_DIR, clip_name)
    destination_dir = DATA_NOT_WHALE_AUDIO_DIR if label == 0 else DATA_WHALE_AUDIO_DIR
    try:
        shutil.move(source_path, os.path.join(destination_dir, clip_name))
    except OSError as err:
        # Best effort: keep going so the cell can be re-run after a partial run.
        # Catching OSError only (not a bare `except`) lets KeyboardInterrupt and
        # programming errors surface, and the real reason is shown next to the path.
        print(f"Could not move file. Possibly already moved. {source_path} ({err})")

# Last expression of the cell, so the preview of the label table is actually displayed.
df.head()

In [None]:
# Drop the unsorted clip folder and the label file; the class of each clip is now
# given by the whale/ or not_whale/ folder it was moved into.
# Note: any clip that could not be moved in the previous cell is deleted here as well.
!rm -rf full_data/audio full_data/labels.csv