## Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Print the current working directory; the relative file names used by
# pd.read_csv / to_csv in this notebook are resolved against it.
!pwd

/Users/Charlotte/code/charlottesuaud/birds/raw_data


## Loading the original `metadata.csv`

In [3]:
# Load the source metadata table (path is relative to the working directory)
# and preview its first three rows.
metadata = pd.read_csv("metadata.csv")
metadata.head(3)

Unnamed: 0,Recording_ID,Genus,Specific_epithet,Subspecies,English_name,Recordist,Country,Locality,Latitude,Longitude,...,Other_species23,Other_species24,Other_species25,Other_species26,Other_species27,Other_species28,Other_species29,Other_species30,Species,Path
0,356824,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"doñana visitable, sevilla, andalucía",37.1058,-6.2577,...,,,,,,,,,Sonus naturalis,mp3//Sonus-naturalis-356824.mp3
1,317951,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"arroyo algarbe, hinojos, huelva, andalucía",37.3006,-6.3783,...,,,,,,,,,Sonus naturalis,mp3//Sonus-naturalis-317951.mp3
2,508571,Sonus,naturalis,,Soundscape,Nelson Conceição,Portugal,"Santo Estêvão, Tavira, Faro",37.1554,-7.696,...,,,,,,,,,Sonus naturalis,mp3//Sonus-naturalis-508571.mp3


## Creating targets

In [9]:
def get_targets(metadata):
    """Add an integer ``Target`` label derived from the ``Species`` column.

    Every distinct ``Species`` value is numbered 0, 1, 2, ... in the order
    it first appears in ``metadata``; the number is attached to each row
    through a merge on ``Species``.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table that contains a ``Species`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``metadata`` followed by
        ``Target``. The input frame is left untouched.
    """
    # Lookup table: one row per distinct species, numbered by first appearance.
    distinct_species = metadata['Species'].unique()
    species_to_target = pd.DataFrame({
        'Species': distinct_species,
        'Target': range(len(distinct_species)),
    })

    # Default (inner) merge on Species appends the Target column.
    return metadata.merge(species_to_target, on='Species')

## Modifying `Path` according to new train-test datasets

In [10]:
def get_ogg_path(mp3_path):
    """Turn an mp3 path from the metadata into its ``_tens.ogg`` file name.

    Example: ``"mp3//Sonus-naturalis-356824.mp3"`` becomes
    ``"Sonus-naturalis-356824_tens.ogg"``.

    Parameters
    ----------
    mp3_path : str
        Path as stored in the ``Path`` column.

    Returns
    -------
    str
        The text after the last ``"mp3//"`` marker and before the first
        ``".mp3"`` in that remainder, with ``"_tens.ogg"`` appended.
    """
    # Keep what follows the last "mp3//" (whole string if the marker is absent).
    after_prefix = mp3_path.rpartition("mp3//")[2]
    # Keep what precedes the first ".mp3" (whole string if it is absent).
    stem = after_prefix.partition(".mp3")[0]
    return f"{stem}_tens.ogg"

## Creating `metadata_train` and `metadata_test`

In [11]:
# Train/test split by position in the metadata table.
# The first N_ROWS_TO_SPLIT rows are walked in consecutive blocks of
# ROWS_PER_BLOCK rows; within each block the first TEST_ROWS_PER_BLOCK paths
# go to the test list and the remaining ones to the train list.
# NOTE(review): presumably each block holds the recordings of one species
# (2150 / 43 = 50 blocks) -- confirm that metadata.csv is ordered that way.
ROWS_PER_BLOCK = 43
TEST_ROWS_PER_BLOCK = 10
N_ROWS_TO_SPLIT = 2150

path_list = list(metadata['Path'])

# Fail up front with an explicit message rather than part-way through the loop.
if len(path_list) < N_ROWS_TO_SPLIT:
    raise IndexError(
        f"expected at least {N_ROWS_TO_SPLIT} paths in metadata, "
        f"found {len(path_list)}"
    )

path_list_train = []
path_list_test = []

for block_start in range(0, N_ROWS_TO_SPLIT, ROWS_PER_BLOCK):
    block_paths = path_list[block_start:block_start + ROWS_PER_BLOCK]
    path_list_test.extend(block_paths[:TEST_ROWS_PER_BLOCK])
    path_list_train.extend(block_paths[TEST_ROWS_PER_BLOCK:])

In [12]:
def get_metadata_df(path_list):
    """Build the metadata table for one subset (train or test) of recordings.

    Reads the module-level ``metadata`` frame and relies on ``get_targets``
    and ``get_ogg_path`` defined in this notebook.

    The ``Target`` labels are computed once from the *full* ``metadata``
    table and only then restricted to ``path_list``. Labelling each subset
    on its own would make the Species -> Target numbering depend on which
    species the subset contains and in what order they first appear, so
    train and test labels could disagree.

    Parameters
    ----------
    path_list : list of str
        Values of the original ``Path`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Rows of ``metadata`` whose ``Path`` is in ``path_list``, with a
        fresh 0..n-1 index, ``Path`` rewritten by ``get_ogg_path`` and a
        trailing ``Target`` column.
    """
    # Label the whole table first so every subset shares one numbering.
    metadata_labelled = get_targets(metadata)

    # Select only paths from path_list.
    in_subset = metadata_labelled['Path'].isin(path_list)
    metadata_with_targets = metadata_labelled[in_subset].reset_index(drop=True)

    # Change path to the .ogg file name.
    metadata_with_targets['Path'] = metadata_with_targets['Path'].apply(get_ogg_path)

    return metadata_with_targets

In [13]:
# Build one metadata table per subset from the path lists created above.
metadata_train = get_metadata_df(path_list_train)
metadata_test = get_metadata_df(path_list_test)

In [15]:
# Display the training metadata table.
metadata_train

Unnamed: 0,Recording_ID,Genus,Specific_epithet,Subspecies,English_name,Recordist,Country,Locality,Latitude,Longitude,...,Other_species24,Other_species25,Other_species26,Other_species27,Other_species28,Other_species29,Other_species30,Species,Path,Target
0,447407,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"Córdoba, Córdoba, Andalucía",37.9413,-4.8958,...,,,,,,,,Sonus naturalis,Sonus-naturalis-447407_tens.ogg,0
1,387437,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"el planerón, belchite, zaragoza, aragón",41.2784,-0.7328,...,,,,,,,,Sonus naturalis,Sonus-naturalis-387437_tens.ogg,0
2,383228,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"río guadalmellato, córdoba, andalucía",38.0306,-4.6698,...,,,,,,,,Sonus naturalis,Sonus-naturalis-383228_tens.ogg,0
3,358240,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"doñana visitable, sevilla, andalucía",37.1058,-6.2577,...,,,,,,,,Sonus naturalis,Sonus-naturalis-358240_tens.ogg,0
4,397031,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"arroyo guadairilla, alcalá de guadaira, sevill...",37.3072,-5.8286,...,,,,,,,,Sonus naturalis,Sonus-naturalis-397031_tens.ogg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,518681,Glaucidium,passerinum,,Eurasian Pygmy Owl,Daniele Baroni,Finland,"Nousis, Åbo, Southwest Finland",60.6657,22.2822,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-518681_tens.ogg,49
1646,401901,Glaucidium,passerinum,passerinum,Eurasian Pygmy Owl,Jarek Matusiak,Russian Federation,"Gmina Dubeninki, Nesterovsky District, Kalinin...",54.3490,22.6561,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-401901_tens.ogg,49
1647,201176,Glaucidium,passerinum,passerinum,Eurasian Pygmy Owl,Bram Piot,France,"Haut-Jura, Farges, Ain",46.1716,5.8689,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-201176_tens.ogg,49
1648,408254,Glaucidium,passerinum,,Eurasian Pygmy Owl,Miklos Heincz,Hungary,"Velem, Kőszegi, Vas County",47.3411,16.4901,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-408254_tens.ogg,49


In [16]:
# Display the test metadata table.
metadata_test

Unnamed: 0,Recording_ID,Genus,Specific_epithet,Subspecies,English_name,Recordist,Country,Locality,Latitude,Longitude,...,Other_species24,Other_species25,Other_species26,Other_species27,Other_species28,Other_species29,Other_species30,Species,Path,Target
0,356824,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"doñana visitable, sevilla, andalucía",37.1058,-6.2577,...,,,,,,,,Sonus naturalis,Sonus-naturalis-356824_tens.ogg,0
1,317951,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"arroyo algarbe, hinojos, huelva, andalucía",37.3006,-6.3783,...,,,,,,,,Sonus naturalis,Sonus-naturalis-317951_tens.ogg,0
2,508571,Sonus,naturalis,,Soundscape,Nelson Conceição,Portugal,"Santo Estêvão, Tavira, Faro",37.1554,-7.6960,...,,,,,,,,Sonus naturalis,Sonus-naturalis-508571_tens.ogg,0
3,448534,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"Córdoba, Córdoba, Andalucía",37.9321,-4.9446,...,,,,,,,,Sonus naturalis,Sonus-naturalis-448534_tens.ogg,0
4,373414,Sonus,naturalis,,Soundscape,Lars Lachmann,Germany,"Frohnau, Berlin, north-eastern part",52.6408,13.2956,...,,,,,,,,Sonus naturalis,Sonus-naturalis-373414_tens.ogg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,112798,Glaucidium,passerinum,,Eurasian Pygmy Owl,Patrik Åberg,Sweden,"Tivedens National Park, Västergötland",58.7098,14.6201,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-112798_tens.ogg,49
496,399248,Glaucidium,passerinum,,Eurasian Pygmy Owl,Jarek Matusiak,Poland,"Sobibór, włodawski, lubelskie",51.4443,23.5967,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-399248_tens.ogg,49
497,511909,Glaucidium,passerinum,,Eurasian Pygmy Owl,Baltasar Pinheiro,Sweden,"Uppsala, Uppsala län",59.9824,18.3484,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-511909_tens.ogg,49
498,344245,Glaucidium,passerinum,,Eurasian Pygmy Owl,Patrik Åberg,Sweden,"Laggaretomten, Fröstorp, Tibro, Västergötland",58.4167,14.2167,...,,,,,,,,Glaucidium passerinum,Glaucidium-passerinum-344245_tens.ogg,49


## Creating `y_train` and `y_test` 

In [17]:
def get_target_df(metadata):
    """Return an independent two-column frame with ``Path`` and ``Target``.

    The two columns are selected first and only that selection is copied,
    so the whole input table is not duplicated just to keep two columns.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table containing ``Path`` and ``Target`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of the ``Path`` and ``Target`` columns, in that order, with the
        index of ``metadata``. Changing it does not affect ``metadata``.
    """
    return metadata[['Path', 'Target']].copy()

In [18]:
# Reduce each metadata table to its Path/Target columns.
y_train = get_target_df(metadata_train)
y_test = get_target_df(metadata_test)

In [19]:
# Display the training Path/Target table.
y_train

Unnamed: 0,Path,Target
0,Sonus-naturalis-447407_tens.ogg,0
1,Sonus-naturalis-387437_tens.ogg,0
2,Sonus-naturalis-383228_tens.ogg,0
3,Sonus-naturalis-358240_tens.ogg,0
4,Sonus-naturalis-397031_tens.ogg,0
...,...,...
1645,Glaucidium-passerinum-518681_tens.ogg,49
1646,Glaucidium-passerinum-401901_tens.ogg,49
1647,Glaucidium-passerinum-201176_tens.ogg,49
1648,Glaucidium-passerinum-408254_tens.ogg,49


In [20]:
# Display the test Path/Target table.
y_test

Unnamed: 0,Path,Target
0,Sonus-naturalis-356824_tens.ogg,0
1,Sonus-naturalis-317951_tens.ogg,0
2,Sonus-naturalis-508571_tens.ogg,0
3,Sonus-naturalis-448534_tens.ogg,0
4,Sonus-naturalis-373414_tens.ogg,0
...,...,...
495,Glaucidium-passerinum-112798_tens.ogg,49
496,Glaucidium-passerinum-399248_tens.ogg,49
497,Glaucidium-passerinum-511909_tens.ogg,49
498,Glaucidium-passerinum-344245_tens.ogg,49


## Exporting

In [21]:
# Print the current working directory; the CSV files exported below are
# written with relative names, i.e. into this directory.
!pwd

/Users/Charlotte/code/charlottesuaud/birds/raw_data


In [22]:
# Write each table to a CSV file in the working directory, without the
# index column.
csv_exports = [
    ("metadata_train.csv", metadata_train),
    ("metadata_test.csv", metadata_test),
    ("y_train.csv", y_train),
    ("y_test.csv", y_test),
]

for csv_name, frame in csv_exports:
    frame.to_csv(csv_name, index=False)

In [23]:
# Read the exported training metadata back and preview it as a sanity check.
train_csv = pd.read_csv("metadata_train.csv")
train_csv.head(3)

Unnamed: 0,Recording_ID,Genus,Specific_epithet,Subspecies,English_name,Recordist,Country,Locality,Latitude,Longitude,...,Other_species24,Other_species25,Other_species26,Other_species27,Other_species28,Other_species29,Other_species30,Species,Path,Target
0,447407,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"Córdoba, Córdoba, Andalucía",37.9413,-4.8958,...,,,,,,,,Sonus naturalis,Sonus-naturalis-447407_tens.ogg,0
1,387437,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"el planerón, belchite, zaragoza, aragón",41.2784,-0.7328,...,,,,,,,,Sonus naturalis,Sonus-naturalis-387437_tens.ogg,0
2,383228,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"río guadalmellato, córdoba, andalucía",38.0306,-4.6698,...,,,,,,,,Sonus naturalis,Sonus-naturalis-383228_tens.ogg,0


In [24]:
# Read the exported test metadata back and preview it as a sanity check.
test_csv = pd.read_csv("metadata_test.csv")
test_csv.head(3)

Unnamed: 0,Recording_ID,Genus,Specific_epithet,Subspecies,English_name,Recordist,Country,Locality,Latitude,Longitude,...,Other_species24,Other_species25,Other_species26,Other_species27,Other_species28,Other_species29,Other_species30,Species,Path,Target
0,356824,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"doñana visitable, sevilla, andalucía",37.1058,-6.2577,...,,,,,,,,Sonus naturalis,Sonus-naturalis-356824_tens.ogg,0
1,317951,Sonus,naturalis,,Soundscape,José Carlos Sires,Spain,"arroyo algarbe, hinojos, huelva, andalucía",37.3006,-6.3783,...,,,,,,,,Sonus naturalis,Sonus-naturalis-317951_tens.ogg,0
2,508571,Sonus,naturalis,,Soundscape,Nelson Conceição,Portugal,"Santo Estêvão, Tavira, Faro",37.1554,-7.696,...,,,,,,,,Sonus naturalis,Sonus-naturalis-508571_tens.ogg,0


In [25]:
# Read the exported training Path/Target file back and preview it.
y_train_csv = pd.read_csv("y_train.csv")
y_train_csv.head(3)

Unnamed: 0,Path,Target
0,Sonus-naturalis-447407_tens.ogg,0
1,Sonus-naturalis-387437_tens.ogg,0
2,Sonus-naturalis-383228_tens.ogg,0


In [26]:
# Read the exported test Path/Target file back and preview it.
y_test_csv = pd.read_csv("y_test.csv")
y_test_csv.head(3)

Unnamed: 0,Path,Target
0,Sonus-naturalis-356824_tens.ogg,0
1,Sonus-naturalis-317951_tens.ogg,0
2,Sonus-naturalis-508571_tens.ogg,0
