In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the annotations file
newdata = pd.read_csv('annotations_final.csv', sep="\t")

In [3]:
# display the top 5 rows
newdata.head(5)

Unnamed: 0,clip_id,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,...,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...


In [4]:
# Get to know the data better
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25863 entries, 0 to 25862
Columns: 190 entries, clip_id to mp3_path
dtypes: int64(189), object(1)
memory usage: 37.5+ MB


In [5]:
# What columns are there?
newdata.columns

Index([u'clip_id', u'no voice', u'singer', u'duet', u'plucking', u'hard rock',
       u'world', u'bongos', u'harpsichord', u'female singing',
       ...
       u'rap', u'metal', u'hip hop', u'quick', u'water', u'baroque', u'women',
       u'fiddle', u'english', u'mp3_path'],
      dtype='object', length=190)

In [6]:
# Extract the clip_id and mp3_path
newdata[["clip_id", "mp3_path"]]

Unnamed: 0,clip_id,mp3_path
0,2,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,f/american_bach_soloists-j_s__bach_solo_cantat...
5,14,c/lvx_nova-lvx_nova-01-contimune-30-59.mp3
6,19,c/lvx_nova-lvx_nova-01-contimune-175-204.mp3
7,21,c/lvx_nova-lvx_nova-01-contimune-233-262.mp3
8,23,c/lvx_nova-lvx_nova-01-contimune-291-320.mp3
9,25,0/american_bach_soloists-j_s__bach__cantatas_v...


In [7]:
# Previous command extracted it as a DataFrame., We need it as a matrix to do analysis on
# Extract clip_id and mp3_path as a matrix
clip_id, mp3_path = newdata[["clip_id", "mp3_path"]].as_matrix()[:, 0], newdata[["clip_id", "mp3_path"]].as_matrix()[:,1]

In [8]:
# Some of the tags in the dataset are really close to each other. Lets merge them together
synonyms = [['beat', 'beats'],
            ['chant', 'chanting'],
            ['choir', 'choral'],
            ['classical', 'clasical', 'classic'],
            ['drum', 'drums'],
            ['electro', 'electronic', 'electronica', 'electric'],
            ['fast', 'fast beat', 'quick'],
            ['female', 'female singer', 'female singing', 'female vocals', 'female vocal', 'female voice', 'woman', 'woman singing', 'women'],
            ['flute', 'flutes'],
            ['guitar', 'guitars'],
            ['hard', 'hard rock'],
            ['harpsichord', 'harpsicord'],
            ['heavy', 'heavy metal', 'metal'],
            ['horn', 'horns'],
            ['india', 'indian'],
            ['jazz', 'jazzy'],
            ['male', 'male singer', 'male vocal', 'male vocals', 'male voice', 'man', 'man singing', 'men'],
            ['no beat', 'no drums'],
            ['no singer', 'no singing', 'no vocal','no vocals', 'no voice', 'no voices', 'instrumental'],
            ['opera', 'operatic'],
            ['orchestra', 'orchestral'],
            ['quiet', 'silence'],
            ['singer', 'singing'],
            ['space', 'spacey'],
            ['string', 'strings'],
            ['synth', 'synthesizer'],
            ['violin', 'violins'],
            ['vocal', 'vocals', 'voice', 'voices'],
            ['strange', 'weird']]

In [9]:
# Merge the synonyms and drop all other columns than the first one
"""
Example:
Merge 'beat', "beats", and save it to beat.
Merge 'Classical', 'clasical', 'classic' and save it to 'classical'
"""

for synonym_list in synonyms:
    newdata[synonym_list[0]] = newdata[synonym_list].max(axis=1)
    newdata.drop(synonym_list[1:], axis=1, inplace=True)

In [10]:
# Did it work?
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25863 entries, 0 to 25862
Columns: 136 entries, clip_id to mp3_path
dtypes: int64(135), object(1)
memory usage: 26.8+ MB


In [11]:
# Lets view it
newdata.head()

Unnamed: 0,clip_id,singer,duet,plucking,world,bongos,harpsichord,sitar,chorus,female opera,...,soft,noise,choir,rap,hip hop,water,baroque,fiddle,english,mp3_path
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...


In [12]:
# Drop the mp3_path tag from the dataframe
newdata.drop('mp3_path', axis=1, inplace=True)
# Save the column names into variable
data = newdata.sum(axis=0)

In [13]:
# find the distribution of tags
data

clip_id         736770326
singer               1308
duet                   74
plucking               69
world                  65
bongos                 52
harpsichord          1123
sitar                 926
chorus                241
female opera           85
clarinet               49
heavy                 583
woodwind               38
funky                 195
no strings             56
chimes                 77
foreign               275
no piano              324
classical            4358
female               2067
soft rock              54
eerie                  65
jazz                  555
guitar               4861
quiet                1072
no beat               242
banjo                 194
solo                  826
folk                  243
wind                  131
                  ...    
blues                 181
vocal                2813
no singer            2550
scary                  49
india                1402
rock                 2371
dance                 649
piano solo  

In [14]:
# Sort the column names
data.sort_values(axis=0, inplace=True)

In [15]:
# Find the top tags from the dataframe
topindex, topvalues = list(data.index[84:]), data.values[84:]
del(topindex[-1])
topvalues = np.delete(topvalues, -1)

In [16]:
# Get the top column names
topindex

['no beat',
 'folk',
 'trance',
 'foreign',
 'orchestra',
 'baroque',
 'chant',
 'hard',
 'no piano',
 'modern',
 'bass',
 'eastern',
 'country',
 'jazz',
 'cello',
 'heavy',
 'harp',
 'strange',
 'dance',
 'new age',
 'choir',
 'solo',
 'sitar',
 'soft',
 'pop',
 'flute',
 'quiet',
 'loud',
 'harpsichord',
 'opera',
 'singer',
 'india',
 'synth',
 'violin',
 'ambient',
 'piano',
 'female',
 'beat',
 'male',
 'fast',
 'rock',
 'no singer',
 'drum',
 'electro',
 'vocal',
 'string',
 'techno',
 'slow',
 'classical',
 'guitar']

In [17]:
# Get only the top column values
topvalues

array([ 242,  243,  253,  275,  296,  297,  312,  323,  324,  327,  337,
        406,  541,  555,  575,  583,  623,  640,  649,  650,  791,  826,
        926,  985,  995, 1035, 1072, 1086, 1123, 1298, 1308, 1402, 1734,
       1907, 1956, 2056, 2067, 2123, 2169, 2331, 2371, 2550, 2698, 2764,
       2813, 2842, 2954, 3547, 4358, 4861])

In [18]:
# Get a list of columns to remove
rem_cols = data.index[:84]

In [19]:
# Cross-check: How many columns are we removing?
len(rem_cols)

84

In [20]:
# Drop the columns that needs to be removed
newdata.drop(rem_cols, axis=1, inplace=True)

In [21]:
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25863 entries, 0 to 25862
Data columns (total 51 columns):
clip_id        25863 non-null int64
singer         25863 non-null int64
harpsichord    25863 non-null int64
sitar          25863 non-null int64
heavy          25863 non-null int64
foreign        25863 non-null int64
no piano       25863 non-null int64
classical      25863 non-null int64
female         25863 non-null int64
jazz           25863 non-null int64
guitar         25863 non-null int64
quiet          25863 non-null int64
no beat        25863 non-null int64
solo           25863 non-null int64
folk           25863 non-null int64
ambient        25863 non-null int64
new age        25863 non-null int64
synth          25863 non-null int64
drum           25863 non-null int64
bass           25863 non-null int64
loud           25863 non-null int64
string         25863 non-null int64
opera          25863 non-null int64
fast           25863 non-null int64
country        25863 non-nu

In [22]:
# Create a backuo of the dataframe
backup_newdata = newdata

In [23]:
# Use this to revive the dataframe newdata = backup_newdata
# Shuffle the dataframe
from sklearn.utils import shuffle
newdata = shuffle(newdata)

In [24]:
#This method may be used to shuffle data 
# By setting frac=1, you ll shuffle every single row randomly,newdata = newdata.sample(frac=1).reset_index(drop=True)
newdata.reset_index(drop=True)

Unnamed: 0,clip_id,singer,harpsichord,sitar,heavy,foreign,no piano,classical,female,jazz,...,india,rock,dance,cello,techno,flute,beat,soft,choir,baroque
0,27286,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2087,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2,23619,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,30620,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1218,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,33919,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,49622,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,30723,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,17711,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,27276,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# One final check
newdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25863 entries, 12413 to 8214
Data columns (total 51 columns):
clip_id        25863 non-null int64
singer         25863 non-null int64
harpsichord    25863 non-null int64
sitar          25863 non-null int64
heavy          25863 non-null int64
foreign        25863 non-null int64
no piano       25863 non-null int64
classical      25863 non-null int64
female         25863 non-null int64
jazz           25863 non-null int64
guitar         25863 non-null int64
quiet          25863 non-null int64
no beat        25863 non-null int64
solo           25863 non-null int64
folk           25863 non-null int64
ambient        25863 non-null int64
new age        25863 non-null int64
synth          25863 non-null int64
drum           25863 non-null int64
bass           25863 non-null int64
loud           25863 non-null int64
string         25863 non-null int64
opera          25863 non-null int64
fast           25863 non-null int64
country        25863 non

In [26]:
# Let us save the final columns
final_columns_names = list(newdata.columns)

In [27]:
# Do it only once to delete the clip_id column
del(final_columns_names[0])

In [28]:
# Verified
final_columns_names

['singer',
 'harpsichord',
 'sitar',
 'heavy',
 'foreign',
 'no piano',
 'classical',
 'female',
 'jazz',
 'guitar',
 'quiet',
 'no beat',
 'solo',
 'folk',
 'ambient',
 'new age',
 'synth',
 'drum',
 'bass',
 'loud',
 'string',
 'opera',
 'fast',
 'country',
 'violin',
 'electro',
 'trance',
 'chant',
 'strange',
 'modern',
 'hard',
 'harp',
 'pop',
 'piano',
 'orchestra',
 'eastern',
 'slow',
 'male',
 'vocal',
 'no singer',
 'india',
 'rock',
 'dance',
 'cello',
 'techno',
 'flute',
 'beat',
 'soft',
 'choir',
 'baroque']

In [30]:
# Create the file which is to be saved off (you could skip and apply similar steps in the previous dataframe)
# Here, binary 0s and 1s from each column is changed to False and True by using == operator on the dataframe
final_matrix = pd.concat([newdata['clip_id'], newdata[final_columns_names] == 1], axis=1)