In [25]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.semi_supervised import LabelPropagation
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,track_id,track_name,popularity,playlist_id,danceability,energy,acousticness,valence,tempo
0,4f13AaRyukUTWleyKXaRDh,Smoke Filled Room,59,0,0.574,0.731,0.024,0.393,127.925
1,3bNopejTiuFxDpB7CZhaRX,Almost Famous,34,0,0.699,0.521,0.0339,0.0906,124.051
2,0CAfXk7DXMnon4gLudAp7J,Low (feat T-Pain),73,0,0.918,0.609,0.0928,0.304,128.008
3,5v54ZQ0glzbB9XUPTaX0cj,Apollo - Radio Edit,49,0,0.56,0.742,0.015,0.27,128.06
4,5sYzg83Vy4IW4TfE9jwk8g,Invincible - Radio Edit,49,0,0.613,0.775,0.00678,0.385,127.906


## Goals:
- Develop a new algorithm for separating songs
- Prove its effectiveness
- Only use songs with high probability of belonging to label

In [3]:
data = df.values
labeled = pd.read_csv('labeled.csv')
for idx, name in enumerate(df['track_name']):
    
propagated, unlabeled = train_test_split(data, test_size=30)

In [None]:
unlabeled_df = pd.DataFrame(unlabeled)
unlabeled_df.columns = df.columns
unlabeled_df

## Moods:
    0: Depressed
    1: Excited
    2: Aggressive
    3: Relaxing
    4: Cheerful

In [5]:
labels = []
for song in unlabeled_df['track_name']:
    labels.append(input(song + ': '))

Jump (feat. Nelly Furtado): 1
Somebody That I Used To Know: 0
Animal: 2
Going Under: 1
Voices - Markus Maximus TRAP Remix: 2
A Word From Our Sponsor: 2
Dímelo: 2
Piercing Light (feat. Mako): 3
Ghosts 'n' Stuff - Nero Mix: 1
Don't Stop the Party: 4
Boomerang: 1
When I'm Gone - Album Version (Edited): 0
Dicks out for Harambe (Remix) [feat. Baked Alaska]: 1
Good Life: 4
Comin' Up: 2
Siren - Extended Mix: 1
Breakn' A Sweat: 2
Speakerbox (feat. Lafa Taylor): 2
Empathy: 1
In My Head: 3
Dare You - Radio Edit: 4
Smoke Filled Room: 3
Exogenesis: 2
Flesh and Bones: 3
Enigma (feat. GRRL PAL): 2
Say It Ain't So: 0
Whatever It Takes: 4
The Knowing: 3
One Day - New Album Version: 4
Yeah!: 2


In [24]:
temp_df = pd.DataFrame(labels)
temp_df.columns = ['mood']
final_df = pd.concat([unlabeled_df, temp_df], axis=1)
final_df.to_csv('labeled.csv')

In [23]:
fit_X = final_df.iloc[:, 3:8].values
fit_y = final_df['mood'].values
print(propagated[:, 4:])
lp = LabelPropagation(alpha=0.5, max_iter=100)
lp.fit(fit_X, fit_y)
propagated_labels = lp.predict(propagated[:, 4:])
lp.n_iter_, lp.label_distributions_

[[0.316 0.945 0.00895 0.303 189.93099999999998]
 [0.532 0.868 0.0502 0.509 173.94799999999998]
 [0.578 0.893 6.15e-05 0.37 126.01799999999999]
 ..., 
 [0.41200000000000003 0.737 0.0346 0.392 126.324]
 [0.6409999999999999 0.9129999999999999 0.0402 0.342 139.97799999999998]
 [0.495 0.743 0.0007480000000000001 0.14800000000000002 140.047]]


(99, array([[  1.21568789e-115,   3.76670968e-001,   3.80725606e-001,
           4.78518291e-078,   2.42603426e-001],
        [  5.18633653e-001,   1.23392218e-036,   4.81366347e-001,
           7.93287117e-187,   8.68334212e-116],
        [  8.98646347e-183,   0.00000000e+000,   1.00000000e+000,
           1.46610142e-140,   2.76101938e-192],
        [  3.35367078e-148,   1.00000000e+000,   3.05927791e-138,
           9.04309764e-140,   1.72008269e-139],
        [  1.05176460e-010,   3.83173093e-001,   6.16826907e-001,
           8.19765650e-057,   2.02601799e-046],
        [  4.81366347e-001,   1.06807892e-036,   5.18633653e-001,
           6.00567625e-187,   6.94162222e-116],
        [  2.41499882e-046,   4.93076312e-001,   5.06923688e-001,
           2.56747247e-141,   2.79270508e-141],
        [  1.25141379e-042,   2.38788393e-226,   1.46610142e-140,
           1.00000000e+000,   4.85659902e-052],
        [  2.26891703e-046,   5.06923688e-001,   4.93076312e-001,
           2.45780

In [10]:
propagated_df = pd.concat([ pd.DataFrame(propagated), pd.DataFrame(propagated_labels)], axis=1)
propagated_df.columns = final_df.columns
propagated_df

Unnamed: 0,track_id,track_name,popularity,playlist_id,danceability,energy,acousticness,valence,tempo,mood
0,0COqiPhxzoWICwFCS4eZcp,Bring Me To Life,70,26,0.316,0.945,0.00895,0.303,189.931,0
1,2OXK1ShksXD51tOLsE0E7s,Tidal Wave,47,18,0.532,0.868,0.0502,0.509,173.948,0
2,1Thv8uCYzyOFC7PME9J936,The Island - Pt. I (Dawn),57,5,0.578,0.893,6.15e-05,0.37,126.018,0
3,3CzOoYfw7mj5jMHGsJp9TG,I'm So Sick,57,4,0.434,0.961,8.45e-05,0.26,153.969,0
4,0pSIJCxYqZNIZBTAnIXOkv,Lighters,32,39,0.628,0.659,0.328,0.154,89.703,0
5,4c9FIjxXYYEyD9iH02fvbu,Animus Vox,53,10,0.68,0.533,0.161,0.404,99.946,0
6,1EzrEOXmMH3G43AXT1y7pA,I'm Yours,80,12,0.686,0.457,0.595,0.711,150.953,0
7,1wqBQgOX5ahKEDHF7bzsTC,Next,35,28,0.621,0.685,0.734,0.408,112.121,0
8,4JNpKcFjVFYIzt1D95dmi0,Oh Miah,43,12,0.753,0.532,0.267,0.334,140.037,0
9,6TwfdLbaxTKzQi3AgsZNzx,Daylight,51,0,0.658,0.651,0.00344,0.385,119.991,0


In [11]:
propagated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 706 entries, 0 to 705
Data columns (total 10 columns):
track_id        706 non-null object
track_name      706 non-null object
popularity      706 non-null object
playlist_id     706 non-null object
danceability    706 non-null object
energy          706 non-null object
acousticness    706 non-null object
valence         706 non-null object
tempo           706 non-null object
mood            706 non-null object
dtypes: object(10)
memory usage: 55.2+ KB


In [12]:
propagated_df[ propagated_df['mood'] == 2]

Unnamed: 0,track_id,track_name,popularity,playlist_id,danceability,energy,acousticness,valence,tempo,mood
