In [28]:
""" Read data dumps from disk. """

import json
import os

import track

# Directory that holds the JSON dumps
folder = 'dump'

# Accumulates one feature dictionary per dump that parses and converts cleanly
dicts = []

for filename in os.listdir(folder):
    with open('%s/%s' % (folder, filename)) as handle:
        try:
            data = json.load(handle)
        except ValueError:
            # Not valid JSON: report the file name and move on to the next dump
            print(filename)
            continue

    # The file is closed at this point; convert the parsed payload to features
    try:
        dicts.append(track.features(**data))
    except KeyError:
        # track.features raised KeyError for this payload: report the file and skip it
        print(filename)

print("%d files processed." % len(dicts))

6268 files processed.


In [None]:
""" Build a dataframe. """

import numpy as np
import pandas as pd

# Build the frame from the list of dicts, with the columns arranged in the
# order given by track.feature_list
df = pd.DataFrame(dicts)[track.feature_list]

# Cast these columns to integers
to_int = ['Listeners', 'Playcount', 'Duration']
df[to_int] = df[to_int].astype(int)

# Order the rows by Listeners, highest first
df = df.sort(["Listeners"], ascending=False)

# Split tracks into 2 equal-sized categories by row position after the sort:
# the first half (most listeners) is labelled "Hit", the second half "Flop"
class_labels = np.array(["Hit", "Flop"])
rank_halves = pd.qcut(range(len(df)), 2, labels=class_labels)
df.insert(0, 'Class', rank_halves)


In [None]:
""" Is a science demo. """

# Select only some of the features!
feature_list = ["Class", "Name", "Artist", "MBID", "Listeners", "Playcount", "Duration", "ABL - Average Loudness", "ABL - Tempo (BPM)"]

# Remove zero-duration songs. The explicit copy makes dfn an independent frame,
# so adding columns below does not trigger pandas' SettingWithCopyWarning.
dfn = df[feature_list]
dfn = dfn[dfn["Duration"] != 0].copy()

# Convert Tempo to 0/1 indicator columns, one per 20-BPM bin.
# Bins are half-open [low, low + 20) so that they are contiguous: every tempo
# from 70 up to (but excluding) 190 lands in exactly one bin. With strict
# bounds on both sides, values such as 89.5 or exactly 90 matched no bin.
tempo = dfn["ABL - Tempo (BPM)"]
for low in range(70, 190, 20):
    high = low + 20
    dfn["Tempo %d-%d" % (low, high - 1)] = ((tempo >= low) & (tempo < high)).astype(int)

# Drop the raw Tempo column now that it is encoded
dfn = dfn.drop("ABL - Tempo (BPM)", axis=1)

# Convert Duration to 0/1 indicator columns.
# Column labels are 1/1000 of the thresholds (presumably Duration is in
# milliseconds and the labels are seconds -- TODO confirm). Bins are half-open
# and contiguous, with a final open-ended bin for everything from 525000 up.
duration = dfn["Duration"]
for low_s, high_s in [(75, 225), (225, 375), (375, 525)]:
    dfn["Duration %d-%d" % (low_s, high_s - 1)] = ((duration >= low_s * 1000) & (duration < high_s * 1000)).astype(int)
dfn["Duration >524"] = (duration >= 525000).astype(int)

# Classifier below uses a general variable 'fr'
# Here we can assign, fr to df or dfn.
fr = dfn


In [None]:
""" Data. """

from sklearn.cross_validation import train_test_split

# Fixed seed so the train/test split (and everything computed from it) is the
# same on every run of the notebook
RANDOM_STATE = 42

# Skip the first 7 columns as they are mostly metadata
features = fr.columns[7:]

# Feature matrix and integer-coded class labels.
# Only the codes returned by factorize are kept; indexing with [0] avoids
# binding `_`, which IPython uses for the last cell output.
X = fr[features]
y = pd.factorize(fr['Class'])[0]

# Hold out 25% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

In [None]:
""" SVM Classifier. """

from sklearn import svm
from sklearn.cross_validation import cross_val_score

# RBF-kernel support vector classifier, fitted on the training split
clf = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Held-out accuracy is available via clf.score(X_test, y_test); this cell
# displays the cross-validation scores over the full X, y instead
cross_val_score(clf, X, y)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets


In [None]:
""" Random Forest Classifier. """

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_features=0.33, n_jobs=2, n_estimators=84)

clf.fit(X_train, y_train)

# Confusion table on the held-out split: rows are the true classes, columns
# are the predicted classes. Both y_test and the predictions are integer codes,
# translated back to names through class_labels (assumes the code order matches
# the order of class_labels -- TODO confirm).
pd.crosstab(class_labels[y_test], class_labels[clf.predict(X_test)], rownames=['actual'], colnames=['pred'])

In [None]:
""" Cross Validation. """

from sklearn.cross_validation import cross_val_score

# Score the current classifier by cross-validation on the full feature matrix
# X and label vector y (the labels are bound to lowercase `y`)
cross_val_score(clf, X, y)

In [None]:
""" Automatic Feature Selection. """

# Run every selector on the training split only, so the held-out rows play no
# part in choosing features. X_train / y_train are used directly instead of
# rebinding the notebook-wide X and y.
print(X_train.shape)

# NOTE(review): the next two selectors call fit_transform directly on a
# classifier. Newer scikit-learn releases dropped that in favour of
# feature_selection.SelectFromModel -- confirm against the installed version.

# Tree-based selection
from sklearn.ensemble import ExtraTreesClassifier
X_new = ExtraTreesClassifier().fit_transform(X_train, y_train)
print(X_new.shape)

# L1-penalised linear SVM selection
from sklearn.svm import LinearSVC
X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X_train, y_train)
print(X_new.shape)

# Drop features whose variance is below 0.8 * (1 - 0.8)
from sklearn.feature_selection import VarianceThreshold
X_new = VarianceThreshold(threshold=(.8 * (1 - .8))).fit_transform(X_train, y_train)
print(X_new.shape)

# TODO: Find names of columns that have important features