In [1]:
""" Read data dumps from disk. """

import json
import os

import track

# Directory that holds the JSON dump files
folder = 'dump'

# Collected feature dictionaries, one per dump that parsed successfully
dicts = []

for file in os.listdir(folder):
    path = '%s/%s' % (folder, file)
    with open(path) as inp:
        try:
            data = json.load(inp)
        except ValueError:
            # Not valid JSON: report the offending file name and skip it
            print(file)
            continue
    # The parsed payload is expanded into keyword arguments for track.features
    dicts.append(track.features(**data))

print("%d files processed." % len(dicts))

1517 files processed.


In [2]:
""" Build a dataframe. """

import numpy as np
import pandas as pd

# Convert list of dicts to a dataframe
df = pd.DataFrame(dicts)

# Reorder the columns
df = df[track.feature_list]

# Sort by Listeners, most-listened tracks first.
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement. Rebinding the result (instead of inplace=True) keeps the
# cell free of in-place mutation.
df = df.sort_values("Listeners", ascending=False)

# Split tracks into 3 categories: "Hit", "Avg", "Flop"
# Top 33% of tracks are Hits, next 33% are Avg etc.
class_labels = np.array(["Hit", "Avg", "Flop"])
# qcut runs over the row positions of the sorted frame (not over the
# listener counts), so the three bins are rank-based and equally sized.
# The result is a plain Categorical, which insert() places positionally,
# so it lines up with the sorted rows even though the index is not reset.
df.insert(0, 'class', pd.qcut(range(len(df)), 3, labels=class_labels))

In [None]:
""" Data. """

# train_test_split lives in sklearn.model_selection on current scikit-learn;
# the older sklearn.cross_validation module is kept as a fallback so the
# cell still runs on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Skip the first 7 columns as they are mostly metadata
features = df.columns[7:]

# Data
X = df[features]
# factorize() turns the class labels into integer codes, numbered in order
# of first appearance in df
y, _ = pd.factorize(df['class'])

# A fixed seed makes the 75/25 split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

In [None]:
""" SVM Classifier. """

from sklearn import svm

# Linear-kernel support vector classifier with penalty parameter C=0.1.
# fit() hands back the estimator itself, so building and training chain.
clf = svm.SVC(C=0.1, kernel='linear').fit(X_train, y_train)

# Score on the held-out split; as the last expression it is the cell output
clf.score(X_test, y_test)

In [None]:
""" Random Forest Classifier. """

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_features=0.33, n_jobs=2, n_estimators=84)

clf.fit(X_train, y_train)

# Confusion table on the held-out split.
# The true labels are taken from y_test (no `test` frame is ever defined in
# this notebook) and are passed first, so rows really are "actual" and
# columns really are "pred".
# NOTE(review): indexing class_labels with the integer codes assumes the
# codes in y follow the order "Hit", "Avg", "Flop" -- this holds while df is
# sorted by Listeners descending before factorize(); confirm if that changes.
pd.crosstab(class_labels[y_test], class_labels[clf.predict(X_test)],
            rownames=['actual'], colnames=['pred'])

In [None]:
""" Cross Validation. """

# cross_val_score lives in sklearn.model_selection on current scikit-learn;
# fall back to the older module path for installations that predate the move.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# The label vector is the lowercase `y` built alongside X; an uppercase `Y`
# is never defined. Scores are shown as the cell output.
cross_val_score(clf, X, y)

In [None]:
""" Automatic Feature Selection. """

# Select features on the training split only. No `train` frame exists in
# this notebook; the split produced by train_test_split is X_train / y_train,
# and y_train already holds the integer class codes.
# Note: this rebinds X and y, which until here referred to the full data set.
X, y = X_train, y_train
print(X.shape)

# NOTE(review): calling fit_transform() directly on a classifier to select
# features was dropped from later scikit-learn releases in favour of
# sklearn.feature_selection.SelectFromModel -- confirm against the installed
# version before relying on the two model-based selectors below.
from sklearn.ensemble import ExtraTreesClassifier
X_new = ExtraTreesClassifier().fit_transform(X, y)
print(X_new.shape)

from sklearn.svm import LinearSVC
X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, y)
print(X_new.shape)

# Threshold .8 * (1 - .8) = 0.16: columns whose variance is not above it
# are dropped
from sklearn.feature_selection import VarianceThreshold
X_new = VarianceThreshold(threshold=(.8 * (1 - .8))).fit_transform(X, y)
print(X_new.shape)

# TODO: Find names of columns that have important features