In [4]:
from __future__ import print_function
from __future__ import division

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import f_regression
from scipy.stats import pearsonr

In [41]:
dev_visual_features = np.load('../data/dev_visual_features.npy')
dev_annotations = np.load('../data/dev_annotations.npy')
dev_visual_features = preprocessing.minmax_scale(dev_visual_features, feature_range=(0, 1), axis=0)

variance_selection = VarianceThreshold(threshold=0.01)
bestk_selection_0 = SelectKBest(f_regression, k=100)
bestk_selection_1 = SelectKBest(f_regression, k=100)

dev_visual_features_selected = variance_selection.fit_transform(dev_visual_features)
print(dev_visual_features_selected.shape)
dev_visual_features_selected_0 = bestk_selection_0.fit_transform(dev_visual_features_selected, dev_annotations[:,0])
dev_visual_features_selected_1 = bestk_selection_1.fit_transform(dev_visual_features_selected, dev_annotations[:,1])

test_visual_features = np.load('../data/test_visual_features.npy')
test_annotations = np.load('../data/test_annotations.npy')
test_visual_features = preprocessing.minmax_scale(test_visual_features, feature_range=(0, 1), axis=0)

test_visual_features_selected = variance_selection.transform(test_visual_features)
test_visual_features_selected_0 = bestk_selection_0.transform(test_visual_features_selected)
test_visual_features_selected_1 = bestk_selection_1.transform(test_visual_features_selected)

(5274, 647)


In [42]:
# decision tree
model = DecisionTreeRegressor(max_depth=None, min_samples_split=2, random_state=0)
model.fit(dev_visual_features_selected_0, dev_annotations[:,0])
dt_pred_0 = model.predict(test_visual_features_selected_0)
model.fit(dev_visual_features_selected_1, dev_annotations[:,1])
dt_pred_1 = model.predict(test_visual_features_selected_1)

In [43]:
print(mean_squared_error(test_annotations[:,0], dt_pred_0), pearsonr(test_annotations[:,0], dt_pred_0))
print(mean_squared_error(test_annotations[:,1], dt_pred_1), pearsonr(test_annotations[:,1], dt_pred_1))

0.181803164128 (0.038671295421262068, 0.0034708632846477678)
0.14576998356 (0.10025378891999739, 3.1232004544929717e-14)


In [55]:
# random forest
model = RandomForestRegressor(n_estimators=10, criterion='mse',
                              max_depth=None, min_samples_split=2,
                              min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                              max_features='auto', max_leaf_nodes=None)

In [56]:
model.fit(dev_visual_features_selected_0, dev_annotations[:,0])
rf_pred_0 = model.predict(test_visual_features_selected_0)
model.fit(dev_visual_features_selected_1, dev_annotations[:,1])
rf_pred_1 = model.predict(test_visual_features_selected_1)

In [57]:
print(mean_squared_error(test_annotations[:,0], rf_pred_0), pearsonr(test_annotations[:,0], rf_pred_0))
print(mean_squared_error(test_annotations[:,1], rf_pred_1), pearsonr(test_annotations[:,1], rf_pred_1))

0.0997488590709 (0.097006061621300016, 2.043129143299932e-13)
0.0953962143024 (0.042366559330818364, 0.0013640764297415974)
