In [8]:
import time

import numpy as np
import pandas as pd
from dateutil.parser import parse as dateutil_parser
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler as skStandardScaler

# Root folder of the input CSV files. Keep the trailing slash: the loading
# cell builds file paths by plain string concatenation (DATA_PATH + 'naive/...').
DATA_PATH = './data/'

In [2]:
# Activity tables (play / download / like); the first CSV column is used as
# the row index of each frame.
clean_play = pd.read_csv(DATA_PATH + 'naive/play.csv', index_col=0)
clean_download = pd.read_csv(DATA_PATH + 'naive/download.csv', index_col=0)
clean_like = pd.read_csv(DATA_PATH + 'naive/like.csv', index_col=0)

# Song metadata, read without an index column (it is re-indexed by song_id
# in the feature-building cell).
song = pd.read_csv(DATA_PATH + 'mars_tianchi_songs.csv')

In [3]:
# Index the metadata by song_id so it can be aligned (axis=1 concat) with the
# activity tables later on.
song.index = song.song_id

# Take an explicit copy: a new column is assigned below, and writing into a
# .loc slice may otherwise raise pandas' SettingWithCopyWarning.
all_songs = song.loc[:, ['song_init_plays']].copy()

# 'howold' = (publish_time - 2015-03-01) in whole days. The subtraction is
# publish minus reference, so songs published before that date get negative
# values.
all_songs['howold'] = (
    song['publish_time'].map(lambda x: dateutil_parser(str(x)))
    - dateutil_parser('20150301')
).map(lambda x: x.days)

# One-hot encode the categorical columns as real booleans. astype(bool) is the
# vectorised equivalent of mapping 1 -> True / 0 -> False element by element;
# bool dtype is what myStandardScaler uses to recognise columns it must not scale.
lang = pd.get_dummies(song['Language'], 'lang').astype(bool)
gender = pd.get_dummies(song['Gender'], 'gender').astype(bool)
dummy_features = lang.columns.union(gender.columns)

all_songs = pd.concat([all_songs, lang, gender], axis=1)

In [4]:
class myStandardScaler(skStandardScaler):
    '''
    StandardScaler for pd.DataFrame input that leaves bool columns untouched.

    Only use fit_transform and transform, and only pass pd.DataFrame.

    Columns whose dtype is not bool are standardised by the parent scaler;
    bool (dummy) columns are passed through unchanged. The returned frame
    lists the scaled columns first and the bool columns after them, so the
    column order can differ from the input frame.
    '''

    @staticmethod
    def _split_columns(X):
        '''Return (non-bool columns, bool columns) of X as two DataFrames.'''
        Xnumerical = X[X.columns[X.dtypes != bool]]
        Xdummy = X[X.columns[X.dtypes == bool]]
        return Xnumerical, Xdummy

    def fit_transform(self, X, y=None, **fit_params):
        '''
        Fit the scaler on the non-bool columns of X, then return transform(X).

        y and fit_params are accepted for compatibility with the scikit-learn
        transformer protocol and are forwarded to the parent fit().
        '''
        Xnumerical, _ = self._split_columns(X)
        # Fit through the parent explicitly instead of going through the
        # inherited fit_transform, which would dispatch back into this
        # class's own transform() with an already-split frame.
        super(myStandardScaler, self).fit(Xnumerical, y, **fit_params)
        return self.transform(X)

    def transform(self, X):
        '''
        Scale the non-bool columns of X with the fitted statistics.

        Returns a DataFrame with X's index: scaled columns first, followed by
        the unchanged bool columns.
        '''
        Xnumerical, Xdummy = self._split_columns(X)
        scaledXnumerical = super(myStandardScaler, self).transform(Xnumerical)
        # The parent returns a bare array; restore the index and column labels.
        Xnumerical = pd.DataFrame(scaledXnumerical, index=Xnumerical.index, columns=Xnumerical.columns)
        return pd.concat([Xnumerical, Xdummy], axis=1)

In [5]:
def _suffixed(frame, suffix):
    '''Return frame with suffix appended to the string form of each column label.'''
    return frame.rename(columns=lambda label: str(label) + suffix)


# Training features: every activity column except the last 61.
train_play = _suffixed(clean_play.iloc[:, :-61], '_play')
train_download = _suffixed(clean_download.iloc[:, :-61], '_download')
train_like = _suffixed(clean_like.iloc[:, :-61], '_like')

Xtrain = pd.concat(
    [all_songs, train_play, train_download, train_like],
    axis=1,
).fillna(0)

# Training targets: the last 60 play columns. The empty frame only contributes
# all_songs' index to the concat; rows without play data are filled with 0.
# NOTE(review): features stop before position -61 while targets start at -60,
# so the column at position -61 is in neither set - confirm this one-column
# gap is intentional.
Ytrain = pd.concat(
    [pd.DataFrame(index=all_songs.index), clean_play.iloc[:, -60:]],
    axis=1,
).fillna(0)

# Test features: the same tables with the first 61 columns dropped, i.e. the
# same number of activity columns as the training features.
test_play = _suffixed(clean_play.iloc[:, 61:], '_play')
test_download = _suffixed(clean_download.iloc[:, 61:], '_download')
test_like = _suffixed(clean_like.iloc[:, 61:], '_like')

Xtest = pd.concat(
    [all_songs, test_play, test_download, test_like],
    axis=1,
).fillna(0)

In [6]:
# Standardise the non-bool feature columns. The statistics are fitted on
# Xtrain only and then reused for Xtest.
# NOTE: both names are rebound to their scaled versions, so this cell is not
# idempotent - re-running it without re-running the previous cell scales the
# already-scaled frames again.
scaler = myStandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [13]:
class Searcher(object):
    '''
    Run a hyper-parameter search at construction time and report the results.

    The search object is built and fitted inside __init__. Afterwards
    best_estimator holds the search's best_estimator_ and report() prints the
    top-ranked parameter settings.
    '''

    def __init__(self, model, params, X, Y, method, n_randomized_search=200, cv=5, n_jobs=-1):
        '''
        model: estimator handed to the search object.
        params: parameter grid ('Grid') or parameter distributions ('Randomized').
        X, Y: data forwarded to the search object's fit().
        method: 'Grid' or 'Randomized'; any other value raises ValueError.
        n_randomized_search: n_iter of the randomized search (unused for 'Grid').
        cv, n_jobs: forwarded unchanged to the search object.
        '''
        if method == 'Grid':
            self.__searcher = GridSearchCV(estimator=model,
                                           param_grid=params,
                                           cv=cv, n_jobs=n_jobs, scoring='mean_squared_error')
        elif method == 'Randomized':
            self.__searcher = RandomizedSearchCV(estimator=model,
                                                 param_distributions=params,
                                                 n_iter=n_randomized_search,
                                                 cv=cv, n_jobs=n_jobs, scoring='mean_squared_error')
        else:
            raise ValueError('We only support GridSearch and RandomizedSearch')
        # Single-argument print(...) calls behave the same as a Python 2 print
        # statement and as the Python 3 function, matching the style of report().
        print('Searching...')
        started = time.time()
        self.__searcher.fit(X, Y)
        self.best_estimator = self.__searcher.best_estimator_
        print('Searching finished. Totally take %.2fs' % (time.time() - started))

    def report(self, n_top=10):
        '''
        Print the n_top best entries of the search's grid_scores_.

        Entries are ranked by mean_validation_score, highest first. For each
        one the mean and standard deviation of its cv_validation_scores and
        its parameters are printed.
        '''
        grid_scores = self.__searcher.grid_scores_
        top_scores = sorted(grid_scores,
                            key=lambda entry: entry.mean_validation_score,
                            reverse=True)[:n_top]
        for rank, score in enumerate(top_scores, 1):
            print("=====================================================")
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                np.mean(score.cv_validation_scores), np.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print('')

In [14]:
# Search space for the random-forest grid search:
# 1 * 18 * 3 * 3 * 2 * 3 = 972 parameter combinations.
# min_samples_split starts at 2 rather than 1: a node holding a single sample
# cannot be split, so 1 adds no distinct candidate, and newer scikit-learn
# releases reject an integer value below 2.
grid_params = {
    'n_estimators': [150],
    'max_depth': np.arange(2, 20),
    'min_samples_leaf': [1, 3, 10],
    'min_samples_split': [2, 3, 10],
    'bootstrap': [True, False],
    'max_features': ['log2', 'sqrt', None],
}

In [15]:
# Exhaustive grid search over grid_params; the search itself runs inside
# Searcher.__init__, so this cell blocks until it finishes.
# NOTE(review): the forest is created without a random_state, so repeated runs
# are not guaranteed to give identical results.
grid_searcher = Searcher(
    RandomForestRegressor(),
    params=grid_params,
    X=Xtrain,
    Y=Ytrain,
    method='Grid'
)

Searching...


KeyboardInterrupt: 