In [19]:
import pandas as pd
import numpy as np

In [20]:
from zipfile import ZipFile

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
from sklearn.datasets import dump_svmlight_file
from sklearn import cross_validation

In [23]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn import cross_validation
from sklearn import grid_search

In [24]:
def gen_store_feat(log=True, path='./data/store.csv'):
    """Load the per-store table and reduce it to the columns used as features.

    Parameters
    ----------
    log : bool, default True
        When true, missing ``CompetitionDistance`` values are replaced by the
        column mean and the column is then replaced by the integer part of its
        natural logarithm. When false the column is returned as read,
        including any missing values.
    path : str, default './data/store.csv'
        Location of the store CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Store``, ``StoreType``, ``Assortment``,
        ``CompetitionDistance``, ``CompetitionOpenSinceMonth`` and ``Promo2``.
        ``CompetitionOpenSinceMonth`` no longer holds a month: it is a boolean
        flag that is True where the original value was missing.
    """
    df_store = pd.read_csv(path)
    if log:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on an attribute-accessed Series: whether that in-place call reaches
        # the parent frame depends on the pandas version.
        distance = df_store['CompetitionDistance']
        distance = distance.fillna(distance.mean())
        # astype(int) truncates toward zero, like int(np.log(n)) per element.
        # NOTE(review): assumes distances are strictly positive (log(0) is -inf).
        df_store['CompetitionDistance'] = np.log(distance).astype(int)
    # Plain column assignment replaces the column (and its dtype) outright.
    df_store['CompetitionOpenSinceMonth'] = df_store['CompetitionOpenSinceMonth'].isnull()
    feature_cols = ['Store', 'StoreType', 'Assortment', 'CompetitionDistance',
                    'CompetitionOpenSinceMonth', 'Promo2']
    return df_store[feature_cols]

In [25]:
# Shared vectorizer: fitted inside gen_X_train and reused by gen_X_pred to encode new rows.
dv = DictVectorizer()

In [26]:
def gen_X_train(df_store):
    """Build the training feature matrix and target from ``./data/train.zip``.

    Rows with ``Sales == 0`` are dropped, the remaining rows are joined with
    the store features on ``Store``, every feature value is converted to a
    string, and the notebook-level ``dv`` vectorizer is fitted on the
    resulting records.

    Parameters
    ----------
    df_store : pandas.DataFrame
        Store-level features containing a ``Store`` column to join on.

    Returns
    -------
    X
        Whatever ``dv.fit_transform`` returns for the records.
    y : pandas.Series
        The ``Sales`` column of the joined frame, row-aligned with ``X``.
    """
    # Context managers close both the archive and the member handle.
    with ZipFile('./data/train.zip', 'r') as archive:
        with archive.open('train.csv') as handle:
            df_train = pd.read_csv(handle)
    df_train = df_train[df_train.Sales != 0]
    df_train = df_train[['Store', 'DayOfWeek', 'Date', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Sales']]
    # Left join: keep every training row exactly once. An outer join would also
    # emit a row for each store that has no training data, with NaN Sales.
    df = pd.merge(df_train, df_store, on='Store', how='left')
    y = df.Sales
    df = df.drop('Sales', axis=1)
    # Stringify every value before building the record dicts for the vectorizer.
    df = df.applymap(str)
    X = dv.fit_transform(df.to_dict('records'))
    return X, y

In [27]:
def gen_X_pred(store_feat=None):
    """Build the prediction feature matrix from ``./data/test.csv``.

    The test rows are joined with the store features on ``Store``, ordered by
    ``Id``, stringified and encoded with the notebook-level ``dv`` vectorizer,
    which must already have been fitted.

    Parameters
    ----------
    store_feat : pandas.DataFrame, optional
        Store-level features containing a ``Store`` column. Defaults to the
        notebook-level ``df_store`` frame.

    Returns
    -------
    X_pred
        Whatever ``dv.transform`` returns; one row per test ``Id``, in
        ascending ``Id`` order.
    """
    if store_feat is None:
        store_feat = df_store  # fall back to the notebook-level frame
    df_test = pd.read_csv('./data/test.csv')
    df_test = df_test[['Id', 'Store', 'DayOfWeek', 'Date', 'Promo', 'StateHoliday', 'SchoolHoliday']]
    # Left join so no test row is dropped and the output keeps one row per Id.
    df = pd.merge(df_test, store_feat, on='Store', how='left')
    # Moving Id into the index and sorting on it orders the rows by Id and, at
    # the same time, keeps Id out of the feature columns. This avoids
    # DataFrame.sort, which newer pandas releases no longer provide.
    df = df.set_index('Id').sort_index()
    X_pred = dv.transform(df.applymap(str).to_dict('records'))
    return X_pred

In [28]:
def RMSPE(y_pred, y):
    """Root mean squared percentage error of ``y_pred`` against ``y``.

    Computes ``sqrt(mean((1 - y_pred / y) ** 2))`` over the entries whose
    actual value ``y`` is non-zero. Entries with ``y == 0`` are skipped because
    the percentage error is undefined there and would otherwise turn the whole
    score into inf/nan.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y : array-like
        Actual values; must be broadcastable against ``y_pred``.

    Returns
    -------
    float
        The error, or nan when no entry has a non-zero actual value.
    """
    predicted, actual = np.broadcast_arrays(
        np.asarray(y_pred, dtype=np.float64),
        np.asarray(y, dtype=np.float64),
    )
    keep = actual != 0
    if not keep.any():
        return float('nan')
    pct_err = 1 - predicted[keep] / actual[keep]
    return np.sqrt(np.power(pct_err, 2.).mean())

In [None]:
# Build the store-level features first, then derive the training matrix and target from them.
df_store = gen_store_feat(log=True)
X, y = gen_X_train(df_store)

In [None]:
# Single 80/20 shuffle split of the training data, written out in svmlight format.
SPLIT_SEED = 0  # fixed seed so the hold-out set is the same on every run
splitter = cross_validation.ShuffleSplit(y.shape[0], n_iter=1, train_size=0.8,
                                         test_size=0.2, random_state=SPLIT_SEED)
tr, te = next(iter(splitter))
# The splitter yields row positions, so index the target positionally.
X_train, y_train = X[tr], y.iloc[tr]
X_test, y_test = X[te], y.iloc[te]
dump_svmlight_file(X_train, y_train, 'train_0.8.libfm')
dump_svmlight_file(X_test, y_test, 'test_0.2.libfm')

In [12]:
# Train the factorization machine on the split written by the previous cell and
# score it on the matching hold-out file, so the predictions line up with y_test.
# NOTE(review): FM is not imported in any visible cell — confirm where it comes from.
fm = FM(task='r', train='train_0.8.libfm', test='test_0.2.libfm', dim='1,1,12', save_model='fm')
fm.fit()
fm.predict(load_model='fm', test='./test_0.2.libfm', out='y_pred')
# The prediction file is read as one value per row; flatten it to a 1-D array.
y_pred = pd.read_csv('./y_pred', header=None).values.ravel()
print('RMSPE :',RMSPE(y_pred,y_test))