# Initialisation
 - nb = 0 => extraire info
 - 1ere substance sortie
 - XGBoost

KeyError: 'substances'

In [27]:
import pandas as pd
import numpy as np
from time import time
import pickle

from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, RandomForestRegressor
from sklearn.preprocessing import normalize, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline, make_pipeline

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Folder holding the competition CSV files and the prepared artefacts
data_dir = 'D:/Dev/dataScience/kaggle/challenge_25_data/data/'

# Packaging / dosage-form names shared by the 'libelle_*' and 'nb_*' columns
_PACKAGING = ('plaquette', 'ampoule', 'flacon', 'tube', 'stylo', 'seringue',
              'pilulier', 'sachet', 'comprime', 'gelule', 'film', 'poche',
              'capsule')

# Numeric features: one 'libelle_<name>' and one 'nb_<name>' column per
# packaging name, plus the 'nb_ml' column
feat_num = (list('libelle_' + name for name in _PACKAGING)
            + list('nb_' + name for name in _PACKAGING)
            + ['nb_ml'])

# Date features
feat_dates = [
    'date declar annee',
    'date amm annee',
]

# Categorical features
feat_cat = [
    'statut',
    'etat commerc',
    'agrement col',
    'tx rembours',
    'voies admin',
    'statut admin',
    'type proc',
]

# Free-text features
feat_text = [
    'libelle',
    'titulaires',
    'substances',
    'forme pharma',
]

def mape_error(log_y_true, log_y_pred):
    """Mean absolute percentage error computed on the exponentiated inputs.

    Both arguments are passed through ``np.exp`` before the relative error is
    taken, so they are expected to be on a log scale.

    Returns:
        The mean of ``|true - pred| / true`` multiplied by 100.
    """
    actual = np.exp(log_y_true)
    predicted = np.exp(log_y_pred)
    relative_gap = np.abs((actual - predicted) / actual)
    return np.mean(relative_gap) * 100

# Wrap mape_error as a scikit-learn scorer; greater_is_better=False marks it as
# a loss (lower values are better).
mape_scorer = make_scorer(score_func=mape_error, greater_is_better=False)

# Import des données
 - ```train``` : 8564 médicaments / 41 variables
 - ```test``` : 3671 médicaments

In [24]:
# Seed NumPy's global random generator so runs are repeatable
seed = 43
np.random.seed(seed)

# Read the train and test files (';'-separated, UTF-8)
train, test = (
    pd.read_csv(data_dir + csv_name, sep=';', encoding='utf-8')
    for csv_name in ('boites_medicaments_train.csv', 'boites_medicaments_test.csv')
)

# Préparation des données

### Encodage des features catégorielles

Les algorithmes de machine learning s'attendent à avoir en entrée des nombres, et non pas des chaînes de caractères. C'est pourquoi nous transformons les features catégorielles en nombres, à l'aide de LabelEncoder()

In [25]:
# Label-encode every categorical feature in place.
# Each encoder is fitted on the train and test values together, so a value that
# only occurs in the test file still gets a code.
for c in feat_cat:
    le = LabelEncoder()
    # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0
    le.fit(pd.concat([train[c], test[c]]))
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

 ### Split des catégories multi-valeurs
 Les catégories dont les valeurs sont des listes d'éléments sont développées sous forme de n catégories binaires (n étant le nombre d'éléments distincts pour la catégorie dans le dataset) 

In [26]:
def expandedOHE(train_df, test_df, colName):
    """Expand a comma-separated multi-value column into 0/1 indicator columns.

    The distinct categories are collected from ``train_df[colName]`` only. For
    each of them a column named after the category is added, in place, to both
    ``train_df`` and ``test_df``; it holds 1 when the row's list contains exactly
    that category (after stripping surrounding whitespace) and 0 otherwise.

    Matching is done on whole list items, not on substrings of the raw text, so
    a category such as "FER" is not flagged for a row that only lists "FERREUX".
    Missing values and empty items (e.g. from "a,,b") yield no category.

    Caveat: indicator columns are named after the category itself, so a
    category whose name equals an existing column overwrites that column.

    Args:
        train_df: DataFrame that defines the categories; modified in place.
        test_df: DataFrame receiving the same indicator columns; modified in place.
        colName: name of the column holding comma-separated values.

    Returns:
        list of str: the distinct category names, which are also the names of
        the added columns.
    """
    def _items(cell):
        # A missing cell contributes no category instead of raising
        if pd.isnull(cell):
            return []
        stripped = (piece.strip() for piece in cell.split(','))
        return [piece for piece in stripped if piece]

    train_items = [_items(cell) for cell in train_df[colName]]
    test_items = [_items(cell) for cell in test_df[colName]]

    # Collect categories position by position (all first items, then all
    # second items, ...), which is the order the previous
    # apply(pd.Series).unstack() implementation produced.
    distinctCategs = []
    seen = set()
    depth = max([len(items) for items in train_items] or [0])
    for position in range(depth):
        for items in train_items:
            if position < len(items) and items[position] not in seen:
                seen.add(items[position])
                distinctCategs.append(items[position])

    # Sets make the per-category membership test cheap and exact
    train_sets = [set(items) for items in train_items]
    test_sets = [set(items) for items in test_items]
    for categorie in distinctCategs:
        train_df[categorie] = [1 if categorie in items else 0 for items in train_sets]
        test_df[categorie] = [1 if categorie in items else 0 for items in test_sets]
    return distinctCategs

# Splitting the "substances" category lowers the error from 45% to 32%
# (author's measurement; presumably the MAPE tracked in this notebook).
feat_substances = expandedOHE(train, test, 'substances') 
# Splitting the "voies admin" category degraded the estimate, so it stays disabled.
# NOTE(review): the disabled call below still names 'substances' -- presumably it
# was meant to be 'voies admin'; confirm before re-enabling. Also note that
# 'voies admin' is part of feat_cat and is label-encoded by the preceding cell.
# feat_substances = expandedOHE(train, test, 'substances') 



### Quantites et prix unitaire (prix/quantite)

In [28]:
# Natural log of the price column
train['logprix'] = np.log(train['prix'])

In [None]:
# Constant column of ones; including it in the row-wise maxima below guarantees
# that each factor of the quantity is at least 1.
train['cst1'] = 1
test['cst1'] = 1

# Count columns for the first factor (each listed once; the earlier repeated
# names did not change the maximum)
_CONTAINER_COUNTS = ['nb_plaquette', 'nb_ampoule', 'nb_flacon', 'nb_tube',
                     'nb_stylo', 'nb_seringue', 'nb_pilulier', 'nb_sachet',
                     'nb_film', 'nb_poche', 'cst1']
# Count columns for the second factor
_UNIT_COUNTS = ['nb_comprime', 'nb_gelule', 'nb_capsule', 'nb_ml', 'cst1']


def _total_quantity(df):
    """Return, per row, max(container counts) * max(unit counts).

    Shared by the train and test frames so both use the same column lists.
    """
    return df[_CONTAINER_COUNTS].max(axis=1) * df[_UNIT_COUNTS].max(axis=1)


train['nb'] = _total_quantity(train)
test['nb'] = _total_quantity(test)

# Unit price = price / quantity, as stated in the section heading.
# The previous code multiplied price by quantity instead.
train['prixunit'] = train['prix'] / train['nb']
train['logprixunit'] = np.log(train['prixunit'])

### Enregistrement

In [29]:
# Write the prepared train and test frames next to the raw files
for prepared, csv_name in ((train, 'train_df.csv'), (test, 'test_df.csv')):
    prepared.to_csv(data_dir + csv_name, sep=';', encoding='utf-8')

# Store the list returned by expandedOHE so the modelling part can reload it
with open(data_dir + 'substances.pkl', 'wb') as substances_file:
    pickle.dump(feat_substances, substances_file)

# Creation d'un modele

### Initialisation 
(imports + chargement des datasets préparés)

In [29]:
import pandas as pd
import numpy as np
from time import time
import pickle

from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, RandomForestRegressor
from sklearn.preprocessing import normalize, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline, make_pipeline

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Folder holding the prepared datasets and the pickled substance list
data_dir = 'D:/Dev/dataScience/kaggle/challenge_25_data/data/'

# Packaging / dosage-form names shared by the 'libelle_*' and 'nb_*' columns
_PACKAGING = ('plaquette', 'ampoule', 'flacon', 'tube', 'stylo', 'seringue',
              'pilulier', 'sachet', 'comprime', 'gelule', 'film', 'poche',
              'capsule')

# Numeric features: one 'libelle_<name>' and one 'nb_<name>' column per
# packaging name, plus the 'nb_ml' column
feat_num = (list('libelle_' + name for name in _PACKAGING)
            + list('nb_' + name for name in _PACKAGING)
            + ['nb_ml'])

# Date features
feat_dates = [
    'date declar annee',
    'date amm annee',
]

# Categorical features
feat_cat = [
    'statut',
    'etat commerc',
    'agrement col',
    'tx rembours',
    'voies admin',
    'statut admin',
    'type proc',
]

# Free-text features
feat_text = [
    'libelle',
    'titulaires',
    'substances',
    'forme pharma',
]

def mape_error(log_y_true, log_y_pred):
    """Return the MAPE, in percent, between two log-scale value arrays.

    Each input is exponentiated with ``np.exp`` first; the result is the mean
    of ``|true - pred| / true`` over all elements, times 100.
    """
    truth = np.exp(log_y_true)
    estimate = np.exp(log_y_pred)
    ratio = np.abs((truth - estimate) / truth)
    return np.mean(ratio) * 100

def grid_search_mape(estimator, X, Y, parameters, nb_folds):
    """Run a cross-validated grid search scored with ``mape_scorer`` and print a report.

    Args:
        estimator: scikit-learn estimator or pipeline to tune.
        X: features passed to ``GridSearchCV.fit``.
        Y: target passed to ``GridSearchCV.fit``. ``mape_scorer`` wraps
            ``mape_error``, which exponentiates its inputs, so a log-scale
            target is expected.
        parameters: dict mapping parameter names to lists of candidate values.
        nb_folds: forwarded to ``GridSearchCV`` as ``cv``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print("Performing grid search...")
    grid_search = GridSearchCV(estimator, parameters, scoring=mape_scorer, cv=nb_folds)

    # Describe the estimator that was passed in. The previous code read the
    # notebook-global `pipeline`, which is not necessarily the same object.
    steps = getattr(estimator, 'steps', None)
    if steps is not None:
        step_names = [name for name, _ in steps]
    else:
        step_names = [type(estimator).__name__]
    print("pipeline:   " + str(step_names))

    t0 = time()
    grid_search.fit(X, Y)
    print("=> done in %0.3fs" % (time() - t0))

    # mape_scorer is built with greater_is_better=False, so scikit-learn scores
    # are the negated MAPE; best_score_ is printed as the raw score.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    res = grid_search.cv_results_
    print("Results by parameters:")
    for p, m, s in zip(res['params'], res['mean_test_score'], res['std_test_score']):
        print(p)
        # Flip the sign back so the value printed under "Mape" is the actual MAPE
        print('\tMape: %.2f\t(std: %.2f)' % (-m, s))
    return grid_search

# scikit-learn scorer around mape_error, flagged as a loss (lower is better)
mape_scorer = make_scorer(mape_error, greater_is_better=False)

# Reload the prepared datasets written by the preparation part
train, test = (
    pd.read_csv(data_dir + csv_name, sep=';', encoding='utf-8')
    for csv_name in ('train_df.csv', 'test_df.csv')
)

# Reload the pickled list of substance column names.
# The file is produced by this notebook; do not point this at untrusted pickles.
with open(data_dir + 'substances.pkl', 'rb') as substances_file:
    feat_substances = pickle.load(substances_file)

### Regressor

In [None]:
# Model inputs: numeric + categorical features, plus the substance indicator columns
feats = feat_num + feat_cat + feat_substances

X = train[feats]
Y = train['logprix']

# Other ensembles imported above can be swapped in here:
# GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor, RandomForestRegressor
reg = RandomForestRegressor(n_jobs=4)
pipeline = make_pipeline(StandardScaler(), reg)

# Candidate values explored by the grid search.
# A 'randomforestregressor__max_features': ['auto', 'sqrt'] entry is left out.
parameters = {
    'randomforestregressor__n_estimators': [10, 20, 50, 100],
    'randomforestregressor__min_impurity_split': [1e-5, 1e-6, 1e-7, 1e-8],
}

grid_search_mape(pipeline, X, Y, parameters, 5)

### Neural network

In [None]:
# Columns fed to the neural network (X and Y themselves are built further down)
FEATURES = feat_num + feat_cat + feat_substances ## the split "substances" indicator columns are included

# Model factory used by KerasRegressor.
# Ideas noted by the author for later: hyperopt / batch normalization (written
# "bash normalization" in the original note -- presumably a typo) / dropout /
# normalization between layers / 800 epochs / batch size 50 / early stopping
def baseline_model():
    """Build and compile the small feed-forward network used by KerasRegressor.

    The network has one hidden ReLU layer of 10 units taking ``len(FEATURES)``
    inputs, followed by a single output unit with no activation specified. It
    is compiled with the 'mape' loss and the 'rmsprop' optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden = Dense(10, input_dim=len(FEATURES), init='normal', activation='relu')
    output = Dense(1, init='normal')
    # Disabled alternative hidden layers, kept for reference:
    #   Dense(15, init='normal', activation='relu')
    #   Dense(6, init='normal', activation='sigmoid')

    network = Sequential()
    for layer in (hidden, output):
        network.add(layer)

    network.compile(loss='mape', optimizer='rmsprop')
    return network

# Target and inputs for the network
Y = train['logprix']
X = train[FEATURES]

# Standardise the inputs, then fit the Keras model through its scikit-learn wrapper
reg = KerasRegressor(build_fn=baseline_model, nb_epoch=3, verbose=0)
pipeline = make_pipeline(StandardScaler(), reg)

# Small grid: a single epoch count and two batch sizes, 2-fold cross-validation
parameters = {
    'kerasregressor__nb_epoch': [2],
    'kerasregressor__batch_size': [20, 50],
}
grid_search = grid_search_mape(pipeline, X, Y, parameters, 2)

'''
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold, scoring=mape_scorer, verbose=1)
print ("\nResults: %.2f (%.2f) MAPE" % (results.mean(), results.std()))
'''

# Predictions et soumission

In [26]:
# Final model: the regressor and its settings are hard-coded here rather than
# read back from a grid search object
reg = ExtraTreesRegressor(n_estimators=20, n_jobs=4)
pipeline = make_pipeline(StandardScaler(), reg)

# Fit on the full training set (log-price target)
pipeline.fit(train[feats], train['logprix'])

# Predict log-prices for the test set and convert them back to prices
predictions = np.exp(pipeline.predict(test[feats]))

# Write the submission file: one 'prix' column, indexed by the test ids
submission = pd.DataFrame(predictions, index=test['id'])
submission.to_csv(data_dir + 'soumission.csv', header=['prix'], sep=';')

Estimations de base : 65%

Optimisations :
- Ajout catégorie "substances" : 45%
- Ajout catégorie "substances splitées" : 
    - FR : 32,5%
    - ET : 33,9%
    - GB : 69,6%
    - AB : 129%
    - BR : 35,0 %
    

Dégradations :
- logprixunit : +7/8%