In [20]:
import os
import re
import sklearn
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
import xgboost as xgb

import pickle

from hyperopt import hp, fmin, tpe

from tensorflow.keras.layers import concatenate
from tensorflow.keras import models
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import layers

from tensorflow.compat.v1.losses import mean_squared_error
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF


# Définition des fonctions

## Preparation du text pour l'entrainement du modèle Word2Vec et TF-IDF

In [14]:
def prepare_DF(DF):
    DF_help=DF.apply(func = lambda S:S.lower())
    DF_help=remplacer_site_DF(DF_help)
    DF_help=DF_help.apply(func = lambda S:S.strip())
    DF_help=espacer_virgule_DF(DF_help)
    DF_help=espacer_parenthèse_DF(DF_help)
    DF_help=DF_help.apply(lambda text:text.replace('"',''))
    return prepare_sentences_help(DF_help)

def get_longest_feedback(List):
    return max([len(l) for l in List])

def prepare_sentences_List(DF):
    
    List=[sentence for paragraph in DF.values.tolist() for sentence in paragraph]
    return List

def prepare_sentences_help(DF):
    DF_help=DF.apply(func = lambda S:[s.split(' ') for s in S.split('.')])
    DF_help = DF_help.apply(func = lambda paragraphe : [ [word for word in sentence if not(word in [' ',''])]for sentence in paragraphe ])
    DF_help = DF_help.apply(func = lambda paragraphe : [ sentence for sentence in paragraphe if not(sentence in [[' '],['']] or len(sentence)==0) ])
    
    #DF_help=DF_help.apply(func=lambda paragraph :[ sentence[1:] if sentence[0]=='' else sentence for sentence in paragraph])
    return DF_help

def remplacer_site(String):       
    for index in [m.start() for m in re.finditer('.', String)]:
        if String[:index].rfind('http')>String[:index].rfind(' '):
            String=String[:index]+'&'+String[index+1:]
    return String

def remplacer_site_DF(DF):
    DF=DF.apply(func= lambda S : remplacer_site(S))
    return DF       

def espacer_point_DF(DF):
    DF=DF.apply(func = lambda S:S.replace('.',' .'))
    return DF

def espacer_virgule_DF(DF):
    DF_help=DF.apply(func = lambda S:S.replace(","," ,"))
    return DF_help

def espacer_parenthèse_DF(DF):
    DF=DF.apply(func = lambda S:S.replace('(','( '))
    DF=DF.apply(func = lambda S:S.replace(')',' )'))
    return DF

## Encodage des données textuelles en vecteurs

In [15]:
def generate_dict(sentences,window,size):
    model=Word2Vec(sentences, size=size, window=window, min_count=1)
    return model


"""
cette fonction encode les paragraphe de la colonne feedback sous la forme d'une suite de vecteurs encodant chacun une phrase,
l'encodage d'une phrase est obtenue grâce à la moyenne des encodages des mots composant la phrase, ces encodages étant pondérés
par le coéfficient tf-idf des mots.
"""

def sentence_2v(DF,dico,tfidf,size_embeding, size_paragraph):
    DF_bis=DF.apply(func = sentence_2V_help , dico=dico, tfidf=tfidf, size_embeding=size_embeding, size_paragraph=size_paragraph) 
    return DF_bis

def sentence_2V_help(paragraphe,dico,tfidf,size_embeding,size_paragraph):
    vectorized_paragraphe=[]
    for sentence in paragraphe:
        
        freq_vec=tfidf.transform([' '.join(sentence)])
        vocab = tfidf.vocabulary_
        word_coord=dict((word,vocab[word]) for word in sentence )
        vectorized_paragraphe.append(np.average([freq_vec[0,word_coord[word]]*dico.get_vector(word) for word in sentence], axis=0))    
    vectorized_paragraphe=np.stack(vectorized_paragraphe+[np.zeros(size_embeding) for i in range (size_paragraph-len(paragraphe))],axis=0)
    return vectorized_paragraphe

## Construction d'un modèle d'analyse de sentiments

In [4]:
def build_model_sentiment( size_encoding, max_legnth_para, numClasses, lstmUnits):

    model=Sequential()
    model.add(LSTM(120,input_shape=(max_legnth_para,size_encoding), activation='relu', return_sequences=True ))
    model.add(Dropout(0.2))
    model.add(LSTM(120,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(50,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(numClasses,activation='sigmoid'))
    
    opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

    model.compile(
                loss='mean_squared_error',
                optimizer=opt,
                metrics=['mean_absolute_error'],)
    model=model
    return model

## Construction d'un autoencodeur

In [18]:
def build_auto(input_shape, code_size):
    input_img = Input(shape=(input_shape,))
    # "encoded" is the encoded representation of the input
    encoded = Dense(code_size, activation='relu')(input_img)
    # "decoded" is the lossy reconstruction of the input
    decoded = Dense(input_shape, activation='sigmoid')(encoded)
    
    autoencoder = Model(input_img,decoded)
    encoder = Model(input_img, encoded)
    
    encoded_input = Input(shape=(code_size,))
    decoder_layer = autoencoder.layers[-1]
    decoder = Model(encoded_input, decoder_layer(encoded_input))
    
    return autoencoder,encoder,decoder 

## Construction d'un autoencoder sequentiel

In [157]:
def build_seq_auto(size_sequence, size_embeding, code_size):
    input_img = Input(shape=(size_sequence, size_embeding))
    # "encoded" is the encoded representation of the input
    encoded = LSTM(code_size, activation='relu')(input_img)
    # "decoded" is the lossy reconstruction of the input
    decoded = RepeatVector(size_sequence)(encoded)
    decoded = LSTM(input_shape, activation='sigmoid', return_sequences= True)(decoded)
    decoded = TimeDistributed(Dense(size_embeding))(decoded)
    
    autoencoder = Model(input_img,decoded)
    encoder = Model(input_img, encoded)
    
    encoded_input = Input(shape=(code_size,))
    Layers=autoencoder.layers[-3:]
    Layers.reverse()
    decoder_layer = list_to_nn(Layers,encoded_input)
    decoder = Model(encoded_input, decoder_layer)
    
    return autoencoder,encoder,decoder

def list_to_nn(Layers,inputLayer):
    if Layers==[]:
        return inputLayer
    else :
        return Layers[0](list_to_nn(Layers[1:],inputLayer))

## Construction du modèle de recommendation


In [22]:
def build_model_recomendation(input_shape, output_shape, size_L1, size_L2, size_L3,size_L4):
    input_img = Input(shape = (input_shape,))
    # "encoded" is the encoded representation of the input
    Layer1 = Dense(size_L1, activation = 'relu')(input_img)
    Dropout_Layer1 = Dropout(0.1)(Layer1)
    Layer2 = Dense(size_L2, activation = 'relu')(Dropout_Layer1)
    Dropout_Layer2=Dropout(0.1)(Layer2)
    Layer3 = Dense(size_L3, activation = 'relu')(Dropout_Layer2)
    Dropout_Layer3=Dropout(0.1)(Layer3)
    Layer4 = Dense(size_L3, activation = 'relu')(Dropout_Layer3)
    output= Dense(output_shape, activation='softmax')(Layer4)

    model = Model(input_img, output)
    
    return model 

# Main

##  Extraction de la donnée textuelle 


In [22]:
band_info=pd.read_csv('band_content.csv', header = 0).drop(['biography_en'], axis=1)
influencer_info=pd.read_csv('influencer_content.csv', header = 0).drop(['description_en','preferences_en'], axis=1)
submission=pd.read_csv('submission_history.csv', header = 0)

band_info['biography_fr']=band_info['biography_fr'].fillna('').apply(func = lambda R : R.replace('\n',''))
influencer_info['description_fr']=influencer_info['description_fr'].fillna('').apply(func = lambda R : R.replace('\n',' '))
influencer_info['preferences_fr']=influencer_info['preferences_fr'].fillna('').apply(func = lambda R : R.replace('\n',' '))
submission['influencer_feedback']=submission['influencer_feedback'].fillna('').apply(func = lambda R : R.replace('\n',''))

band_info['biography_fr']=prepare_DF(band_info['biography_fr'])
influencer_info['description_fr']=prepare_DF(influencer_info['description_fr'])
influencer_info['preferences_fr']=prepare_DF(influencer_info['preferences_fr'])
submission['influencer_feedback']=prepare_DF(submission['influencer_feedback'])

band_bio=band_info['biography_fr']
band_bio.columns=['text']
influencer_bio=influencer_info['description_fr']
influencer_bio.columns=['text']
influencer_pref=influencer_info['preferences_fr']
influencer_pref.columns=['text']
submission_feedback=submission['influencer_feedback']
submission_feedback.columns=['text']
text=pd.concat([band_bio, influencer_bio, influencer_pref, submission_feedback], keys=['band_bio', 'influ_bio', 'influ_pref','feedback'])
List_sentences=prepare_sentences_List(text)
List_sentences_tfidf=[' '.join(sentence) for sentence in List_sentences]


Max_size_band_bio=max(text.loc['band_bio'].apply(func = lambda L : len(L)))
Max_size_influencer_bio=max(text.loc['influ_bio'].apply(func = lambda L : len(L)))
Max_size_influencer_pref=max(text.loc['influ_pref'].apply(func = lambda L : len(L)))
Max_size_feedback=max(text.loc['feedback'].apply(func = lambda L : len(L)))


## Création et entraînement des modèles de vectorisation du text

In [24]:
window_size=2
embeding_size=40

In [25]:
dico = generate_dict(List_sentences, window_size, embeding_size)
Dico = dico.wv

def tokenizer(string):
    return string.split(" ")

tfidf = TFIDF(analyzer = 'word', tokenizer = tokenizer)
tfidf.fit(List_sentences_tfidf)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenizer at 0x000000001ACE8C18>,
                use_idf=True, vocabulary=None)

In [26]:
"""Sauvegardede du dictionnaire word2vec et du modèle tf-idf"""
fname = get_tmpfile("wordvectors.kv")
Dico.save(fname)

with open('tfidf.pk', 'wb') as fin:
    pickle.dump(tfidf, fin)

In [27]:
"""
Chargement du dictionnaire word2vec
"""
fname = get_tmpfile("wordvectors.kv")
Dico = KeyedVectors.load(fname, mmap='r')

with open('tfidf.pk', 'rb') as fout:
    tfidf = pickle.load(fout) 

In [28]:
# Passage du text aux vecteurs
band_info['biography_fr'] = sentence_2v(band_info['biography_fr'],Dico,tfidf,embeding_size,Max_size_band_bio)
influencer_info['description_fr'] = sentence_2v(influencer_info['description_fr'],Dico,tfidf,embeding_size,Max_size_influencer_bio)
influencer_info['preferences_fr'] = sentence_2v(influencer_info['preferences_fr'],Dico,tfidf,embeding_size,Max_size_influencer_pref)
submission['influencer_feedback'] = sentence_2v(submission['influencer_feedback'],Dico,tfidf,embeding_size,Max_size_feedback)

In [29]:
# Sauvegarde de la version vectorisée de la donnée textuelle
band_info.to_pickle('band_info_Vectorized_text.csv')
influencer_info.to_pickle('influencer_info_Vectorized_text.csv')
submission.to_pickle('submission_Vectorized_text.csv')

## Stacking: encodage du feedback des influenceurs par la sortie d'un modèle d'analyse de sentiment  

In [5]:
num_Lstm=10
num_class=1
embeding_size=40
Xtrain=np.stack(submission['influencer_feedback'].tolist(),axis=0)
Ytrain=np.stack(submission['score'].tolist(),axis=0)
Model_sentiment=build_model_sentiment(embeding_size, Max_size_feedback, num_class, num_Lstm)
Model_sentiment.fit(Xtrain,Ytrain , epochs=3)

In [None]:
"""
Sauvegarde du modèle
"""

json_config = Model_sentiment.to_json()
with open('model_sentiment_config.json', 'w') as json_file:
    json_file.write(json_config)
# Save weights to disk
Model_sentiment.save_weights('Models/model_sentiment_weights.h5')

In [None]:
"""
Chargement du modèle
"""
with open('model_sentiment_config.json') as json_file:
    json_config = json_file.read()
Model_sentiment = tf.keras.models.model_from_json(json_config)
Model_sentiment.load_weights('Models/model_sentiment_weights.h5')

### Encodage du feedback grace au modèle d'analyse de sentiments

In [15]:
submission['influencer_feedback']=Model_sentiment.predict(np.stack(submission['influencer_feedback'].values.tolist()))
submission.to_pickle('submission_stacked.csv')


## Encodage des données grâce aux autoencodeurs

In [30]:
band_info=pd.read_pickle('band_info_Vectorized_text.csv')
band_bio=band_info['biography_fr']
influencer_info=pd.read_pickle('influencer_info_Vectorized_text.csv')
influencer_bio=influencer_info['description_fr']
submission=pd.read_pickle('submission_stacked.csv')

Max_size_band_bio=band_info['biography_fr'].iloc[0].shape[0]
Max_size_influencer_bio=influencer_info['description_fr'].iloc[0].shape[0]
Max_size_influencer_pref=influencer_info['preferences_fr'].iloc[0].shape[0]



### Données non séquentielles

Données représentant les artistes

In [122]:
"""
Création du modèle et des données de travail
"""
band_non_seq=band_info.drop(labels = ['id','band_id','biography_fr'], axis = 1)
Data_band=band_non_seq.values
input_shape=band_non_seq.shape[1]
code_size_band=20

band_auto,band_encoder,band_decoder= build_auto(input_shape,code_size_band)
band_auto.compile(optimizer='adam', loss='binary_crossentropy',metrics=['mean_absolute_error'])

In [123]:
#cross_val_score(band_auto, Data, Data, scoring = 'accuracy')

In [124]:
"""
Entraînement du modèle
"""
band_auto.fit(Data_band, Data_band,
                epochs=2,
                batch_size=20,
                shuffle=True,)

Train on 9502 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x3033fdbc8>

In [125]:
"""
Sauvegarde du modèle
"""
json_config = band_encoder.to_json()
with open('Models/band_encoder_config.json', 'w') as json_file:
    json_file.write(json_config)
# Save weights to disk
band_encoder.save_weights('Models/band_encoder_weights.h5')

In [126]:
"""
Chargement du modèle
"""
with open('Models/band_encoder_config.json') as json_file:
    json_config = json_file.read()
band_encoder = tf.keras.models.model_from_json(json_config)
band_encoder.load_weights('Models/band_encoder_weights.h5')

In [127]:
encoded_data_band = np.hstack((band_info['band_id'].values[...,np.newaxis],band_encoder.predict(np.stack(band_non_seq.values.tolist()))))
encoded_band = pd.DataFrame(encoded_data_band,columns=['band_id']+['Varirabl_band_'+str(i) for i in  range (code_size_band)])
encoded_band['band_id'] = encoded_band['band_id'].astype(int)

Données représentant les influenceurs

In [128]:
"""
Création du modèle et des données de travail
"""
influencer_non_seq=influencer_info.drop(labels = ['id','influencer_id','description_fr','preferences_fr'], axis = 1)
Data=band_non_seq.values
input_shape=band_non_seq.shape[1]
code_size_influ=20

influencer_auto,influencer_encoder,influencer_decoder= build_auto(input_shape,code_size_influ)
influencer_auto.compile(optimizer='adam', loss='binary_crossentropy',metrics=['mean_absolute_error'])

In [129]:
"""
Entraînement du modèle
"""
influencer_auto.fit(Data, Data,
                epochs=2,
                batch_size=20,
                shuffle=True,)

Train on 9502 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x61dab0ac8>

In [130]:
"""
Sauvegarde du modèle
"""
json_config = influencer_encoder.to_json()
with open('influencer_encoder_config.json', 'w') as json_file:
    json_file.write(json_config)
# Save weights to disk
influencer_encoder.save_weights('Models/influencer_encoder_weights.h5')

In [131]:
"""
Chargement du modèle
"""
with open('influencer_encoder_config.json') as json_file:
    json_config = json_file.read()
influencer_encoder = tf.keras.models.model_from_json(json_config)
influencer_encoder.load_weights('Models/influencer_encoder_weights.h5')

In [132]:
encoded_data_influencer = np.hstack((influencer_info['influencer_id'].values[...,np.newaxis],influencer_encoder.predict(np.stack(influencer_non_seq.values.tolist()))))
encoded_influencer = pd.DataFrame(encoded_data_influencer,columns=['influencer_id']+['Varirabl_influencer_'+str(i) for i in  range (code_size_influ)])
encoded_influencer['influencer_id'] = encoded_influencer['influencer_id'].astype(int)

### Données séquentielles

Boigraphie des artiste

In [133]:
"""
Création du modèle et des données de travail
"""
band_bio=band_info['biography_fr']
Data=np.stack(band_bio.tolist(),axis=0)
size_sequence=Max_size_band_bio
embeding_size=40
code_size_seq_band=100

band_seq_auto,band_seq_encoder,band_seq_decoder=build_seq_auto(size_sequence,embeding_size,code_size_seq_band)
band_seq_auto.compile(optimizer='adam', loss='mse',metrics=['mean_absolute_error'])

[<class 'tensorflow.python.keras.layers.recurrent_v2.LSTM'>, <class 'tensorflow.python.keras.layers.core.RepeatVector'>]
[<class 'tensorflow.python.keras.layers.core.RepeatVector'>]
[]


In [134]:
"""
Entraînement du modèle
"""
auto.fit(Data, Data,
                epochs=2,
                batch_size=10,
                shuffle=True)

Train on 9502 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x6c8a0dc8>

In [135]:
"""
Sauvegarde du modèle
"""
json_config = band_seq_encoder.to_json()
with open('Models/band_seq_encoder_config.json', 'w') as json_file:
    json_file.write(json_config)
# Save weights to disk
band_seq_encoder.save_weights('Models/band_seq_encoder_weights.h5')


In [137]:

"""
Chargement du modèle
"""
with open('Models/band_seq_encoder_config.json') as json_file:
    json_config = json_file.read()
band_seq_encoder = tf.keras.models.model_from_json(json_config)
band_seq_encoder.load_weights('Models/band_seq_encoder_weights.h5')

In [138]:
encoded_data_band_bio = np.hstack((band_info['band_id'].values[..., np.newaxis], band_seq_encoder.predict(np.stack(band_bio.values.tolist()))))
encoded_band_bio = pd.DataFrame(encoded_data_band_bio, columns=['band_id']+['Varirabl_desc_'+str(i) for i in  range (code_size_seq_band)])
encoded_band_bio['bandr_id'] = encoded_band_bio['band_id'].astype(int)

Descriptions des influenceurs 

In [139]:
"""
Création du modèle et des données de travail
"""
influencer_bio=influencer_info['description_fr']
Data=np.stack(influencer_bio.tolist(),axis=0)
size_sequence=Max_size_influencer_bio
embeding_size=40
code_size=100

influencer_seq_auto,influencer_seq_encode,influencer_seq_decode=build_seq_auto(size_sequence,embeding_size,code_size)
influencer_seq_auto.compile(optimizer='adam', loss='mse',metrics=['mean_absolute_error'])

[<class 'tensorflow.python.keras.layers.recurrent_v2.LSTM'>, <class 'tensorflow.python.keras.layers.core.RepeatVector'>]
[<class 'tensorflow.python.keras.layers.core.RepeatVector'>]
[]


In [141]:
"""
Entraînement du modèle
"""
influencer_seq_auto.fit(Data, Data,
                epochs=2,
                batch_size=10,
                shuffle=True)

Train on 1073 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x508faa88>

In [142]:
"""
Sauvegarde du modèle
"""
json_config = influencer_seq_encode.to_json()
with open('Models/influencer_seq_encoder_config.json', 'w') as json_file:
    json_file.write(json_config)
# Save weights to disk
influencer_seq_encode.save_weights('Models/influencer_seq_encoder_weights.h5')


In [143]:

"""
Chargement du modèle
"""
with open('Models/influencer_seq_encoder_config.json') as json_file:
    json_config = json_file.read()
influencer_seq_encode = tf.keras.models.model_from_json(json_config)
influencer_seq_encode.load_weights('Models/influencer_seq_encoder_weights.h5')

In [144]:
encoded_data_influencer_desc = np.hstack((influencer_info['influencer_id'].values[...,np.newaxis],influencer_seq_encode.predict(np.stack(influencer_bio.values.tolist()))))
encoded_influencer_desc = pd.DataFrame(encoded_data_influencer_desc, columns=['influencer_id']+['Varirabl_desc_'+str(i) for i in  range (code_size)])
encoded_influencer_desc['influencer_id'] = encoded_influencer_desc['influencer_id'].astype(int)

### Concatenation des données séquentielles et non séquentielles

In [147]:
"""
On concatène les tableaux contenant l'encodage des données non séquentille à ceux qui contiennent l'encodage des données
séquentielles.  
"""
encoded_influencer=pd.merge(encoded_influencer, encoded_influencer_desc, on = 'influencer_id')
encoded_band=pd.merge(encoded_band, encoded_band_bio, on = 'band_id')

encoded_band.to_pickle('encoded_band.csv')
encoded_influencer.to_pickle('encoded_influencer.csv')

## Prédiction du score

### Tentative de classification en utilisant XGBoost

In [None]:
encoded_band = pd.read_pickle('encoded_band.csv')
encoded_influencer = pd.read_pickle('encoded_influencer.csv')

submission = pd.read_pickle('submission_stacked.csv')[['score', 'band_id', 'influencer_id']]
one_hot_score = pd.get_dummies(submission['score'],prefix='score')
submission = pd.concat([submission,one_hot_score], axis = 1, sort = False).drop('score', axis = 1)

Data = submission.merge(encoded_band, on = ['band_id'], how = 'left')
Data = Data.merge(encoded_influencer, on = ['influencer_id'], how = 'left')


X=Data.drop(['band_id','influencer_id','score_0.0', 'score_0.25', 'score_0.5', 'score_1.0'], axis = 1)
Y=Data[['score_0.0', 'score_0.25', 'score_0.5','score_1.0']]
#Y=Data['score']
#X=Data.drop(['band_id','influencer_id','score'], axis = 1)
             
Xtest=X.iloc[70000:]
Ytest=Y.iloc[70000:]

X=X.iloc[:70000]
Y=Y.iloc[:70000]

In [16]:
def hyperopt_obj_XGB(params):
    max_depth=params['max_depth']
    learning_rate=params['learning_rate']
    n_estimators=params['n_estimators']
    min_child_weight=params['min_child_weight']
    model = xgb.XGBClassifier(objective = 'multi:softmax', num_class=4, 
                              max_depth = max_depth, 
                              learning_rate = 1/learning_rate,
                              n_estimators = n_estimators,
                              min_child_weight = min_child_weight,
                              )
    model.fit(X=X.values, y=Y.values, eval_set=[(Xtest.values,Ytest.values)])
    
    res = model.evals_result()
    del model
    return res

In [17]:
space={'max_depth':hp.choice('max_depth', [i for i in range(2,11)]),
    'learning_rate':hp.loguniform('learning_rate', 10, 10000),
    'n_estimators':hp.choice('n_estimators', [i for i in range (50,150)]),
    'min_child_weight':hp.choice('min_child_weight', [i for i in range (1,10)])}
fmin(hyperopt_obj_XGB, space = space, algo=tpe.suggest, max_evals=30)

[0]	validation_0-merror:0.34525                                                

[1]	validation_0-merror:0.34525                                                

[2]	validation_0-merror:0.34525                                                

[3]	validation_0-merror:0.34525                                                

[4]	validation_0-merror:0.34525                                                

[5]	validation_0-merror:0.34525                                                

[6]	validation_0-merror:0.34525                                                

[7]	validation_0-merror:0.34525                                                

[8]	validation_0-merror:0.34525                                                

[9]	validation_0-merror:0.34525                                                

[10]	validation_0-merror:0.34525                                               

[11]	validation_0-merror:0.34525                                               

[12]	validation_0-merror:0.3

KeyError: 'status'

In [4]:
model = xgb.XGBClassifier(objective = 'multi:softmax', num_class=4)
model.fit(X=X.values, y=Y.values, eval_set=[(Xtest.values,Ytest.values)])

[0]	validation_0-merror:0.324092
[1]	validation_0-merror:0.318328
[2]	validation_0-merror:0.312564
[3]	validation_0-merror:0.300744
[4]	validation_0-merror:0.303517
[5]	validation_0-merror:0.302787
[6]	validation_0-merror:0.29768
[7]	validation_0-merror:0.300817
[8]	validation_0-merror:0.298701
[9]	validation_0-merror:0.298701
[10]	validation_0-merror:0.291697
[11]	validation_0-merror:0.29177
[12]	validation_0-merror:0.29177
[13]	validation_0-merror:0.29177
[14]	validation_0-merror:0.29177
[15]	validation_0-merror:0.292135
[16]	validation_0-merror:0.291478
[17]	validation_0-merror:0.292208
[18]	validation_0-merror:0.292281
[19]	validation_0-merror:0.29177
[20]	validation_0-merror:0.291551
[21]	validation_0-merror:0.291113
[22]	validation_0-merror:0.29177
[23]	validation_0-merror:0.291113
[24]	validation_0-merror:0.291332
[25]	validation_0-merror:0.289216
[26]	validation_0-merror:0.285058
[27]	validation_0-merror:0.285131
[28]	validation_0-merror:0.282212
[29]	validation_0-merror:0.2816

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_class=4, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

### Tentative de classification en utilisant un réseau de neuronnes

In [None]:
encoded_band = pd.read_pickle('encoded_band.csv')
encoded_influencer = pd.read_pickle('encoded_influencer.csv')

submission = pd.read_pickle('submission_stacked.csv')[['score', 'band_id', 'influencer_id']]
one_hot_score = pd.get_dummies(submission['score'],prefix='score')
submission = pd.concat([submission,one_hot_score], axis = 1, sort = False).drop('score', axis = 1)

Data = submission.merge(encoded_band, on = ['band_id'], how = 'left')
Data = Data.merge(encoded_influencer, on = ['influencer_id'], how = 'left')


X=Data.drop(['band_id','influencer_id','score_0.0', 'score_0.25', 'score_0.5', 'score_1.0'], axis = 1)
Y=Data[['score_0.0', 'score_0.25', 'score_0.5','score_1.0']]
#Y=Data['score']
#X=Data.drop(['band_id','influencer_id','score'], axis = 1)
             
Xtest=X.iloc[70000:]
Ytest=Y.iloc[70000:]

X=X.iloc[:70000]
Y=Y.iloc[:70000]

In [26]:
input_shape = X.shape[1]
size_L1 = 341
size_L2 = 200
size_L3 = 100
size_L4 = 50
output_shape = 4

learning_rate = 0.1
momentum = 0.05 
epoch = 50 
decay_rate=learning_rate/epoch
sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)


Model_reco = build_model_recomendation(input_shape, output_shape, size_L1, size_L2, size_L3, size_L4)
Model_reco.compile(optimizer=sgd, loss='categorical_crossentropy',metrics=['accuracy'])

In [29]:
Model_reco.fit(X, Y,
                epochs=50,
                batch_size=100,
                shuffle=True)

Train on 70000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50


Epoch 4/50
Epoch 5/50


Epoch 6/50


Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


Epoch 16/50
Epoch 17/50


Epoch 18/50
Epoch 19/50


Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50


Epoch 33/50
Epoch 34/50


Epoch 35/50
Epoch 36/50


Epoch 37/50
Epoch 38/50


Epoch 39/50
Epoch 40/50


Epoch 41/50
Epoch 42/50


Epoch 43/50
Epoch 44/50


Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50


Epoch 49/50
Epoch 50/50




<tensorflow.python.keras.callbacks.History at 0x1405c988>