# COLLAB DE MODELISATION

## Avant de commencer, modifier les constantes de la première cellule

*   Avoir créé une nouvelle branche depuis `master` et poussé tout le code sur votre Drive
*   Créer des modèles texte, image et/ou fusion dans les fichiers suivants :
  * Ecrire vos modèles dans ```src/models/models_text.py```, ```src/models/models_image.py``` et/ou ```src/models/models_fusion.py```
  * Les ajouter dans `build_pipeline_model` dans `image.py`, `text.py` et/ou `fusion.py`
  * Modifier `MODEL_TEXT_NAME`, `MODEL_IMAGE_NAME` et/ou `MODEL_FUSION_NAME`
*   Modifier le chemin d'accès `PATH_PROJECT` pour qu'il pointe vers votre propre dossier de projet
*   Avoir les photos en `500x500`
  * Créer un fichier `.zip` sur votre ordi et y mettre toutes les photos du dossier ```images/image_train```
  * Uploader ce fichier sur votre Drive
  * Adapter ```LIEN_VERS_ZIP``` de la cellule ```shutil.unpack_archive```
  * Mettre la constante ```UNZIP``` à True
  * Vérifier que les photos sont bien dans ```xxx/data/raw/images/image_train```
* Exécuter tout le code qui suit



In [None]:
# NOTE(review): this cell loops forever and never returns on its own; presumably
# it is a guard against "Run all" or a session keep-alive -- confirm the intent.
# Interrupt the execution to move on to the next cells.
while True:pass

In [None]:
# Root folder of the project on the mounted Drive; it is prepended to sys.path
# and used as the base of every data/log/model path built further down.
PATH_PROJECT = "/content/drive/MyDrive/DS-rakuten/Rakuten"

# Model names; they are used to build the log and model folder paths.
MODEL_TEXT_NAME = "nn_simple"
MODEL_IMAGE_NAME = "vgg16_transfer_learning"
MODEL_FUSION_NAME = "fusion_simple"

UNZIP = False # Set to True to unpack the image archive
CHECK_IMAGES = False # Set to True to verify that the expected number of images is present

TRAIN_IMAGE = True # Train the image model MODEL_IMAGE_NAME (otherwise a saved model is loaded)
TRAIN_TEXT = False # Train the text model MODEL_TEXT_NAME (not read by any cell visible in this notebook)
TRAIN_FUSION = False # Train the fusion model MODEL_FUSION_NAME (not read by any cell visible in this notebook)

BATCH_SIZE = 64
EPOCHS_IMAGE = 50
EPOCHS_TEXT = 5
TARGET_SHAPE = (56, 56, 3) # Input shape handed to VGG16 in the image-model cell

# NOTE(review): PATH_MODELS is reassigned in the import/setup cell to
# Path(PATH_PROJECT, "src/models"); when the cells run in order, this value is
# therefore not the one used to build MODEL_IMAGE_PATH.
PATH_MODELS = "/content/drive/MyDrive/DS-rakuten/"

## CODE
### Mise en place

In [None]:
# Load (or reload) the TensorBoard notebook extension
%reload_ext tensorboard

In [None]:
# Mount Google Drive and put the project folder first on sys.path
from google.colab import drive
import sys

# force_remount=True asks Colab to remount even if the Drive is already mounted
drive.mount('/content/drive', force_remount=True)
# Prepending PATH_PROJECT gives it priority when resolving imports
# (presumably so that the project's `main` module imported below is found).
sys.path.insert(0, PATH_PROJECT)

In [None]:
# Ajout d'imports specifiques pour collab
!pip install unidecode
!pip install fasttext
!pip install googletrans

#Import des sources
from main import text_tools, image_tools, commons, graphs, model_tools, model_fusion, Path
import datetime
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from imblearn.metrics import classification_report_imbalanced
import joblib
import keras
import tensorflow as tf

# Data and model folders, all rooted at PATH_PROJECT
PATH_RAW = Path(PATH_PROJECT, "data/raw")
PATH_BASE = Path(PATH_PROJECT, "data/base")
PATH_FEAT = Path(PATH_PROJECT, "data/featured")
PATH_TRANS = Path(PATH_PROJECT, "data/translated")
PATH_MODELS = Path(PATH_PROJECT, "src/models")

# A single timestamp per run: calling now() once guarantees that the three
# log folders of a given run carry exactly the same suffix.
RUN_TIMESTAMP = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

# TensorBoard log folders: logs/<branch>/<model name>/<run timestamp>
LOG_DIR_TEXT = Path(PATH_PROJECT, "logs/text", MODEL_TEXT_NAME, RUN_TIMESTAMP)
LOG_DIR_IMAGE = Path(PATH_PROJECT, "logs/image", MODEL_IMAGE_NAME, RUN_TIMESTAMP)
LOG_DIR_FUSION = Path(PATH_PROJECT, "logs/fusion", MODEL_FUSION_NAME, RUN_TIMESTAMP)

In [None]:
# Unpack the training images if requested, then optionally check their count
PATH_IMAGES_TRAIN = str(Path(PATH_RAW, "images/image_train"))
if UNZIP:
  import shutil

  # Location of the uploaded archive; adapt it to your own Drive layout
  LIEN_VERS_ZIP = '/content/drive/Rakuten/Telechargement/test.zip'
  shutil.unpack_archive(LIEN_VERS_ZIP, PATH_IMAGES_TRAIN)

if CHECK_IMAGES:
  import os
  # os.walk yields nothing when the folder does not exist; fall back to an
  # empty listing so that the assertion below explains the problem instead of
  # the cell dying on a bare StopIteration.
  _, _, files = next(os.walk(PATH_IMAGES_TRAIN), (PATH_IMAGES_TRAIN, [], []))
  file_count = len(files)
  assert file_count == 84916, f"ATTENTION, il manque des images sur les 84916 images requises, ou alors elles ne sont pas dans le bon dossier : les deplacer dans le dossier {PATH_IMAGES_TRAIN}"
  print(f"{file_count} dans votre dossier {PATH_IMAGES_TRAIN}")

In [None]:
# functions
def create_balanced_subset(df, samples_per_class, targetcol='target', savepath=None):
    """
    Extract a class-balanced subset of a DataFrame by random downsampling.

    Every distinct value found in ``df[targetcol]`` is reduced to
    ``samples_per_class`` randomly chosen rows. The draw relies on
    ``np.random.shuffle`` and therefore follows NumPy's global random state.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; rows are selected through their index labels.
    samples_per_class : int
        Number of rows to keep for each class. It may be equal to, but not
        larger than, the size of the smallest class.
    targetcol : str, default 'target'
        Name of the column holding the class labels.
    savepath : path-like or None, default None
        When truthy, the subset is also written there as CSV (index and
        header included).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` selected for the balanced subset.

    Raises
    ------
    AssertionError
        If ``samples_per_class`` is larger than the smallest class.
    """
    # Check: requesting exactly the size of the smallest class is valid, only
    # a strictly larger request cannot be satisfied.
    smallest_class = df[targetcol].value_counts().min()
    error_msg = f"Number {samples_per_class} exceeds cardinal of smallest class"
    assert samples_per_class <= smallest_class, error_msg
    # Go through all modalities
    idxs = []  # index labels to select
    for label in df[targetcol].unique():
        # Index labels of the rows matching this class
        idxs_label = list(df.index[df[targetcol] == label])
        # Shuffle in place, then keep the first `samples_per_class` labels
        np.random.shuffle(idxs_label)
        idxs += idxs_label[:samples_per_class]
    # Extract rows
    df = df.loc[df.index.intersection(idxs)]
    print("Balanced subset successfully created")
    if savepath:
        # Save subset
        df.to_csv(savepath, index=True, header=True)
        print(f"Dataframe saved to: '{savepath}'")
    return df

### Calculs

#### Preprocess

In [None]:
# Read and transform dataset 


# #Chargement des targets
# df_y = text_tools.read_csv(name="Y_train_CVw08PX.csv", folder=PATH_RAW)
# NB_SAMPLES = 5000
# #Chargement des données textuelles
# df_text = text_tools.read_csv(name="X_train_update.csv", folder=PATH_RAW)
# #df_y = df_y.loc[df_text.index]
# df_full = df_text.join(df_y)
# df_full.head()

# samples_per_class = 500
# print("samples_per_class", samples_per_class)
# df_balanced = create_balanced_subset(df_full, samples_per_class, targetcol='prdtypecode')
# df_balanced.head()

# df_y = df_balanced['prdtypecode']
# df_text = df_balanced.drop('prdtypecode', axis=1)

# len(df_text)

# #Application du pipeline de chargement
# pipeline_loader = text_tools.build_pipeline_load(path=str(PATH_RAW))
# df_text = pipeline_loader.fit_transform(df_text)

# #Transformations : recherche de la langue et traduction
# pipeline_lang = text_tools.build_pipeline_lang(translate=False)    #TODO régler le pb avec l'API de trad
# df_text = pipeline_lang.fit_transform(df_text)
# commons.save_pkl(df_text, name="df_text.pkl", folder=PATH_TRANS)


# Load the raw features and targets; the target column is exposed as "label"
X_train = pd.read_csv(Path(PATH_RAW, './X_train_update.csv'), index_col=0)
y_train = pd.read_csv(Path(PATH_RAW, './Y_train_CVw08PX.csv'), index_col=0).rename(
    columns={"prdtypecode": "label"}
)

#X_test = pd.read_csv(Path(PATH_RAW,'./X_test_update.csv'), index_col=0)

# Build the image file name of every product, then drop the columns that are
# not needed by the image model.
image_ids = X_train['imageid'].astype(str)
product_ids = X_train['productid'].astype(str)
X_train['filename'] = 'image_' + image_ids + '_product_' + product_ids + '.jpg'
X_train = X_train.drop(columns=['designation', 'description', 'productid', 'imageid'])

# X_test['filename'] = 'image_'+ X_test['imageid'].astype(str) + '_product_' + X_test['productid'].astype(str) + '.jpg'
# del X_test['designation'], X_test['description'], X_test['productid'], X_test['imageid']

# Quick sanity check of the loaded shapes
for frame in (X_train, y_train):
    print(frame.shape)
# print(X_test.shape)




In [None]:
# Train/test split of the tabular data (currently disabled, kept for reference)
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#       df_text,
#       df_y,
#       test_size=0.2,
#      # stratify=df_y,
#       random_state=42
#     )
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import preprocess_input
# Image generator shared by the training and validation flows: every image goes
# through the VGG16 `preprocess_input` function, and a 0.2 fraction of the
# samples is set aside for the "validation" subset.
img_gen = ImageDataGenerator(validation_split=0.2,
                             preprocessing_function = preprocess_input)

In [None]:
#Passage en Numpy
# X_train, y_train = X_train.values, y_train.values
# X_test, y_test = X_test.values, y_test.values

In [None]:
#Conversion des labels de la target
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_train_tr = le.fit_transform(y_train)
# y_test_tr = le.transform(y_test)

# print(np.unique(y_train), "-->", np.unique(y_test_tr))


# Wrap every label in a one-element list (the form required by the generator
# approach used for the image model).
# NOTE: running this cell twice wraps the labels twice.
y_train['label'] = [[value] for value in y_train['label']]

# Attach the wrapped labels to the features
X_train['label'] = y_train['label']

#### Modèle de texte

#### Modèle des images

In [None]:
#Chemin du modèle  
from tensorflow import keras
from keras.layers import Dense, Flatten, Cropping2D, Conv2D, BatchNormalization, MaxPool2D, Dropout, GlobalAveragePooling2D, Activation, MaxPooling2D
from keras.optimizers import Adam, SGD
from keras.models import Model, Sequential
from keras import backend as K
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping


# Folder of the image model: <PATH_MODELS>/image/<MODEL_IMAGE_NAME>.
# It is created (with its parents) if missing, and left untouched if it exists.
MODEL_IMAGE_PATH = Path(PATH_MODELS, "image", MODEL_IMAGE_NAME)
MODEL_IMAGE_PATH.mkdir(parents=True, exist_ok=True)


#Génération des dataframes
# df_train = pd.DataFrame(
#             data=np.concatenate([X_train[:,0].reshape(-1, 1),
#                                  y_train.reshape(-1, 1)],
#                                 axis=1),
#             columns=["links", "label"])

# df_test = pd.DataFrame(
#             data=np.concatenate([X_test[:,0].reshape(-1, 1),
#                                  y_test_tr.reshape(-1, 1)], axis=1),
#             columns=["links", "label"])
        
# df_train["label"] = df_train["label"].apply(lambda x: str(x))
# df_test["label"] = df_test["label"].apply(lambda x: str(x))
        
       
# Creation of the generators.
# Both flows read the same dataframe and folder; they only differ by the subset
# they serve and by shuffling (the validation flow keeps its order so that the
# predictions stay aligned with `test_generator.classes`).
flow_kwargs = dict(
    directory=PATH_IMAGES_TRAIN,
    x_col='filename',
    y_col='label',
    class_mode='categorical',
    # target_size expects (height, width): the first two entries of the shape,
    # not the channel count stored at index 2.
    target_size=TARGET_SHAPE[:2],
    batch_size=BATCH_SIZE,
)

train_generator = img_gen.flow_from_dataframe(
    X_train,
    shuffle=True,
    subset='training',
    **flow_kwargs
)

test_generator = img_gen.flow_from_dataframe(
    X_train,
    shuffle=False,
    subset='validation',
    **flow_kwargs
)
  
    
# train_generator, test_generator = flow_generators(
#             df_train, df_test,
#             TARGET_SHAPE, BATCH_SIZE
#         )


if TRAIN_IMAGE:
  
  #Recuperation du modele
  # model_image = image_tools.build_pipeline_model(
  #               name=MODEL_IMAGE_NAME,
  #               input_dim=TARGET_SHAPE,
  #           )

  unfreezed_layers = 0 #Nombre de couches a décongeler pour aplique le finetuning: Voir livre Deep Learning with python
# Freezer les couches du VGG16
  n_class = 27
  base_model = VGG16(weights='imagenet', include_top=False, input_shape=TARGET_SHAPE) 
  # Freezer les couches du VGG16
  for layer in base_model.layers[-unfreezed_layers:]: 
    layer.trainable = False

  model = Sequential()
  model.add(base_model) # Ajout du modèle VGG16
  model.add(GlobalAveragePooling2D()) 
  model.add(Dense(1024, activation='relu'))
  model.add(Dropout(0.2))
  model.add(Dense(512, activation='relu'))
  model.add(Dropout(0.2))
  model.add(Dense(27, activation='softmax'))

  # Callbacks
  red_on_plateau = callbacks.ReduceLROnPlateau(monitor='val_accuracy',
                                               patience=2,
                                               factor=0.1,
                                               verbose=1)       

  early = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=20, verbose=1, mode='auto')
  
  adam = Adam(learning_rate=0.005)

  model.compile(
      optimizer=adam, 
      loss=keras.losses.categorical_crossentropy, 
      metrics=['accuracy'])
  
 
  
  #model.summary() 
  model_image = model
     
  #Lancement du Tensorboard
  
  %tensorboard --logdir $LOG_DIR_IMAGE
  
  #Entrainement et recuperation de l'historique
  model_image_hist = model_image.fit(
                train_generator,
                epochs= EPOCHS_IMAGE,
                #verbose=0,
                steps_per_epoch= 50, #len(df_train)//BATCH_SIZE,
                validation_data=test_generator,
                validation_steps = 5, #len(df_test)//BATCH_SIZE,
                callbacks=[
                        model_tools.get_model_checkpoint(MODEL_IMAGE_PATH), 
                        model_tools.get_dashboard(LOG_DIR_IMAGE),
                        model_tools.get_tqdm(),
                        red_on_plateau,
                        early
                        ]
            ) 

  if "history" in vars(model_image):
                plt.plot(model_image_hist.history['accuracy'])
                plt.plot(model_image_hist.history['val_accuracy'])
                plt.title('model accuracy')
                plt.ylabel('accuracy')
                plt.xlabel('epoch')
                plt.legend(['train', 'test'], loc='upper left')
                
                plt.savefig(Path(MODEL_IMAGE_PATH, 'history.jpg'))
                plt.clf()
#Sinon chargement d'un modele pour le texte
else:
  model_image = tf.keras.models.load_model(MODEL_IMAGE_PATH, compile=False)


# Prediction on the validation flow (not shuffled, so the predictions follow
# the order of `test_generator.classes`)
y_image_preds = model_image.predict(test_generator)
y_image_preds_class = np.argmax(y_image_preds, axis=1) if len(y_image_preds.shape)==2 else y_image_preds

# Map class indices back to the original labels with the generator's own
# mapping: no LabelEncoder (`le`) nor `y_test` is defined in this notebook.
index_to_label = {index: label for label, index in test_generator.class_indices.items()}
# `classes` may hold one-element lists because the labels were wrapped in
# lists; ravel() flattens both that form and a plain list of indices.
y_image_true = np.array([index_to_label[int(i)] for i in np.asarray(test_generator.classes).ravel()])
y_image_preds_class = np.array([index_to_label[int(i)] for i in y_image_preds_class])

# Confusion table and per-class report, saved next to the model
crosstab = pd.crosstab(y_image_true, y_image_preds_class, rownames=["Real"], colnames=["Predicted"])
print(classification_report_imbalanced(y_image_true, y_image_preds_class, zero_division=0))

heat = graphs.heatmap(crosstab)
plt.savefig(Path(MODEL_IMAGE_PATH, 'crosstab.jpg'))

In [None]:
# Print the summary of the image model (trained or loaded above)
model_image.summary()

In [None]:
# Accuracy curves of the last image-model training run
train_acc = model_image_hist.history['accuracy']
val_acc = model_image_hist.history['val_accuracy']
# Early stopping can end training before EPOCHS_IMAGE epochs, so the x axis is
# sized from the recorded history rather than from the configured maximum.
epochs_run = np.arange(1, len(train_acc) + 1)

plt.xlabel('Epochs')
plt.ylabel('Accuracy')

plt.plot(epochs_run,
         train_acc,
         label='Training Accuracy',
         color='blue')

plt.plot(epochs_run,
         val_acc,
         label='Validation Accuracy',
         color='red')

plt.legend()
plt.show()

In [None]:
from sklearn.metrics import f1_score

# Weighted F1 between the true class indices of the validation flow and the
# predicted class indices (argmax over the class axis of the predictions).
f1_score(
    y_true=test_generator.classes,
    y_pred=np.argmax(y_image_preds, axis=1),
    average="weighted",
)

# Nouvelle section

# Nouvelle section