In [None]:
import findspark

In [None]:
import os

# Locate the Spark installation: honour SPARK_HOME when it is set so the
# notebook is not tied to one machine, and fall back to the path this notebook
# was originally written for.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-3.3.0-bin-hadoop3')

In [None]:
import pyspark

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
 
import os

conf = SparkConf()
# hadoop-aws provides the s3a:// filesystem connector.
# NOTE(review): the hadoop-aws version normally has to match the Hadoop build
# bundled with the Spark distribution -- confirm 3.2.0 is right for this install.
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.0')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider')

# Credentials must never be written in the notebook: they are read from the
# standard AWS environment variables instead. Any key that was previously
# committed in this cell should be treated as compromised and rotated.
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# NOTE(review): with the anonymous provider configured above as the only
# credentials provider, these keys are presumably not consulted by S3A --
# confirm whether authenticated access is actually intended.
if access_key and secret_key:
    conf.set('spark.hadoop.fs.s3a.access.key', access_key)
    conf.set('spark.hadoop.fs.s3a.secret.key', secret_key)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
import time
from pyspark.sql.functions import split, element_at, col, pandas_udf, PandasUDFType

from pyspark import SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.image import ImageSchema

from pyspark.sql.functions import udf
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import *

## Chargement des images

In [None]:
def parse_categorie(path):
    '''Return the category of an image, i.e. the name of its parent directory.

    Parameters
    ----------
    path : str or None
        Path of the image, using '/' as separator.

    Returns
    -------
    str
        The second-to-last path component, or '' when the path is missing,
        empty, or has no parent directory component.
    '''
    if not path:
        return ''
    parts = path.split('/')
    # A bare file name has no parent directory; return '' instead of raising
    # IndexError (which would abort the Spark job when used as a UDF).
    if len(parts) < 2:
        return ''
    return parts[-2]
    
def load_data(path_img):
    '''Load the image files found under ``path_img`` into a Spark DataFrame.

    Parameters
    ----------
    path_img : str
        Root directory whose sub-directories contain the images; it is
        searched recursively.

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame produced by the ``binaryFile`` reader, with the ``path``
        column set from ``input_file_name()`` and an extra ``Catégorie``
        column computed by ``parse_categorie`` from that path.
    '''
    start = time.perf_counter()

    # .option("pathGlobFilter", "*.jpg") is deliberately not used: it does not
    # work when the path contains spaces.
    df_img = (
        spark.read.format("binaryFile")
        .option("recursiveFileLookup", "true")
        .load(path_img)
    )
    print('Chargement effectué')

    # Path of the source file of each row.
    df_img = df_img.withColumn("path", input_file_name())

    # Category of each image, derived from its path.
    udf_categorie = udf(parse_categorie, StringType())
    df_img = df_img.withColumn('Catégorie', udf_categorie('path'))

    # Report the real elapsed time; formatting it with strftime('%S') only
    # showed the seconds field and wrapped around after one minute.
    # NOTE(review): Spark is lazily evaluated, so this presumably measures the
    # construction of the DataFrame rather than the reading of every image --
    # confirm.
    elapsed = time.perf_counter() - start
    print('Temps de chargement des images : {:.2f} secondes'.format(elapsed))

    return df_img

def preprocess_data(dataframe, model_name="VGG16"):
    '''Featurize every image of ``dataframe`` with a pre-trained network.

    Parameters
    ----------
    dataframe : pyspark.sql.DataFrame
        Must expose the ``image``, ``path`` and ``Catégorie`` columns.
        NOTE(review): elsewhere in this notebook the DataFrame built by
        load_data is queried through a ``content`` column; confirm it also
        provides the ``image`` column expected here.
    model_name : str, optional
        Name of the network handed to ``DeepImageFeaturizer``. Defaults to
        "VGG16", the value this function previously hard-coded.

    Returns
    -------
    pyspark.sql.DataFrame
        The columns ``path``, ``Catégorie`` and ``image_preprocessed`` (the
        featurizer output).
    '''
    # Imported here so the rest of the notebook does not require sparkdl.
    from sparkdl import DeepImageFeaturizer

    # DeepImageFeaturizer applies the model specified by its popular name,
    # with its prediction layer(s) chopped off.
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="image_preprocessed", modelName=model_name)
    return featurizer.transform(dataframe).select(['path', 'Catégorie', 'image_preprocessed'])

In [None]:
# Root location of the image folders (s3a scheme); passed to load_data.
path = 's3a://donnees-projet8/Images/'

In [None]:
# Build the DataFrame of image files together with their category.
spark_df = load_data(path)

In [None]:
# Preview the loaded DataFrame.
spark_df.show()

In [None]:
from PIL import Image
import numpy as np
import io as io
import requests
from skimage.transform import resize

In [None]:
# Only the first image is displayed in the next cell, so fetch a single row
# instead of collecting the whole image set onto the driver.
img = spark_df.select('content').take(1)

In [None]:
# Raw bytes of the first fetched row, decoded with PIL so the notebook
# displays the picture as the cell output.
first_img = img[0][0]
first_img_buffer = io.BytesIO(first_img)
Image.open(first_img_buffer)

In [None]:
# Show the column names and types of the loaded DataFrame.
spark_df.printSchema()

In [None]:
# Display the DataFrame again (it has not been reassigned since it was loaded).
spark_df.show()

## Preprocessing

In [None]:
# Bring the `content` column of every row back to the driver as a list of Rows.
# NOTE(review): this presumably requires the whole image set to fit in driver
# memory -- confirm for large datasets.
img = spark_df.select('content').collect()

In [None]:
def binary_img_to_array(rawdata):
    '''Decode one collected row into an RGB pixel array.

    Parameters
    ----------
    rawdata : sequence
        Row (or any indexable) whose first element holds the encoded image
        bytes.

    Returns
    -------
    numpy.ndarray
        The decoded pixels. The image is converted to RGB first so that
        grayscale, palette or RGBA files also produce three channels, which is
        what the 3-channel network input used later requires.
    '''
    with Image.open(io.BytesIO(rawdata[0])) as image:
        return np.asarray(image.convert('RGB'))

In [None]:
# Decode every collected row into a pixel array.
img_list = [binary_img_to_array(row) for row in img]

In [None]:
# preprocess_input was used here without ever being imported.
from keras.applications.vgg16 import preprocess_input

# Resize every image to the 224x224 network input, add a leading batch axis
# and apply the VGG16 input preprocessing.
batch_image = []
for image_array in img_list:
    # preserve_range=True keeps the pixel values on their original scale;
    # without it skimage rescales integer images to [0, 1], which does not
    # match the 0-255 range the VGG16 preprocessing is designed for.
    resized = resize(
        image_array,
        output_shape=(224, 224),
        order=2,
        anti_aliasing=True,
        preserve_range=True,
    )
    resized = np.expand_dims(resized, axis=0)
    batch_image.append(preprocess_input(resized))

## CNN

In [None]:
from keras.applications.vgg16 import VGG16 

In [None]:
#creation reseaux de neurones de mon modele de base
#taille de nos images 
# Base network of the model; IMG_SHAPE is the size of our images.
# Reference: https://keras.io/api/applications/vgg/#vgg16-function
#   input_shape : shape of the input data
#   include_top : whether to keep the top of the network; False here so that
#                 other layers could be added on the output
#   pooling     : limits the number of output features (VGG16 otherwise
#                 produced almost 100000 of them, which made the PCA fail)
IMG_SHAPE = (224, 224, 3)
base_model = VGG16(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=IMG_SHAPE,
)

In [None]:
# Print the layer-by-layer description of the base network.
base_model.summary()

In [None]:
# Run all preprocessed images through the network in a single predict() call
# instead of one call per image, then keep one flattened feature vector per
# image. The loop variable no longer reuses the name `img`, which holds the
# collected rows.
if batch_image:
    batch_array = np.concatenate(batch_image, axis=0)
    vgg16_feature = [features.flatten() for features in base_model.predict(batch_array)]
else:
    # Nothing to featurize; np.concatenate would raise on an empty list.
    vgg16_feature = []

In [None]:
# Stack the per-image feature vectors into one array and display its shape.
vgg16_feature_np=np.array(vgg16_feature)
vgg16_feature_np.shape

In [None]:
# Display the DataFrame once more (still unchanged since it was loaded).
spark_df.show()

## ACP

In [None]:
# fonction qui trace le graphique des eboulis des valeurs propres
def display_scree_plot(pca):
    '''Plot the cumulative explained variance, in percent, by component rank.

    ``pca`` must expose ``explained_variance_ratio_``. Ranks start at 1 on the
    x axis. Only the cumulative curve is drawn; no per-axis bars.
    '''
    variance_pct = pca.explained_variance_ratio_ * 100
    ranks = np.arange(len(variance_pct)) + 1
    cumulative_pct = variance_pct.cumsum()

    plt.plot(ranks, cumulative_pct, c="red", marker='.')
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show(block=False)

In [None]:
from sklearn import decomposition

In [None]:
# NOTE(review): a float between 0 and 1 is presumably interpreted by
# scikit-learn as the share of variance to keep rather than a maximum number
# of components -- confirm.
n_comp=0.99# value passed as n_components to the PCA

In [None]:
# PCA estimator configured with the n_comp setting.
pca=decomposition.PCA(n_components=n_comp)

In [None]:
# Fit the PCA on the VGG16 features.
# NOTE(review): fit() presumably returns the fitted estimator itself, so
# feat_pca would not hold projected data at this point -- confirm.
feat_pca= pca.fit(vgg16_feature_np)# fit only

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the cumulative explained variance of the fitted PCA.
display_scree_plot(pca)

In [None]:
# Cumulative sum of the explained variance ratios, component by component.
pca.explained_variance_ratio_.cumsum()

In [None]:
print("Dimensions dataset avant réduction PCA : ", vgg16_feature_np.shape)
# Reuse the n_comp setting defined earlier in the notebook instead of repeating
# the literal 0.99, so the threshold is configured in a single place.
pca = decomposition.PCA(n_components=n_comp)
# Fit the PCA and project the features onto the retained components.
feat_pca = pca.fit_transform(vgg16_feature_np)
print("Dimensions dataset après réduction PCA : ", feat_pca.shape)

In [None]:
# SparkContext of the active session, used to build an RDD.
sc = spark.sparkContext

In [None]:
# One column per retained PCA component, named "0", "1", ... The number of
# components depends on the data (the PCA keeps as many as its variance
# threshold requires), so it is read from feat_pca rather than fixed at 10.
pca_columns = [str(i) for i in range(feat_pca.shape[1])]

# Convert every projected row to plain Python floats before building the
# Spark DataFrame.
df = sc.parallelize(feat_pca).map(lambda x: [float(i) for i in x])\
        .toDF(pca_columns)

In [None]:
# Preview the Spark DataFrame holding the PCA-reduced features.
df.show()