In [1]:
import os
import numpy as np
import pandas as pd
import cv2
import xgboost
from xgboost import XGBClassifier
import mediapipe

## CARGA MODELOS

Colocar el nombre/ruta del modelo a probar

In [2]:
# Handles to the MediaPipe Holistic solution and its drawing helpers.
mp_holistic = mediapipe.solutions.holistic
mp_drawing = mediapipe.solutions.drawing_utils

# Path of the trained XGBoost model to test; change this value to try another model.
MODEL_PATH = '../../02_modelos/XGBoostMuestreoRandom'

# Fail early with a readable message instead of an opaque library error
# when the path does not point to an existing file.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(f'Model file not found: {MODEL_PATH}')

model = XGBClassifier(eval_metric='mlogloss')
model.load_model(MODEL_PATH)

# Show the class ids known to the loaded model.
print(model.classes_)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


### UTILIDADES
Funciones para la obtención de puntos y para dibujar puntos sobre imágenes.

In [3]:
# Signs handled by the loaded model; a predicted class id is used as an index into this list.
SIGNS_LIST = ['nacer','comida','brillante', 'mujer', 'hijo', 'hombre', 'lejos', 'aprender', 'espumadera','amargo','leche','Uruguay','pais','donde','ninguno','nombre','perfume','sordo','comprar','encontrar', 'nave espacial']

# Complete list of signs in dataset order: the position of a sign (plus one)
# is the sign number used in the name of its original video file.
FULL_SIGNS_LIST = ['opaco', 'rojo', 'verde', 'amarillo', 'brillante', 'celeste', 'colores', 'rosa', 'mujer', 'enemigo', 'hijo', 'hombre', 'lejos','cajón','nacer','aprender','llamar','espumadera','amargo','dulce','leche','agua','comida','Argentina','Uruguay','pais','donde','apellido','burla','cumpleanos','desayuno','foto','hambre','mapa','moneda','musica','nave espacial','ninguno','nombre','paciencia','perfume','sordo','trampa','arroz','asado','caramelo','chicle','fideos','yogurt','aceptar','agradecer','apagar','aparecer','aterrizar','atrapar','ayudar','bailar','bañarse','comprar','copiar','correr','darse cuenta','dar','encontrar']

NUMBER_OF_PERSONS = 10
NUMBER_OF_VIDEOS_PER_PERSON = 5

# Number of consecutive frames that make up one sample for the classifier.
AMOUNT_OF_FRAMES = 10

FACE_POINTS = 468
# Of the 33 pose points only the first 23 are kept (head, shoulders, arms);
# the waist and below are left out because of how the dataset videos are framed.
posePointIndexes = list(range(23))
LEFT_HAND_POINTS = 21
RIGHT_HAND_POINTS = 21

# Toggle this flag to include facial points (EYEBROWS and MOUTH) in the features.
USE_FACE_POINTS = False

# Face mesh index references:
# https://github.com/tensorflow/tfjs-models/commit/838611c02f51159afdd77469ce67f0e26b7bbb23#diff-e5d31503f11c6bae62542ea89982152514b81906dff0b718e44708bcf22aa361
# https://github.com/ManuelTS/augmentedFaceMeshIndices/blob/master/Left_Eye.jpg
# https://github.com/google/mediapipe/blob/a908d668c730da128dfa8d9f6bd25d519d006692/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png

# Points shared by several of the groups below are listed only once. Because the
# facial points come first in the data extracted from the videos, these indexes
# address them directly.
rightEyebrowUpper = [156, 70, 63, 105, 66, 107, 55, 193]
rightEyebrowLower = [35, 124, 46, 53, 52, 65]

leftEyebrowUpper  = [383, 300, 293, 334, 296, 336, 285, 417]
leftEyebrowLower  = [265, 353, 276, 283, 282, 295]

lipsUpperOuter    = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291]
lipsLowerOuter    = [146, 91, 181, 84, 17, 314, 405, 321, 375]
lipsUpperInner    = [78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308]
lipsLowerInner    = [95, 88, 178, 87, 14, 317, 402, 318, 324]

facePointsIndexes = (
    rightEyebrowUpper + rightEyebrowLower
    + leftEyebrowUpper + leftEyebrowLower
    + lipsUpperOuter + lipsLowerOuter
    + lipsUpperInner + lipsLowerInner
)

# (name, number of points) of every landmark group, in the order in which its
# columns appear within a frame. The face group is present only when enabled.
landmark_groups = []
if USE_FACE_POINTS:
    landmark_groups.append(('face', len(facePointsIndexes)))
landmark_groups.extend([
    ('pose', len(posePointIndexes)),
    ('left_hand', LEFT_HAND_POINTS),
    ('right_hand', RIGHT_HAND_POINTS),
])

# Feature column names: for each frame, each group, each point -> its x then its y.
frame_columns = [
    f'fr_{frame_number}_{group_name}_p{point_number}_{axis}'
    for frame_number in range(AMOUNT_OF_FRAMES)
    for group_name, point_count in landmark_groups
    for point_number in range(point_count)
    for axis in ('x', 'y')
]

In [4]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on one BGR frame.

    Args:
        image: frame in BGR channel order (as produced by OpenCV capture).
        model: object exposing ``process(rgb_image)``, e.g. a Holistic instance.

    Returns:
        Tuple ``(bgr_image, results)``: a BGR copy of the frame obtained by
        converting to RGB and back, and whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The frame is flagged read-only only for the duration of the inference call.
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detection

# Styles for the points drawn on the image, one list per landmark group:
# [color_0, color_1, color_2, thickness, circle_radius]. The three color
# components are handed to DrawingSpec in this order.
leftHandStyles = [0, 138, 255, 2, 1]   # orange
rightHandStyles = [231, 217, 0, 2, 1]  # light blue
faceStyles = [80, 110, 10, 0, 1]
poseStyles = [70, 100, 5, 2, 1]

def draw_styled_landmarks(image, results):
    """Draw face, pose, left-hand and right-hand landmarks on ``image`` in place.

    Args:
        image: frame to draw on.
        results: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # Drawing order is kept: face, pose/body, left hand, right hand.
    layers = (
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, faceStyles),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, poseStyles),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, leftHandStyles),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, rightHandStyles),
    )
    for landmarks, connections, style in layers:
        spec = mp_drawing.DrawingSpec(
            color=(style[0], style[1], style[2]),
            thickness=style[3],
            circle_radius=style[4],
        )
        mp_drawing.draw_landmarks(image, landmarks, connections, spec)

def array_from_landmarks(results):
    """Turn one frame's detection results into the flat feature list.

    For each landmark group the (x, y) pairs are flattened; a group that was
    not detected is replaced by zeros (468 face, 33 pose, 21 per hand). The
    groups are concatenated as face, pose, left hand, right hand, passed
    through ``values_validation`` and then the features are selected:
    optional face points (when ``USE_FACE_POINTS``), the first
    ``len(posePointIndexes)`` pose points and all points of both hands.

    Args:
        results: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Returns:
        list with the selected coordinates, x then y for every point.
    """
    def flat_xy(landmark_list, expected_points):
        # Flattened [x0, y0, x1, y1, ...] or zeros when the group is missing.
        if landmark_list:
            return np.array([[point.x, point.y] for point in landmark_list.landmark]).flatten()
        return np.zeros(expected_points * 2)

    face_xy = flat_xy(results.face_landmarks, 468)
    pose_xy = flat_xy(results.pose_landmarks, 33)
    left_hand_xy = flat_xy(results.left_hand_landmarks, 21)
    right_hand_xy = flat_xy(results.right_hand_landmarks, 21)

    frame = np.concatenate((face_xy, pose_xy, left_hand_xy, right_hand_xy))
    values_validation(frame)

    # Offsets inside the concatenated array.
    pose_start = 936                       # 468 face points * 2
    hands_start, hands_end = 1002, 1086    # after 33 pose points; 2 hands * 21 points * 2

    selected = []
    # FACE
    if USE_FACE_POINTS:
        for point_index in facePointsIndexes:
            selected.extend((frame[point_index * 2], frame[point_index * 2 + 1]))  # x, y
    # POSE
    selected.extend(
        frame[position]
        for position in range(pose_start, pose_start + len(posePointIndexes) * 2)
    )
    # HANDS
    selected.extend(frame[position] for position in range(hands_start, hands_end))

    return selected

def values_validation(frame):
    """Clamp, in place, every element of ``frame`` that is greater than 1 to 1.

    Values less than or equal to 1 (including negative ones) are left as they are.
    Nothing is returned.
    """
    for position, value in enumerate(frame):
        if value > 1:
            frame[position] = 1

### PRUEBA CON CÁMARA EN TIEMPO REAL

In [5]:
# Real-time test: read frames from camera device 0, keep a sliding window of the
# last AMOUNT_OF_FRAMES frames of keypoints and show the sign predicted for it.
caption = cv2.VideoCapture(0)
caption.set(cv2.CAP_PROP_FPS, 60)
resultsForVideo = []  # every class id predicted during the session

# Number of feature values contributed by a single frame.
framePointsLength = len(frame_columns) // AMOUNT_OF_FRAMES
sentence = ""  # name of the most recently predicted sign
i = 0          # number of frames processed so far

try:
    with mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as holisticModel:
        framesWindow = []
        while caption.isOpened():
            ret, frame = caption.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to the
                # color conversion, which would raise.
                print('No frame received from the camera; stopping.')
                break
            i = i + 1

            image, results = mediapipe_detection(frame, holisticModel)
            draw_styled_landmarks(image, results)
            keypoints = array_from_landmarks(results)

            # Every frame is used (no frame skipping).
            framesWindow.extend(keypoints)

            # Sliding window: drop the oldest frame once the window is over capacity.
            if len(framesWindow) > len(frame_columns):
                framesWindow = framesWindow[framePointsLength:]

            # Predict as soon as the window holds AMOUNT_OF_FRAMES frames.
            if len(framesWindow) >= len(frame_columns):
                res = model.predict([framesWindow])
                resultsForVideo.append(res[0])
                sentence = SIGNS_LIST[res[0]]

            # `sentence` is a plain string, so it is drawn as is; joining it with
            # spaces would separate its characters ("n a c e r").
            cv2.putText(image, sentence, (10,20), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (255,255,255), 2)
            # Only ASCII is used in the label: Hershey fonts cannot render other symbols.
            cv2.putText(image, 'Frame #: ' + str(i), (10,50), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (255,255,255), 2)
            cv2.imshow('Predicting sign..', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):  # press 'q' to stop
                break
finally:
    # Always free the camera and close the window, even after an error or interrupt.
    caption.release()
    cv2.destroyAllWindows()

### PRUEBA DESDE VIDEOS

Se procesarán los videos de cada seña de la lista. De ser necesario, se pueden comentar los 3 bucles (`for`) y colocar el nombre/ruta del video específico a procesar.

In [6]:
# Batch test: run the classifier over the dataset videos of every sign in
# SIGNS_LIST and collect, per video, the class id predicted for each window.

# Folder holding the cut dataset clips; adjust it to the local dataset location.
VIDEOS_DIR = 'C:/Users/facur/Desktop/tesis_LSA/codigos_datos_tesis/LSA64/all_cut'

resultsIndexes = []  # one list of predicted class ids per fully processed video
indexAndName = []    # per video: sign name and its index in SIGNS_LIST (the expected class)

# Number of feature values contributed by a single frame.
framePointsLength = len(frame_columns) // AMOUNT_OF_FRAMES

with mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as holisticModel:
    for sign, sign_name in enumerate(FULL_SIGNS_LIST):
        # Only signs handled by the model are evaluated.
        if sign_name not in SIGNS_LIST:
            continue
        for j in range(NUMBER_OF_PERSONS):
            for k in range(NUMBER_OF_VIDEOS_PER_PERSON):
                # File name pattern: <sign>_<person>_<repetition>.mp4, each
                # 1-based and zero-padded to three digits.
                video_path = f'{VIDEOS_DIR}/{sign + 1:03d}_{j + 1:03d}_{k + 1:03d}.mp4'
                caption = cv2.VideoCapture(video_path)
                print(video_path)

                framesWindow = []
                resultsForVideo = []
                try:
                    while caption.isOpened():
                        ret, frame = caption.read()
                        if not ret:
                            # End of the video: store its predictions and expected class.
                            print("Se acabó")
                            resultsIndexes.append(resultsForVideo)
                            indexAndName.append({'name': sign_name, 'index': SIGNS_LIST.index(sign_name)})
                            break

                        image, results = mediapipe_detection(frame, holisticModel)
                        draw_styled_landmarks(image, results)
                        keypoints = array_from_landmarks(results)

                        # Every frame is used (no frame skipping).
                        framesWindow.extend(keypoints)

                        # Sliding window: drop the oldest frame once over capacity.
                        if len(framesWindow) > len(frame_columns):
                            framesWindow = framesWindow[framePointsLength:]

                        # Predict as soon as the window holds AMOUNT_OF_FRAMES frames.
                        if len(framesWindow) >= len(frame_columns):
                            res = model.predict([framesWindow])
                            resultsForVideo.append(res[0])

                        cv2.imshow('Predicting sign..', image)

                        # 'q' abandons only the current video; its partial
                        # predictions are not stored.
                        if cv2.waitKey(10) & 0xFF == ord('q'):
                            break
                finally:
                    # Always free the video and close the window, even when the
                    # cell is interrupted or an error is raised.
                    caption.release()
                    cv2.destroyAllWindows()

C:/Users/facur/Desktop/tesis_LSA/codigos_datos_tesis/LSA64/all_cut/005_001_001.mp4
C:/Users/facur/Desktop/tesis_LSA/codigos_datos_tesis/LSA64/all_cut/005_001_002.mp4


KeyboardInterrupt: 

In [None]:
# Per-video report: sorted predictions, expected class index,
# "<correct>::<total>" and a histogram of the predicted classes.
for video_number, predictions in enumerate(resultsIndexes):
    predictions.sort()

    # Count how many times each class was predicted (keys are the class ids as text).
    dicts = {}
    for predicted_class in predictions:
        key = str(predicted_class)
        dicts[key] = dicts.get(key, 0) + 1

    # Predictions made for the video, sorted by class in ascending order.
    print(predictions)
    # Index of the expected word in SIGNS_LIST.
    expected_index = indexAndName[video_number]['index']
    print(expected_index)
    # Number of correct predictions against the total number of predictions.
    print(f"{predictions.count(expected_index)}::{len(predictions)}")
    # Predicted classes and how many times each one appears for the video.
    print(dicts)

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 15, 15, 18, 18]
2
51::56
{'2': 51, '6': 1, '15': 2, '18': 2}
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
2
41::62
{'2': 41, '6': 4, '15': 17}
[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 15, 15]
2
55::68
{'1': 9, '2': 55, '6': 2, '15': 2}


In [None]:
# Manual cleanup: release the last opened capture and close any OpenCV window,
# e.g. after one of the cells above was interrupted before reaching its own cleanup.
caption.release()
cv2.destroyAllWindows()