In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import efficientnet.tfkeras as efn
import os
import string
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import backend as K
import tensorflow.keras.layers as L

In [2]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# NOTE(review): this runs after `import tensorflow`; presumably it must be set
# before the first GPU device is initialised to take effect — confirm.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'  # GPU memory-growth switch

In [3]:
# Record the library versions this notebook was executed with.
library_versions = {
    "pandas version: ": pd.__version__,
    "numpy version: ": np.__version__,
    "tensorflow version: ": tf.__version__,
}
for caption, version in library_versions.items():
    print(caption, version)

pandas version:  1.2.2
numpy version:  1.19.5
tensorflow version:  2.4.1


In [4]:
def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Counts the entries where the clipped, rounded product ``y_true * y_pred``
    is 1 and divides by the count of clipped, rounded ``y_true`` entries.
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    Both sums run over every element of the tensors passed in.
    """
    matched = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(matched) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    Counts the entries where the clipped, rounded product ``y_true * y_pred``
    is 1 and divides by the count of clipped, rounded ``y_pred`` entries.
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    Both sums run over every element of the tensors passed in.
    """
    matched = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(matched) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [5]:
# Load the inference set; later cells rely on its 'guid' and 'description' columns.
test = pd.read_csv('test.csv')

In [6]:
# Built once: deletes every character listed in string.punctuation (ASCII only).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def RemovePunctuation(text):
    """Lower-case ``text`` and strip ASCII punctuation from it.

    Parameters
    ----------
    text : str or any
        A description string, or a missing-value marker such as ``NaN`` or ``None``.

    Returns
    -------
    str or any
        The cleaned string. Anything that is not a ``str`` (``NaN``, ``None``,
        numbers) is returned unchanged, so missing values stay detectable
        downstream with ``notnull()``.

    Notes
    -----
    Only the characters in ``string.punctuation`` are removed; non-ASCII
    punctuation such as typographic quotes or dashes is left in place.
    """
    # Type check instead of comparing str(text) to 'nan': the old guard let
    # None and other non-strings through to .lower(), raising AttributeError.
    if not isinstance(text, str):
        return text
    return text.lower().translate(_PUNCTUATION_TABLE)

In [7]:
# Load the pickled label encoder; `le.inverse_transform` is used later to turn
# predicted class indices back into typology names.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load artefacts that come from a trusted training run.
with open('Models_text/LabelEncoder.pickle', 'rb') as handle:
    le = pickle.load(handle)

In [8]:
# Normalise descriptions in place: lower-case and strip punctuation (missing values pass through).
test['description'] = test['description'].apply(RemovePunctuation)

In [9]:
# Load the pickled tokenizer; it converts the cleaned descriptions into
# integer sequences via `texts_to_sequences` further down.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load artefacts that come from a trusted training run.
with open('Models_text/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [10]:
### Text model inference (item descriptions)

In [11]:
# Tokenise every non-missing description and pad each sequence to a fixed length.
max_len = 80
has_description = test['description'].notnull()
described_texts = test.loc[has_description, 'description'].tolist()
X_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(described_texts),
    maxlen=max_len,
    padding='post',
)

In [12]:
# Collect predictions from every fold model (*.h5) stored in Models_text/.
predicts = []
for item in os.listdir("Models_text/"):
    if not item.endswith(".h5"):
        continue
    model_path = f"Models_text/{item}"
    print(model_path)
    # f1_m is passed so Keras can resolve the custom metric saved with the model.
    model = tf.keras.models.load_model(model_path, custom_objects={'f1_m': f1_m})
    predicts.append(model.predict(X_test))

Models_text/model_text_fold_0.h5
Models_text/model_text_fold_1.h5
Models_text/model_text_fold_2.h5
Models_text/model_text_fold_3.h5
Models_text/model_text_fold_4.h5
Models_text/model_text_fold_5.h5
Models_text/model_text_fold_6.h5
Models_text/model_text_fold_7.h5
Models_text/model_text_fold_8.h5
Models_text/model_text_fold_9.h5


In [13]:
# Map each guid that has a description to the fold-averaged text-model output.
# The row filter is the same one used to build X_test, so order matches.
described_guids = test.loc[test['description'].notnull(), 'guid'].tolist()
fold_average = np.array(predicts).mean(axis=0)
dictionary_predict_LSTM = dict(zip(described_guids, fold_average))

### Image model inference

In [17]:
# Image files are looked up as "<guid>.jpg" inside the images directory.
test['image_id'] = test['guid'] + '.jpg'

In [18]:
# Generator that multiplies every pixel value by 1/511 before it reaches the model.
# NOTE(review): 511 is unusual — 8-bit images are normally scaled by 1/255.
# Presumably this mirrors the training pipeline; confirm before changing it,
# because inference must use the same scaling the model was trained with.
datagen = ImageDataGenerator(rescale=1. / 511)

In [19]:
# Names of all .jpg files present in the images directory.
list_images = [
    filename
    for filename in os.listdir("images")
    if filename.endswith(".jpg")
]

In [20]:
# Flag rows whose image file exists on disk (1) or not (0).
# Set-backed isin replaces a per-row `x in list` scan, which cost O(rows * files).
available_images = set(list_images)
test['have_image'] = test['image_id'].isin(available_images).astype('int64')

In [21]:
# Stream the available images in dataframe order. class_mode=None yields
# images only, and shuffle=False keeps the order so predictions can later be
# paired with guids.
rows_with_image = test[test['have_image'] == 1]
total_generator = datagen.flow_from_dataframe(
    dataframe=rows_with_image,
    directory="images",
    x_col="image_id",
    target_size=(512, 512),
    batch_size=8,
    class_mode=None,
    shuffle=False,
    seed=42,
)

Found 1117 validated image filenames.


In [22]:
# Load the image classifier and score every image yielded by the generator.
image_model_path = "Models_images/long_models_for_inference/model_images_2.h5"
# f1_m is passed so Keras can resolve the custom metric saved with the model.
model = tf.keras.models.load_model(image_model_path, custom_objects={'f1_m': f1_m})
predict = model.predict(total_generator)

In [23]:
# Map each guid that has an image to the image-model output for that image.
image_guids = test.loc[test['have_image'] == 1, 'guid'].tolist()

# zip() silently truncates to the shorter input. The generator only yields the
# filenames it validated, so if it skipped any row, every following prediction
# would be attached to the wrong guid. Fail loudly instead.
if len(image_guids) != len(predict):
    raise ValueError(
        f"Image prediction count ({len(predict)}) does not match the number "
        f"of rows with have_image == 1 ({len(image_guids)})"
    )

dictionary_predict_images = dict(zip(image_guids, predict))

In [24]:
def _decode_typology(probabilities):
    """Return the label-encoder class name of the highest-scoring entry."""
    return le.inverse_transform([np.argmax(probabilities, axis=0)])[0]


def final_predict_typology(x, threshold=0.7, default_label='документы'):
    """Choose the final typology for one guid from the text and image predictions.

    Parameters
    ----------
    x : any
        Item guid; it is converted with ``str`` before being looked up in
        ``dictionary_predict_LSTM`` and ``dictionary_predict_images``.
    threshold : float, default 0.7
        Minimum top score at which a single model's prediction is accepted
        when both models have an output for the guid.
    default_label : str, default 'документы'
        Label returned when neither model has an output for the guid.

    Returns
    -------
    The class name produced by ``le.inverse_transform``, or ``default_label``.

    Decision order
    --------------
    1. Only one model has an output: use it.
    2. Both have outputs: the text model wins if its top score reaches
       ``threshold``; otherwise the image model wins if its top score does;
       otherwise the two score vectors are averaged element-wise.
    3. Neither has an output: ``default_label``.
    """
    key = str(x)
    text_scores = dictionary_predict_LSTM.get(key)
    image_scores = dictionary_predict_images.get(key)

    if text_scores is None and image_scores is None:
        return default_label
    if text_scores is None:
        return _decode_typology(image_scores)
    if image_scores is None:
        return _decode_typology(text_scores)

    # Both models answered: the text model is checked first, so it takes
    # priority when both are confident.
    if text_scores.max() >= threshold:
        return _decode_typology(text_scores)
    if image_scores.max() >= threshold:
        return _decode_typology(image_scores)

    # Neither is confident: fall back to the mean of the two score vectors.
    return _decode_typology(np.vstack((text_scores, image_scores)).mean(axis=0))

In [25]:
# Resolve the final typology for every guid in the inference set.
test['typology'] = test['guid'].apply(final_predict_typology)

In [26]:
# Write the two-column submission file (no index column).
submission = test[['guid', 'typology']]
submission.to_csv('submission.csv', index=False)

In [27]:
# Preview the first rows, including the newly added 'typology' predictions.
test.head(20)

Unnamed: 0,guid,description,typology,image_id,have_image
0,a74014d7-3054-45bd-b284-ac117ccd2001,,предметы нумизматики,a74014d7-3054-45bd-b284-ac117ccd2001.jpg,1
1,783eab83-a48c-4950-996a-386fe93a3ccd,образец волос удмурты,"предметы прикладного искусства, быта и этнографии",783eab83-a48c-4950-996a-386fe93a3ccd.jpg,1
2,8e7e2514-fab3-4287-be06-aae93c5397e0,,графика,8e7e2514-fab3-4287-be06-aae93c5397e0.jpg,1
3,dc1964ce-9248-4fe7-9d4b-c110ac270f64,фрески фрагмент фон из комплекта фресок фрагме...,живопись,dc1964ce-9248-4fe7-9d4b-c110ac270f64.jpg,1
4,1dd23c5e-bf5d-45b7-9894-07bfdf91ae80,афиша королевские цветы муз а рыбникова вороне...,документы,1dd23c5e-bf5d-45b7-9894-07bfdf91ae80.jpg,0
5,094e1f33-5488-43be-b7b7-e595a9a3a054,,предметы нумизматики,094e1f33-5488-43be-b7b7-e595a9a3a054.jpg,1
6,280d15c6-9331-4f7b-9a59-8e488f93d870,этюд пароход на реке,живопись,280d15c6-9331-4f7b-9a59-8e488f93d870.jpg,1
7,c3d3a4fc-c0bd-43e6-ac98-6c502bb7e793,фотооткрытка актер балакирев а в жизни,документы,c3d3a4fc-c0bd-43e6-ac98-6c502bb7e793.jpg,0
8,552acc89-76e2-4e1d-9035-9ef287ac600e,документ учетная карточка на мобилизованного к...,документы,552acc89-76e2-4e1d-9035-9ef287ac600e.jpg,1
9,2f281a43-71e7-450b-b4c6-0373bb944f17,,предметы печатной продукции,2f281a43-71e7-450b-b4c6-0373bb944f17.jpg,1
