In [None]:
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Enlarge the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
# SVG output makes the plots look sharper
%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

import tensorflow as tf

# Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Data
#### TRAIN

In [None]:
# Directory that train.csv and test.csv are read from in the cells below.
DATA_PATH = '/kaggle/input/sf-dl-movie-genre-classification/'
# Working directory; not referenced by any other visible cell.
PATH      = '/kaggle/working/'

In [None]:
# Load the labelled training set.
train = pd.read_csv(DATA_PATH + 'train.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Class balance: number of training samples per genre.
# The explicit Axes interface keeps the figure self-contained, and plt.show() stops the
# last call's Text(...) repr from being printed as the cell output.
fig, ax = plt.subplots(figsize=(12, 4))
train.genre.value_counts().plot(kind='bar', ax=ax, fontsize=10, rot=60)
ax.set_title("Training samples per genre", fontsize=10)
ax.set_xlabel("Genres", fontsize=10)
ax.set_ylabel("Counts", fontsize=10)
plt.show()

In [None]:
# Number of distinct genres (the number of target classes).
train.genre.nunique()

#### Test

In [None]:
# Load the unlabelled test set and preview its first rows.
test = pd.read_csv(DATA_PATH + 'test.csv')
test.head()

In [None]:
# Encode genre names as integer class ids.
# The raw names are kept in 'genre_name' so that re-running this cell always refits the
# encoder on the original strings. Refitting on the already-encoded 'genre' column would
# make le.inverse_transform return integers instead of genre names.
if 'genre_name' not in train.columns:
    train['genre_name'] = train['genre']

le = LabelEncoder()

train['genre'] = le.fit_transform(train['genre_name'])

train['genre']

## Text tokenization

In [None]:
import string

# Characters to strip from text: ASCII punctuation plus typographic quotes,
# the em dash, the ellipsis and the copyright sign.
extra_chars = '«»—…’‘”“©'
print(string.punctuation)

spec_chars = "".join((string.punctuation, extra_chars))
print(spec_chars)

def remove_chars_from_text(text, chars):
    """Return ``text`` with every character that occurs in ``chars`` removed.

    Args:
        text: The string to filter.
        chars: A container of characters to drop (membership is tested with ``in``).

    Returns:
        A new string made of the characters of ``text`` that are not in ``chars``,
        in their original order.
    """
    kept = []
    for symbol in text:
        if symbol in chars:
            continue
        kept.append(symbol)
    return "".join(kept)

In [None]:
# NOTE: disabled draft. It uses word_tokenize, nltk and english_stopwords, none of which
# are imported or defined in the visible cells.
# # Tokenize a review
# def text_tokenizer(text):
#     Text_ = text.strip()

#     Text_ = Text_.lower()

#     # Remove all special characters
#     Text_ = remove_chars_from_text(Text_, spec_chars)

#     # Remove all digits
#     Text_ = remove_chars_from_text(Text_, string.digits)

#     Text_ = Text_.replace('\n',' ').replace('\t',' ')

#     # Tokenize the text
#     tokens = word_tokenize(Text_)

#     # Convert the token list to an nltk Text object
#     token_text = nltk.Text(tokens)

#     # Drop stop words
#     filtered_token_text = [w for w in token_text if not w in english_stopwords]
    
#     return filtered_token_text

In [None]:
# Tokenize the reviews (disabled: text_tokenizer is itself commented out)

#train['text'] = train['text'].apply(lambda x: text_tokenizer(x))

Впоследствии отдельная токенизация не понадобилась, так как использовался встроенный токенизатор BERT.

# MODEL 


In [None]:
# Name of the pretrained checkpoint to fine-tune
model_name = 'bert-base-uncased'

# Maximum sequence length, in tokens
max_length = 100

# Load the checkpoint's configuration and switch off hidden-state outputs
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Load the BERT tokenizer for the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# Load the pretrained TensorFlow BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

In [None]:
# Take the main BERT layer out of the pretrained model
bert = transformer_model.layers[0]

# Model input: a fixed-length sequence of token ids.
# NOTE(review): no attention mask is fed to BERT, so padding positions are treated like
# real tokens. Adding an 'attention_mask' input would also require changing the
# tokenizer, fit and predict cells.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}

# Use BERT as a Keras layer; element [1] of its output is used as the pooled representation
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# Let Keras supply the training flag. Forcing training=False made this layer a no-op,
# so dropout never regularised the classification head during fine-tuning.
pooled_output = dropout(bert_model)

# Classification head: one softmax unit per genre
genre = Dense(units=train['genre'].nunique(), activation='softmax',
              kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
              name='genre')(pooled_output)

outputs = {'genre': genre}

model_2 = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

model_2.summary()

In [None]:
# Optimizer
# NOTE(review): `decay` is a legacy Keras optimizer argument; confirm the installed
# TensorFlow version still accepts it.
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Loss and metric for the 'genre' output.
# The 'genre' Dense layer already applies softmax, so the loss receives probabilities
# and must not be told to expect logits.
loss = {'genre': CategoricalCrossentropy(from_logits=False)}
metric = {'genre': CategoricalAccuracy('accuracy')}

# Compile the model
model_2.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

# One-hot encode the integer genre ids
y_genre = to_categorical(train['genre'])

# Tokenize the training texts.
# padding='max_length' always yields exactly max_length ids per sample, matching the
# model's Input(shape=(max_length,)). padding=True only pads to the longest sample.
x = tokenizer(
    text=train['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=False)

In [None]:
# Start fine-tuning; Keras holds out 20% of the samples for validation.
fit_options = {'validation_split': 0.2, 'batch_size': 64, 'epochs': 10}
history = model_2.fit(
    x={'input_ids': x['input_ids']},
    y={'genre': y_genre},
    **fit_options)

In [None]:
# Save the trained model to 'keras_2st_softmax.h5' (relative to the current working directory).
model_2.save('keras_2st_softmax.h5')

In [None]:
# load_model = tf.keras.models.load_model('/kaggle/input/keras-model-1st/keras_1st.h5')

In [None]:
# Tokenize the test texts with the same settings as the training texts.
# padding='max_length' always yields exactly max_length ids per sample, matching the
# model's Input(shape=(max_length,)). padding=True only pads to the longest test sample,
# which may be shorter than max_length.
x_test = tokenizer(
    text=test['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=False)

In [None]:
# Predict on the tokenized test set; later cells index the result by output name ('genre').
predictions_2 = model_2.predict(x={'input_ids': x_test['input_ids']})
predictions_2

In [None]:
# Save the raw class probabilities: one row per test sample, one column per class,
# with columns named '<output>_<class id>'.
# Formatting a whole array with "%s" writes numpy's printed form, which is abbreviated
# with "..." for large arrays, so the predictions could not be read back from the file.
pred_frames = [
    pd.DataFrame(np.asarray(probs)).add_prefix(f'{key}_')
    for key, probs in predictions_2.items()
]
pd.concat(pred_frames, axis=1).to_csv('pred_2.csv', index=False)

In [None]:
# Predicted class id per test sample: the index of the largest score in each row.
sub_list = list(np.argmax(predictions_2['genre'], axis=1))

In [None]:
# Quick look at the predicted class ids as a one-column frame.
pd.DataFrame(sub_list)

In [None]:
# Wrap the predicted class ids in a one-column frame named 'genre'.
submission = pd.DataFrame({'genre': sub_list})

In [None]:
# Fall back to the previously saved submission only when this session has not produced
# one. Loading it unconditionally discarded the `submission` frame built from the fresh
# model predictions.
if 'submission' not in globals():
    submission = pd.read_csv('/kaggle/input/submission-2/submission_2st.csv')

In [None]:
# Inspect the submission frame before decoding the genre ids.
submission

In [None]:
# Map integer class ids back to genre names.
# Skip the step when the column no longer holds integers (the cell was re-run, or the
# frame came from a CSV saved after decoding); inverse_transform would raise on labels
# it has not seen.
if pd.api.types.is_integer_dtype(submission['genre']):
    submission['genre'] = le.inverse_transform(submission['genre'])

In [None]:
# Final submission layout: a 1-based running id followed by the genre.
# NOTE(review): assumes the test ids are 1..N in file order; confirm against test.csv.
n_rows = len(submission)
submission = pd.DataFrame({
    'id': range(1, n_rows + 1),
    'genre': submission['genre'].values,
})[['id', 'genre']]


In [None]:
# Write the submission to 'submission_new.csv' without the index column.
submission.to_csv('submission_new.csv', index=False)

In [None]:
# Inspect the final submission frame.
submission

In [None]:
# Write the same frame again under a second file name, then preview its first rows.
submission.to_csv('submission_2st.csv', index=False)
submission.head()

Точность пока не особо радует - лишь 0,65. 
Нужно еще покрутить fine-tuning и попробовать XLNet