In [29]:
# VEKTORIZACIJA

# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(X):
    tfidf_vectorizer = TfidfVectorizer(norm='l2', max_features=200)
    tfidf_vectorizer.fit(X)
    tfidf_vectors = tfidf_vectorizer.transform(X)
    return tfidf_vectors

In [30]:
# TF-IDF atveju stemingas ir lemavimas (reikalingi failai gauti iš preprocess.ipynb):

import pandas as pd

stemming_df = pd.read_csv("cleaned_data/stemming.csv")
lemmatization_df = pd.read_csv("cleaned_data/lemmatization.csv")


In [31]:
stemming_df

Unnamed: 0,DESCRIPTION,GENRE
0,"['listen', 'convers', 'doctor', 'parent', '10y...",drama
1,"['brother', 'sister', 'past', 'incestu', 'rela...",thriller
2,"['bu', 'empti', 'student', 'field', 'trip', 'm...",adult
3,"['help', 'unemploy', 'father', 'make', 'end', ...",drama
4,"['film', 'titl', 'refer', 'unrecov', 'bodi', '...",drama
...,...,...
54081,"['shortliv', 'nbc', 'live', 'sitcom', 'center'...",comedy
54082,"['next', 'gener', 'exploit', 'sister', 'kapa',...",horror
54083,"['ze', 'bestaan', 'echt', 'standup', 'comedi',...",documentary
54084,"['walter', 'vivian', 'live', 'countri', 'diffi...",comedy


In [32]:
lemmatization_df

Unnamed: 0,DESCRIPTION,GENRE
0,"['listening', 'conversation', 'doctor', 'paren...",drama
1,"['brother', 'sister', 'past', 'incestuous', 'r...",thriller
2,"['bus', 'empty', 'student', 'field', 'trip', '...",adult
3,"['help', 'unemployed', 'father', 'make', 'end'...",drama
4,"['film', 'title', 'refers', 'unrecovered', 'bo...",drama
...,...,...
54081,"['shortlived', 'nbc', 'live', 'sitcom', 'cente...",comedy
54082,"['next', 'generation', 'exploitation', 'sister...",horror
54083,"['ze', 'bestaan', 'echt', 'standup', 'comedy',...",documentary
54084,"['walter', 'vivian', 'live', 'country', 'diffi...",comedy


In [33]:
tf_idf_vectors_stemming = tf_idf(stemming_df['DESCRIPTION'])

In [34]:
tf_idf_vectors_lemmatization = tf_idf(lemmatization_df["DESCRIPTION"])

In [35]:
# Stemingo atveju

print(tf_idf_vectors_stemming.toarray().shape) # Kokio dydzio matrica sukure?
print(tf_idf_vectors_stemming.nnz) # Skaiciu, kuriu yra ne nulis

(54086, 200)
727501


In [36]:
# Lematizavimo atveju

print(tf_idf_vectors_lemmatization.toarray().shape)
print(tf_idf_vectors_lemmatization.nnz) 

(54086, 200)
650859


Tekstas vektorizuotas. CNN atveju tikėsis 'array' tipo X, bei y reik konvertuot į atitinkamas one-hot encoded reikšmes
Kintamųjų pavadinimai toliau sutrumpinti: lem - lemmatization, stem - stemming

In [37]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(sparse_output=False)

X_lem = tf_idf_vectors_lemmatization.toarray()
y_lem = onehot_encoder.fit_transform(lemmatization_df["GENRE"].values.reshape(-1, 1))

X_stem = tf_idf_vectors_stemming.toarray()
y_stem = onehot_encoder.fit_transform(stemming_df["GENRE"].values.reshape(-1, 1))


In [38]:
# Is tikruju zanrai siuo atveju turetu but taip pat užkoduoti, bet patikrinu:
import numpy as np
are_equal = np.array_equal(y_stem, y_lem)
print(f"Ar žanrai taip pat užkoduoti: {are_equal}")

Ar žanrai taip pat užkoduoti: True


Dabar turime lematizuotus ir steminguotus vektorius X, bei žanrų masyvus y, paruoštus modelio mokymui.

LEMATIZAVIMO atveju:

In [39]:
import numpy as np

# Pakeičiama forma TF-IDF vektorių į 3D, nes to tikėsis CNN
X_lem_reshaped = X_lem.reshape((X_lem.shape[0], X_lem.shape[1], 1))

In [40]:
from sklearn.model_selection import train_test_split
X_lem_train, X_lem_test, y_lem_train, y_lem_test = train_test_split(X_lem_reshaped, y_lem, test_size=0.2, random_state=42)

In [41]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input

# Apsirašomas CNN modelis
model_lem_tftidf_cnn  = Sequential([
    Input(shape=(X_lem_train.shape[1], 1)), 
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Dropout(0.5),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_lem_train.shape[1], activation='softmax')
])

# Sukompiliuojamas modelis
model_lem_tftidf_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [42]:
from tensorflow.keras.callbacks import EarlyStopping

# Apsirasomas ankstyvas mokymosi sustabdymas.
early_stopping = EarlyStopping(
    monitor='val_loss',  # Galima naudot ir 'val_accuracy'
    patience=3,  # Kai tiek epochų modelis nebetobuleja - bus sustabdytas mokymasis
    restore_best_weights=True
)

In [43]:
# Modelis mokosi apie 4 min
history_lem_tfidf_cnn = model_lem_tftidf_cnn.fit(X_lem_train, y_lem_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 23ms/step - accuracy: 0.3577 - loss: 2.2806 - val_accuracy: 0.4219 - val_loss: 1.9784
Epoch 2/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4125 - loss: 2.0257 - val_accuracy: 0.4312 - val_loss: 1.9249
Epoch 3/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4217 - loss: 1.9765 - val_accuracy: 0.4288 - val_loss: 1.9120
Epoch 4/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4186 - loss: 1.9628 - val_accuracy: 0.4366 - val_loss: 1.9122
Epoch 5/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 24ms/step - accuracy: 0.4236 - loss: 1.9490 - val_accuracy: 0.4381 - val_loss: 1.8871
Epoch 6/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4302 - loss: 1.9215 - val_accuracy: 0.4369 - val_loss: 1.8866
Epoc

In [44]:
# Kad nereiktu kiekviena kart leisti modelio mokymosi (auksciau esanciam bloke), galima ji issaugoti:
# Turi but uztikrinta kad tame failo direktorijoje yra aplankalas 'models'
model_lem_tftidf_cnn.save('models/lem_tfidf_cnn.keras')


In [45]:
# Pavizdys, kaip reiketu apmokyta modeli uzkrauti

from tensorflow.keras.models import load_model

model_lem_tftidf_cnn = load_model('models/lem_tfidf_cnn.keras')
model_lem_tftidf_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # iš naujo kompiliuojame su optimizatoriumi

In [46]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Spetos etiketes (skaiciukai atitikantys zanrus)
y_pred = model_lem_tftidf_cnn.predict(X_lem_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_lem_test.argmax(axis=1) 

accuracy_f1_lem_tfidf_cnn = round(accuracy_score(y_true_classes, y_pred_classes), 3)
f1_lem_tfidf_cnn = round(f1_score(y_true_classes, y_pred_classes, average='weighted'), 3)

print("Lemavimas + TF-IDF + CNN")
print(f"F1 Score: {f1_lem_tfidf_cnn}")
print(f"Tikslumas: {accuracy_f1_lem_tfidf_cnn}")


[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
Lemavimas + TF-IDF + CNN
F1 Score: 0.354
Tikslumas: 0.443


Tai Lemavimo + TF-IDF + CNN atveju gauname:
Tikslumas: 0.443
F1: 0.354

Toliau stemingas + TF-IDF + CNN:

In [19]:
import numpy as np

# Pakeičiama forma TF-IDF vektorių į 3D, nes to tikėsis CNN
X_stem_reshaped = X_stem.reshape((X_stem.shape[0], X_stem.shape[1], 1))

In [47]:
from sklearn.model_selection import train_test_split
X_stem_train, X_stem_test, y_stem_train, y_stem_test = train_test_split(X_stem_reshaped, y_stem, test_size=0.2, random_state=42)

In [48]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input

# Apsirašomas CNN modelis
model_stem_tfidf_cnn = Sequential([
    Input(shape=(X_stem_train.shape[1], 1)), 
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Dropout(0.5),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_stem_train.shape[1], activation='softmax')
])

# Sukompiliuojamas modelis
model_stem_tfidf_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [49]:
from tensorflow.keras.callbacks import EarlyStopping

# Apsirasomas ankstyvas mokymosi sustabdymas.
early_stopping = EarlyStopping(
    monitor='val_loss',  # Galima naudot ir 'val_accuracy'
    patience=3,  # Kai tiek epochų modelis nebetobuleja - bus sustabdytas mokymasis
    restore_best_weights=True
)

In [50]:
# Modelis mokosi apie 4 min
history_stem_tfidf_cnn = model_stem_tfidf_cnn.fit(X_stem_train, y_stem_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 23ms/step - accuracy: 0.3172 - loss: 2.4076 - val_accuracy: 0.4188 - val_loss: 1.9989
Epoch 2/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4115 - loss: 2.0535 - val_accuracy: 0.4299 - val_loss: 1.9221
Epoch 3/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4271 - loss: 1.9846 - val_accuracy: 0.4330 - val_loss: 1.8925
Epoch 4/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4221 - loss: 1.9626 - val_accuracy: 0.4347 - val_loss: 1.8877
Epoch 5/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4227 - loss: 1.9610 - val_accuracy: 0.4371 - val_loss: 1.8771
Epoch 6/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4311 - loss: 1.9286 - val_accuracy: 0.4408 - val_loss: 1.8712
Epoc

In [51]:
# Taip pat issaugomas modelis, kad kiekviena kart jo nereiketu mokyti:
model_stem_tfidf_cnn.save('models/stem_tfidf_cnn.keras')

In [52]:
from tensorflow.keras.models import load_model

model_stem_tfidf_cnn = load_model('models/stem_tfidf_cnn.keras')

In [53]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

y_pred = model_stem_tfidf_cnn.predict(X_stem_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_stem_test.argmax(axis=1) 

f1_stem_tfidf_cnn = round(f1_score(y_true_classes, y_pred_classes, average='weighted'), 3)
accuracy_stem_tfidf_cnn = round(accuracy_score(y_true_classes, y_pred_classes), 3)


print("Stemingas + TF-IDF + CNN")
print(f"F1 Score: {f1_stem_tfidf_cnn}")
print(f"Tikslumas: {accuracy_stem_tfidf_cnn}")

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
Stemingas + TF-IDF + CNN
F1 Score: 0.339
Tikslumas: 0.439


Gautos stemingo + TF-IDF + cnn metrikos:
Tikslumas: 0.439
F1: 0.339

Buvo Lemavimo + TF-IDF + CNN atveju:
Tikslumas: 0.443
F1: 0.354

Tad skirtumas ar lemavimas ar stemingas buvo naudotas CNN atveju nesudarė didelių skirtumų, kad limituoti naudojamus variantus, analizė bus tesiama su stemingu.

In [54]:
# SVC (Support vector classifier) modelis

In [55]:
# Is naujo apsirasau, nes SVC reikalauja kitokio input nei CNN atveju.

from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("cleaned_data/stemming.csv")
X = tf_idf(stemming_df['DESCRIPTION'])
y = stemming_df['GENRE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
from sklearn.svm import SVC

model_stem_tfidf_svc = SVC()
history_stem_tfidf_svc = model_stem_tfidf_svc.fit(X_train, y_train)

In [57]:
# issaugojimas modelio:
# svc neturi atributo save, tai reikia kitaip:

from joblib import dump
dump(model_stem_tfidf_svc, 'models/model_stem_tfidf_svc.joblib')

['models/model_stem_tfidf_svc.joblib']

In [58]:
from joblib import load

model_stem_tfidf_svc = load('models/model_stem_tfidf_svc.joblib')

In [59]:
y_pred = model_stem_tfidf_svc.predict(X_test)
y_pred

array([' drama ', ' drama ', ' documentary ', ..., ' drama ',
       ' documentary ', ' documentary '], dtype=object)

In [60]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

f1_stem_tfidf_svc = round(f1_score(y_test, y_pred, average='weighted'), 3)
accuracy_stem_tfidf_svc = round(accuracy_score(y_test, y_pred), 3)

print(f"Tikslumas: {accuracy_stem_tfidf_svc}, F1: {f1_stem_tfidf_svc}")

Tikslumas: 0.471, F1: 0.402


Paprasto SVC atveju gavome geresni tikslumas ir F1 nei baziniu CNN. (0.471 > 0.443, 0.402 > 0.354)

LSTM modelis.

In [61]:
# Is naujo apsirasau, nes LSTM reikalauja kitokio input jei CNN ir SVC atveju.

from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("cleaned_data/stemming.csv")
X = tf_idf(stemming_df['DESCRIPTION'])

In [62]:
X = X.toarray()

In [63]:
# LSTM modeliui reikalinga ivestis 
X = X.reshape((X.shape[0], 1, X.shape[1]))

In [64]:
y_encoded = onehot_encoder.fit_transform(stemming_df["GENRE"].values.reshape(-1, 1))

In [65]:
X.shape

(54086, 1, 200)

In [66]:
y_encoded.shape

(54086, 27)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [70]:

from tensorflow.keras.layers import LSTM, Dense, SpatialDropout1D

model_stem_tfidf_lstm = Sequential()
model_stem_tfidf_lstm.add(SpatialDropout1D(0.2))
model_stem_tfidf_lstm.add(LSTM(100, dropout=0.1, recurrent_dropout=0.2))
model_stem_tfidf_lstm.add(Dense(27, activation='softmax'))
model_stem_tfidf_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [71]:
from tensorflow.keras.callbacks import EarlyStopping

callback = [EarlyStopping(monitor='val_loss', min_delta=0, patience=2, mode='auto')]
history_stem_tfidf_lstm = model_stem_tfidf_lstm.fit(X_train, y_train, epochs=10, batch_size=32,validation_data=(X_test,y_test), callbacks=callback)

Epoch 1/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.3660 - loss: 2.3756 - val_accuracy: 0.4460 - val_loss: 1.8896
Epoch 2/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4304 - loss: 1.9382 - val_accuracy: 0.4640 - val_loss: 1.8119
Epoch 3/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4390 - loss: 1.8870 - val_accuracy: 0.4627 - val_loss: 1.7776
Epoch 4/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4371 - loss: 1.8444 - val_accuracy: 0.4658 - val_loss: 1.7672
Epoch 5/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.4440 - loss: 1.8295 - val_accuracy: 0.4698 - val_loss: 1.7568
Epoch 6/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.4425 - loss: 1.8302 - val_accuracy: 0.4684 - val_loss: 1.7527
Epoch 7/10
[1m1

In [72]:
y_pred = model_stem_tfidf_lstm.predict(X_test)
y_pred

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


array([[0.17858176, 0.00246581, 0.03932749, ..., 0.0596112 , 0.00647582,
        0.01319527],
       [0.00949327, 0.00295327, 0.01420088, ..., 0.01966343, 0.00065109,
        0.0016308 ],
       [0.00972187, 0.00437902, 0.00543028, ..., 0.02277772, 0.0007758 ,
        0.00429612],
       ...,
       [0.00866502, 0.00712521, 0.01910072, ..., 0.00987565, 0.00668057,
        0.00145104],
       [0.07787254, 0.00074805, 0.02200148, ..., 0.01088151, 0.00235594,
        0.01125201],
       [0.00697261, 0.00420403, 0.00578174, ..., 0.0062598 , 0.00038751,
        0.00057033]], dtype=float32)

In [73]:
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1) 

In [74]:
f1_stem_tfidf_lstm = round(f1_score(y_true_classes, y_pred_classes, average='weighted'), 3)
accuracy_stem_tfidf_lstm = round(accuracy_score(y_true_classes, y_pred_classes), 3)

print(f"Tikslumas: {accuracy_stem_tfidf_lstm}, F1: {f1_stem_tfidf_lstm}")

Tikslumas: 0.47, F1: 0.397


In [None]:
# LSTM suveike labai panasiai kaip SVC siuo atveju.

DABAR tie patys trys modeliai CNN, LSTM, SVC bandomi su stemingu + word2vec

In [75]:
from gensim.models import Word2Vec
import pandas as pd

df = pd.read_csv("cleaned_data/stemming.csv")

In [76]:
import ast

def string_to_lists(row):
    try:
        row = ast.literal_eval(row)
    except:
        pass
    return row

In [77]:
descriptions = df['DESCRIPTION'].apply(string_to_lists)

In [78]:
word2vec_model = Word2Vec(descriptions, vector_size=200, window=10, min_count=3, workers=4)

In [79]:
word_vectors = word2vec_model.wv

In [80]:
word_vectors.key_to_index

{'life': 0,
 'one': 1,
 'film': 2,
 'live': 3,
 'stori': 4,
 'find': 5,
 'world': 6,
 'year': 7,
 'new': 8,
 'get': 9,
 'love': 10,
 'take': 11,
 'famili': 12,
 'young': 13,
 'time': 14,
 'two': 15,
 'make': 16,
 'friend': 17,
 'man': 18,
 'work': 19,
 'peopl': 20,
 'day': 21,
 'becom': 22,
 'come': 23,
 'father': 24,
 'show': 25,
 'way': 26,
 'girl': 27,
 'first': 28,
 'home': 29,
 'tri': 30,
 'also': 31,
 'meet': 32,
 'go': 33,
 'follow': 34,
 'back': 35,
 'help': 36,
 'documentari': 37,
 'old': 38,
 'mother': 39,
 'woman': 40,
 'want': 41,
 'son': 42,
 'look': 43,
 'citi': 44,
 'like': 45,
 'turn': 46,
 'decid': 47,
 'end': 48,
 'wife': 49,
 'set': 50,
 'three': 51,
 'school': 52,
 'see': 53,
 'start': 54,
 'begin': 55,
 'town': 56,
 'place': 57,
 'use': 58,
 'hous': 59,
 'tell': 60,
 'chang': 61,
 'person': 62,
 'music': 63,
 'daughter': 64,
 'even': 65,
 'discov': 66,
 'leav': 67,
 'mani': 68,
 'kill': 69,
 'return': 70,
 'forc': 71,
 'play': 72,
 'journey': 73,
 'death': 74,
 'wo

In [81]:
document_embeddings = []
for tokens in descriptions:
    word_embeddings = [word_vectors[word] for word in tokens if word in word_vectors]
    if word_embeddings:
        document_embedding = sum(word_embeddings) / len(word_embeddings)
    else:
        document_embedding = [0.0] * 200
    document_embeddings.append(document_embedding)

In [82]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_document_embeddings = scaler.fit_transform(document_embeddings)

In [83]:
X = scaled_document_embeddings
y = onehot_encoder.fit_transform(stemming_df["GENRE"].values.reshape(-1, 1))

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Stemming + Word2Vec + CNN

In [85]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Apsirašomas CNN modelis 
model_stem_wordvec_cnn = Sequential([
    Input(shape=(X_train.shape[1], 1)),  
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Dropout(0.5),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='softmax')
])

# Sukompiliuojamas modelis
model_stem_wordvec_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [86]:
from tensorflow.keras.callbacks import EarlyStopping

# Apsirasomas ankstyvas mokymosi sustabdymas.
early_stopping = EarlyStopping(
    monitor='val_loss',  # Galima naudot ir 'val_accuracy'
    patience=3,  # Kai tiek epochų modelis nebetobuleja - bus sustabdytas mokymasis
    restore_best_weights=True
)

In [87]:
# Modelis mokosi apie 4 min
history_stem_wordvec_cnn = model_stem_wordvec_cnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 23ms/step - accuracy: 0.3597 - loss: 2.2494 - val_accuracy: 0.4839 - val_loss: 1.8766
Epoch 2/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4467 - loss: 1.9258 - val_accuracy: 0.4920 - val_loss: 1.7373
Epoch 3/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4553 - loss: 1.8799 - val_accuracy: 0.4993 - val_loss: 1.7428
Epoch 4/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4603 - loss: 1.8331 - val_accuracy: 0.5088 - val_loss: 1.6843
Epoch 5/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4676 - loss: 1.8053 - val_accuracy: 0.5037 - val_loss: 1.6661
Epoch 6/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - accuracy: 0.4694 - loss: 1.7942 - val_accuracy: 0.5049 - val_loss: 1.6463
Epoc

In [88]:
# issaugojimas
model_stem_wordvec_cnn.save('models/model_stem_wordvec_cnn.keras')

In [89]:
# Uzkrovimas issaugoto modelio

from tensorflow.keras.models import load_model
model_stem_wordvec_cnn = load_model('models/model_stem_wordvec_cnn.keras')

In [90]:
y_pred = model_stem_wordvec_cnn.predict(X_test)
y_pred

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


array([[9.4410613e-02, 2.9639041e-03, 3.0168671e-02, ..., 7.8234002e-02,
        3.5943172e-03, 1.7354399e-03],
       [4.4781920e-03, 1.3830594e-03, 8.6351596e-03, ..., 6.4346604e-03,
        1.4277286e-03, 3.7453757e-04],
       [1.1521258e-03, 9.0423319e-03, 6.0910205e-03, ..., 4.3110582e-03,
        6.3256957e-05, 2.2181788e-05],
       ...,
       [3.6405330e-03, 1.9913728e-03, 5.7395748e-03, ..., 2.7241418e-03,
        1.3934788e-03, 2.4272301e-04],
       [4.9695000e-02, 6.2846212e-04, 2.7148476e-02, ..., 2.1305813e-02,
        1.4765844e-02, 5.1160357e-03],
       [5.8731274e-04, 1.7601098e-05, 1.5449696e-03, ..., 4.6609747e-04,
        2.6799407e-04, 2.3109119e-06]], dtype=float32)

In [91]:
# Konvertuojam i daugiausia procentu duotas klases
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1) 

In [92]:
f1_stem_wordvec_cnn = round(f1_score(y_true_classes, y_pred_classes, average='weighted'), 3)
accuracy_stem_wordvec_cnn = round(accuracy_score(y_true_classes, y_pred_classes), 3)

print(f"Tikslumas: {accuracy_stem_wordvec_cnn}, F1: {f1_stem_wordvec_cnn}")

Tikslumas: 0.512, F1: 0.427


Word2VEC labai pagerino modelio veikima, siuo atveju tai veike geriau nei visi bandyti CNN, SVC, LSTM su tf-idf vektorizacija.

Stemming + Word2Vec + SVC

In [93]:
from sklearn.model_selection import train_test_split
import pandas as pd

# SVC atveju nereikia encodinti zanru.
y = stemming_df['GENRE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
from sklearn.svm import SVC

model_stem_wordvec_svc = SVC()
history_stem_wordvec_svc = model_stem_wordvec_svc.fit(X_train, y_train)

In [95]:
from joblib import dump
dump(model_stem_wordvec_svc, 'models/model_stem_wordvec_svc.joblib')

['models/model_stem_wordvec_svc.joblib']

In [96]:
model_stem_wordvec_svc = load('models/model_stem_wordvec_svc.joblib')

In [97]:
y_pred = model_stem_wordvec_svc.predict(X_test)
y_pred

array([' action ', ' drama ', ' short ', ..., ' drama ', ' documentary ',
       ' documentary '], dtype=object)

In [98]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

f1_stem_wordvec_svc = round(f1_score(y_test, y_pred, average='weighted'), 3)
accuracy_stem_wordvec_svc = round(accuracy_score(y_test, y_pred), 3)

print(f"Tikslumas: {accuracy_stem_wordvec_svc}, F1: {f1_stem_wordvec_svc}")

Tikslumas: 0.568, F1: 0.522


Gauti žymiai geresni rezultatai SVC su word2vec atveju nei prieš tai buve.

Stemming + Word2Vec + LSTM 

In [99]:
y = onehot_encoder.fit_transform(stemming_df["GENRE"].values.reshape(-1, 1))

In [100]:
print(X.shape, y.shape)

(54086, 200) (54086, 27)


In [101]:
X = X.reshape((X.shape[0], 1, X.shape[1]))

In [102]:
X.shape

(54086, 1, 200)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
from tensorflow.keras.layers import LSTM, Dense, SpatialDropout1D

model_stem_wordvec_lstm = Sequential()
model_stem_wordvec_lstm.add(SpatialDropout1D(0.2))
model_stem_wordvec_lstm.add(LSTM(100, dropout=0.1, recurrent_dropout=0.2))
model_stem_wordvec_lstm.add(Dense(27, activation='softmax'))
model_stem_wordvec_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [105]:
from tensorflow.keras.callbacks import EarlyStopping

callback = [EarlyStopping(monitor='val_loss', min_delta=0, patience=2, mode='auto')]
history_stem_wordvec_lstm = model_stem_wordvec_lstm.fit(X_train, y_train, epochs=10, batch_size=32,validation_data=(X_test,y_test), callbacks=callback)

Epoch 1/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.3781 - loss: 2.1776 - val_accuracy: 0.4661 - val_loss: 1.7712
Epoch 2/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4265 - loss: 1.9283 - val_accuracy: 0.4874 - val_loss: 1.7069
Epoch 3/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4271 - loss: 1.9071 - val_accuracy: 0.4943 - val_loss: 1.6856
Epoch 4/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4271 - loss: 1.8841 - val_accuracy: 0.4766 - val_loss: 1.7042
Epoch 5/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4216 - loss: 1.9007 - val_accuracy: 0.4758 - val_loss: 1.6793
Epoch 6/10
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4296 - loss: 1.8683 - val_accuracy: 0.4795 - val_loss: 1.6758
Epoch 7/10
[1m1

In [106]:
y_pred = model_stem_wordvec_lstm.predict(X_test)
y_pred

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


array([[0.15853055, 0.00144819, 0.03760834, ..., 0.08100306, 0.01150532,
        0.00449546],
       [0.00414513, 0.00344206, 0.01010975, ..., 0.00906015, 0.00060203,
        0.00050352],
       [0.00269915, 0.01384826, 0.00904027, ..., 0.0065204 , 0.00016134,
        0.00076548],
       ...,
       [0.00943357, 0.00837182, 0.0100858 , ..., 0.01022983, 0.00192649,
        0.00052781],
       [0.10129371, 0.00149402, 0.01660184, ..., 0.01496115, 0.03977218,
        0.01162436],
       [0.00493572, 0.00029041, 0.0065052 , ..., 0.00361356, 0.00251637,
        0.00105068]], dtype=float32)

In [107]:
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1) 

f1_stem_wordvec_lstm = round(f1_score(y_true_classes, y_pred_classes, average='weighted'), 3)
accuracy_stem_wordvec_lstm = round(accuracy_score(y_true_classes, y_pred_classes), 3)

print(f"Tikslumas: {accuracy_stem_wordvec_lstm}, F1: {f1_stem_wordvec_lstm}")

Tikslumas: 0.488, F1: 0.401


In [110]:
# Gauti rezultatai
import pandas as pd

# Create a dictionary to organize the data
data = {
    'Modelis': ['CNN', 'CNN', 'SVC', 'LSTM', 'CNN', 'SVC', 'LSTM'],
    'Apdorojimas': ['TF-IDF Lemavimas', 'TF-IDF Stemingas', 'TF-IDF Stemingas', 'TF-IDF Stemingas', 
                     'Word2Vec Stemingas', 'Word2Vec Stemingas', 'Word2Vec Stemingas'],
    'Tikslumas': [
        accuracy_f1_lem_tfidf_cnn,
        accuracy_stem_tfidf_cnn,
        accuracy_stem_tfidf_svc,
        accuracy_stem_tfidf_lstm,
        accuracy_stem_wordvec_cnn,
        accuracy_stem_wordvec_svc,
        accuracy_stem_wordvec_lstm
    ],
    'F1': [
        f1_lem_tfidf_cnn,
        f1_stem_tfidf_cnn,
        f1_stem_tfidf_svc,
        f1_stem_tfidf_lstm,
        f1_stem_wordvec_cnn,
        f1_stem_wordvec_svc,
        f1_stem_wordvec_lstm
    ]
}

df = pd.DataFrame(data)
df_sorted = df.sort_values(by='Tikslumas', ascending=False)
print(df_sorted)


  Modelis         Apdorojimas  Tikslumas     F1
5     SVC  Word2Vec Stemingas      0.568  0.522
4     CNN  Word2Vec Stemingas      0.512  0.427
6    LSTM  Word2Vec Stemingas      0.488  0.401
2     SVC    TF-IDF Stemingas      0.471  0.402
3    LSTM    TF-IDF Stemingas      0.470  0.397
0     CNN    TF-IDF Lemavimas      0.443  0.354
1     CNN    TF-IDF Stemingas      0.439  0.339


  Modelis     Apdorojimas       Tikslumas     F1     
1.  SVC     Word2Vec Stemingas    0.567     0.520 
2.  CNN     Word2Vec Stemingas    0.502     0.411
3.  LSTM    Word2Vec Stemingas    0.485     0.401
4.  SVC     TF-IDF Stemingas      0.471     0.402
5.  LSTM    TF-IDF Stemingas      0.470     0.401
6.  CNN     TF-IDF Lemavimas      0.442     0.350
7.  CNN     TF-IDF Stemingas      0.439     0.344

Parametru optimizavimo pabandymas, su geriausiu gautu variantu Stemingas + Word2Vec + SVC:

In [111]:

from sklearn.model_selection import train_test_split
import pandas as pd

y = stemming_df['GENRE']
X = X.reshape(X.shape[0], X.shape[2])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Modelio optimizavimas taikant gardeles paieska, taciau del dideliu duomenu kiekiu gali trukti kelias valandas.

param_grid = {
    'C': [0.1, 1, 10],              # Reguliarizacijos parametras
    'kernel': ['linear', 'rbf', 'poly'],  # Kernelio tipas
    'class_weight': [None, 'balanced']  # Klasiu svoriai
}

svc_model = SVC()
grid_search = GridSearchCV(estimator=svc_model, param_grid=param_grid, 
                           cv=5,  # Number of folds in cross-validation
                           scoring='accuracy',  # Metric to evaluate the performance
                           n_jobs=-1,  # Naudoti visus imanomus procesorius
                           verbose=1)  # Spausdinamos info kiekis

grid_search.fit(X_train, y_train)

print(f"Geriausi parametrai: {grid_search.best_params_}")
print(f"Geriausias rezultatas: {grid_search.best_score_}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Geriausi parametrai: {'C': 1, 'class_weight': None, 'kernel': 'rbf'}
Geriausias rezultatas: 0.5651751072892826


Kodel modelis liko toks pat/nepagerejo po optimizacijos? Nes geriausius parametrus, kuriuos jis surado - ir buvo nutyleti pagrindiniai SVC parametrai.

Išvados:
1. Geriausiai pasirodė Stemingo + Word2Vec + SVC derinys su 57% tikslumu ir didžiausiu F1 rodikliu. Tai atrodo ne tik kaip geras rezultatas, atsižvelgiant kad turėjome 27 klases tarp kurių prognozuota. Šis modelis ne tik pasiekė didžiausia tikslumą, bet ir geriausias F1 rodo, kad jis puikiai suderino klasių dydžių disbalansą.
2. Visais atvejais Word2Vec vektorizavimo būdas buvo ženkliai pranašesnis už tf-idf vektorius.
3. Stemingas ir Lemavimas suteikia panašius rezultatus, tad jei taupomas laikas, kartais neverta visų kombinacijų išbandyti. 


Rekomendacijos tobulinimui:
1. Daugiau privengti pasikartojančio kodo iškeliant jį į funkcijas.
2. Sugalvoti tinkamesnį užvadinimą train/test data pasikartojantiems kintamiesiems, nes labai maišosi.
3. Galima pabandyti pakeitinėti vektorizavimo technikų parametrus.
4. Galima išbandyti, gal kai kurie variantai geriau su lematizavimu veiktu.
5. Realaus pasaulio duomenims reiktu sukurti patogia funkcija/klase kuria galima butu juos taip pat apdoroti, kaip ir mokymosi duomenis (pvz.: steminguoti, vektorizuoti), prieš taikant modeli. 
6. Naudoti seed()/random_state pythone, tam kad išlaikyti tuos pačius modelio gautus rezultus (šiuo atveju per vėlai suprasta ir panaudota ne visur).
