# Importujemy biblioteki

In [2]:
# Importujemy biblioteki
import pandas as pd
import numpy as np
import os
import ast
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Reszta kodu dla proporcji 70_30

In [None]:
# Locations of the training inputs for the 70/30 split
train_texts_path = '../data/70_30/train_texts_w2v.csv'
train_labels_path = '../data/70_30/train_labels.csv'

# Load both CSV files, using the first column of each as the index.
X_train_raw = pd.read_csv(train_texts_path, index_col=0)
labels_frame = pd.read_csv(train_labels_path, index_col=0)
# squeeze() collapses length-1 axes; presumably the labels file has a single
# data column, which makes y_train a Series - verify against the CSV.
y_train = labels_frame.squeeze()

print("Dane treningowe wczytane poprawnie.")
print(f"X_train_raw shape: {X_train_raw.shape}")
print(f"y_train shape: {y_train.shape}")

# Relative frequency of each raw label value, before any remapping
print("Rozkład klas w zbiorze treningowym (przed mapowaniem):")
raw_class_shares = y_train.value_counts(normalize=True)
print(raw_class_shares)

# Keep only the rows whose index appears in both the features and the labels
aligned_pair = X_train_raw.align(y_train, join='inner', axis=0)
X_train_raw, y_train = aligned_pair

# Map star ratings onto sentiment classes
def map_sentiment(star_rating):
    """Translate a star rating into a sentiment class name.

    Args:
        star_rating: The rating value to classify.

    Returns:
        'Pozytywna' when the rating equals 4 or 5, 'Neutralna' when it
        equals 3, and 'Negatywna' for every other value.
    """
    if star_rating in (4, 5):
        return 'Pozytywna'
    return 'Neutralna' if star_rating == 3 else 'Negatywna'

# Replace every star rating with its sentiment class name
y_train = y_train.map(map_sentiment)

# Relative frequency of each sentiment class after the remapping
print("Rozkład klas w zbiorze treningowym (po mapowaniu):")
mapped_class_shares = y_train.value_counts(normalize=True)
print(mapped_class_shares)

# Parse the Word2Vec vectors: each cell of the 'text' column is evaluated as a
# Python literal, converted to an array, and all arrays are stacked along a
# new leading axis (so every row must yield the same shape).
X_train = np.stack(
    [np.array(ast.literal_eval(cell)) for cell in X_train_raw['text']]
)

print(f"Kształt danych wejściowych po konwersji: {X_train.shape}")

# Encode the labels: learn the class names, turn them into integer codes,
# then expand the codes into one-hot rows for the softmax output.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_train_categorical = to_categorical(y_train_encoded)

print(f"Zakodowane etykiety: {list(label_encoder.classes_)}")

# Input dimensions, taken from axes 1 and 2 of X_train (the per-sample shape
# handed to the first layer).
input_shape = (X_train.shape[1], X_train.shape[2])

# Build the LSTM classifier
model = Sequential()
# Time steps whose features all equal 0. are masked for the layers below
# (presumably these are padding positions - confirm against the vectorizer).
model.add(Masking(mask_value=0., input_shape=input_shape))
# return_sequences=False: only the final LSTM output feeds the dense head
model.add(LSTM(64, return_sequences=False))
model.add(Dense(32, activation='relu'))
# One softmax unit per column of the one-hot label matrix
model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line after the table.
model.summary()

# Train the model.
# No validation data is passed to fit(), so early stopping watches the
# training loss: it stops after 3 epochs without improvement and restores
# the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)

fit_options = dict(epochs=10, batch_size=128, callbacks=[early_stopping])
model.fit(X_train, y_train_categorical, **fit_options)

print("Model został wytrenowany.")

# Save the trained model to an .h5 file, creating the target directory first
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'w2v_LSTM_70_30_PNN.h5')
model.save(model_path)

print(f"Model zapisano w: {model_path}")

# Save the fitted label encoder next to the model so predictions can be
# decoded back into class names later
import pickle
encoder_path = os.path.join(model_dir, 'w2v_LSTM_70_30_PNN_label_encoder.pkl')
with open(encoder_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("LabelEncoder został zapisany.")


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.