# In [None]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer
from sklearn.metrics import accuracy_score


# --- Dataset location -------------------------------------------------------
# The data is expected under ../input. The first entry returned by
# os.listdir() is used as the dataset folder; when ../input is empty the root
# itself is used.
INPUT_ROOT = '../input'
directories = os.listdir(INPUT_ROOT)
BASE_PATH = os.path.join(INPUT_ROOT, directories[0]) if directories else '../input'

# If the chosen folder contains a 'dataset' sub-folder, descend into it.
if os.path.exists(os.path.join(BASE_PATH, 'dataset')):
    BASE_PATH = os.path.join(BASE_PATH, 'dataset')

TRAIN_IMG_DIR = os.path.join(BASE_PATH, 'train')
TEST_IMG_DIR = os.path.join(BASE_PATH, 'test')

# --- Labels file ------------------------------------------------------------
# Candidate file names are probed in order; the first one that exists wins.
# LABELS_FILE stays None when none of them is present.
possible_labels = ['train.csv', 'labels.csv', 'train_labels.csv']
LABELS_FILE = next(
    (
        candidate
        for candidate in (os.path.join(BASE_PATH, name) for name in possible_labels)
        if os.path.exists(candidate)
    ),
    None,
)

# --- Training hyper-parameters ----------------------------------------------
N_FOLDS = 10                 # number of StratifiedKFold splits per model
BATCH_SIZE = 128             # mini-batch size for MLP fit/predict
CONFIDENCE_THRESHOLD = 0.95  # min. ensemble probability for a test row to be pseudo-labeled


def _center_pixel_features(img, c, integrate):
    """Return the 1-D feature vector for the pixel at ``(c, c)`` of ``img``."""
    # Integer images (unsigned ones in particular) would wrap around in
    # np.diff and in the band-ratio subtraction below; promote them to float.
    if img.dtype.kind in 'iu':
        img = img.astype(np.float64)

    center = img[c, c, :]

    # First and second discrete differences along the band axis.
    d1 = np.diff(center)
    d2 = np.diff(d1)

    # Per-band mean / std over the 3x3 window around the centre pixel.
    neighbors = img[c - 1:c + 2, c - 1:c + 2, :]
    n_mean = np.mean(neighbors, axis=(0, 1))
    n_std = np.std(neighbors, axis=(0, 1))

    # Raw third / fourth central moments (not normalised by the std, so these
    # are not the textbook skewness / kurtosis) and the trapezoidal area.
    centered = center - np.mean(center)
    skew = np.mean(centered ** 3)
    kurt = np.mean(centered ** 4)
    area = integrate(center)

    # Normalised difference between the last and the first band; the epsilon
    # guards against a zero denominator.
    ratio_ir_blue = (center[-1] - center[0]) / (center[-1] + center[0] + 1e-6)

    return np.concatenate([
        center, d1, d2, n_mean, n_std, [skew, kurt, area, ratio_ir_blue]
    ])


def extract_features(dataframe, folder, center_idx=9):
    """Build one feature vector per row of ``dataframe`` from ``.npy`` images.

    For every row the array ``folder/<row['filename']>`` is loaded and the
    pixel at ``(center_idx, center_idx)`` is described by: its band values,
    their first and second differences, the per-band mean and std of the 3x3
    neighbourhood, and four scalars (third and fourth central moments,
    trapezoidal area, last-vs-first band ratio).

    Args:
        dataframe: DataFrame with a ``filename`` column and, optionally, a
            ``label`` column.
        folder: Directory containing the ``.npy`` files.
        center_idx: Row/column index of the pixel of interest (default 9,
            the value previously hard-coded). Should be >= 1 so that the 3x3
            neighbourhood slice is non-empty.

    Returns:
        ``(features, labels)`` as NumPy arrays. ``labels`` holds
        ``row['label']`` when the column exists and ``0`` otherwise. Rows
        whose file cannot be loaded or indexed are skipped (with a
        ``RuntimeWarning``), so the outputs may be shorter than
        ``dataframe``; callers that pair results with file names
        positionally must check the lengths.

    Raises:
        KeyError: If ``dataframe`` has no ``filename`` column.
    """
    import warnings  # local import: not part of the module-level imports

    # np.trapz is deprecated in favour of np.trapezoid since NumPy 2.0;
    # use whichever exists.
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz

    has_label = 'label' in dataframe.columns
    features = []
    labels = []

    for _, row in dataframe.iterrows():
        filename = row['filename']
        try:
            img = np.load(os.path.join(folder, filename))
            feat = _center_pixel_features(img, center_idx, integrate)
        except (OSError, EOFError, ValueError, IndexError, TypeError, KeyError) as exc:
            # Best-effort: a bad or missing file drops this row only, but the
            # drop is reported instead of being silently swallowed.
            warnings.warn(
                f"extract_features: skipping {filename!r}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        features.append(feat)
        labels.append(row['label'] if has_label else 0)

    return np.array(features), np.array(labels)

print(">>> [1/5] Încărcare și procesare date...")
# Fail early with a clear message instead of letting pd.read_csv(None) raise.
if LABELS_FILE is None:
    raise FileNotFoundError(
        f"No labels file ({', '.join(possible_labels)}) found in {BASE_PATH!r}"
    )
df = pd.read_csv(LABELS_FILE)
# Rows labelled 4 or 5 are excluded from training.
df_filtered = df[~df['label'].isin([4, 5])].copy()

X_train_orig, y_train_orig = extract_features(df_filtered, TRAIN_IMG_DIR)

# Test files are sorted so that predictions can be paired with file names by
# position when the submission is written.
test_files = sorted(f for f in os.listdir(TEST_IMG_DIR) if f.endswith('.npy'))
df_test = pd.DataFrame({'filename': test_files})
X_test, _ = extract_features(df_test, TEST_IMG_DIR)

# extract_features skips files it cannot process. For the test set that would
# shift every later prediction relative to test_files and only surface as a
# length mismatch after all models have been trained, so stop here instead.
if len(X_test) != len(test_files):
    raise RuntimeError(
        f"Feature extraction produced {len(X_test)} rows for "
        f"{len(test_files)} test files; predictions could not be aligned "
        "with file names."
    )

# Map the remaining class labels to contiguous integers 0..NUM_CLASSES-1.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_orig)
NUM_CLASSES = len(le.classes_)


print(">>> [2/5] Scalare date (PowerTransformer)...")
scaler = PowerTransformer(method='yeo-johnson')

# Fit the transform on training data only, then apply it to the test set.
X_train_scaled = scaler.fit_transform(X_train_orig)
X_test_scaled = scaler.transform(X_test)


def train_lgbm(X, y, X_test_curr):
    """Train LightGBM with stratified K-fold CV and average test probabilities.

    One classifier is fitted per fold (N_FOLDS folds, shuffled, seed 42) and
    each one predicts class probabilities for ``X_test_curr``.

    Args:
        X: Training feature matrix, indexable by integer index arrays.
        y: Integer-encoded training labels aligned with ``X``.
        X_test_curr: Feature matrix to predict on.

    Returns:
        Array of shape ``(len(X_test_curr), NUM_CLASSES)`` holding the mean of
        the per-fold predicted probabilities.
    """
    splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    lgbm_params = dict(
        n_estimators=1500,
        learning_rate=0.03,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.7,
        class_weight='balanced',
        device='gpu',  # requests GPU training
        verbose=-1,
    )
    test_proba_sum = np.zeros((len(X_test_curr), NUM_CLASSES))

    for train_rows, valid_rows in splitter.split(X, y):
        fold_model = lgb.LGBMClassifier(**lgbm_params)
        # The held-out fold is passed as eval_set; no early-stopping callback
        # is configured in this call.
        fold_model.fit(
            X[train_rows],
            y[train_rows],
            eval_set=[(X[valid_rows], y[valid_rows])],
        )
        test_proba_sum += fold_model.predict_proba(X_test_curr)

    return test_proba_sum / N_FOLDS

def build_mlp(input_dim):
    """Create and compile the MLP classifier.

    Architecture: three hidden blocks of Dense -> BatchNormalization -> ReLU ->
    Dropout with (512, 0.4), (256, 0.3) and (128, 0.2) units/dropout, followed
    by a softmax layer with NUM_CLASSES outputs.

    Args:
        input_dim: Number of input features.

    Returns:
        A compiled Keras ``Sequential`` model (Adam, lr=0.001, sparse
        categorical cross-entropy, accuracy metric).
    """
    hidden_spec = ((512, 0.4), (256, 0.3), (128, 0.2))

    stack = [Input(shape=(input_dim,))]
    for units, drop_rate in hidden_spec:
        stack += [
            Dense(units, kernel_initializer='he_normal'),
            BatchNormalization(),
            Activation('relu'),
            Dropout(drop_rate),
        ]
    stack.append(Dense(NUM_CLASSES, activation='softmax'))

    net = Sequential(stack)
    net.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

def train_mlp(X, y, X_test_curr):
    """Train the MLP with stratified K-fold CV and average test probabilities.

    A fresh model from ``build_mlp`` is trained on every fold (N_FOLDS folds,
    shuffled, seed 42) for up to 80 epochs, with the held-out fold used as
    validation data for the learning-rate schedule and early stopping.

    Args:
        X: Training feature matrix, indexable by integer index arrays.
        y: Integer-encoded training labels aligned with ``X``.
        X_test_curr: Feature matrix to predict on.

    Returns:
        Array of shape ``(len(X_test_curr), NUM_CLASSES)`` holding the mean of
        the per-fold predicted probabilities.
    """
    splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    test_proba_sum = np.zeros((len(X_test_curr), NUM_CLASSES))

    for train_rows, valid_rows in splitter.split(X, y):
        net = build_mlp(X.shape[1])

        # Halve the learning rate after 4 epochs without val_loss improvement.
        lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, verbose=0)
        # Stop after 12 such epochs and roll back to the best weights seen.
        stopper = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)

        net.fit(
            X[train_rows],
            y[train_rows],
            epochs=80,
            batch_size=BATCH_SIZE,
            validation_data=(X[valid_rows], y[valid_rows]),
            callbacks=[lr_schedule, stopper],
            verbose=0,
        )

        test_proba_sum += net.predict(X_test_curr, batch_size=BATCH_SIZE, verbose=0)

    return test_proba_sum / N_FOLDS


# ---------------------------------------------------------------------------
# Stage 1: train both models on the original labelled data.
# LightGBM receives the raw features, the MLP the power-transformed ones.
# ---------------------------------------------------------------------------
print("\n>>> [3/5] ETAPA 1: Antrenare pe datele originale...")
p_lgbm_1 = train_lgbm(X_train_orig, y_train_encoded, X_test)
p_mlp_1 = train_mlp(X_train_scaled, y_train_encoded, X_test_scaled)

# Weighted blend of the two probability matrices (40% LightGBM, 60% MLP).
preds_stage1 = (0.4 * p_lgbm_1) + (0.6 * p_mlp_1)

# ---------------------------------------------------------------------------
# Stage 2: pseudo-labeling. Test rows whose top blended probability reaches
# CONFIDENCE_THRESHOLD are added to the training set with their predicted
# class as label.
# ---------------------------------------------------------------------------
print(f"\n>>> [4/5] ETAPA 2: Pseudo-Labeling (Prag: {CONFIDENCE_THRESHOLD*100}%)")

max_p = preds_stage1.max(axis=1)
pseudo_labels = preds_stage1.argmax(axis=1)

high_conf_idx = np.flatnonzero(max_p >= CONFIDENCE_THRESHOLD)
print(f" -> Din {len(X_test)} exemple test, adăugăm {len(high_conf_idx)} în Train.")

# The same rows are selected from both the raw and the scaled test matrices.
X_pseudo_orig = X_test[high_conf_idx]
X_pseudo_scaled = X_test_scaled[high_conf_idx]
y_pseudo = pseudo_labels[high_conf_idx]

# Append the pseudo-labelled rows below the original training rows.
X_train_aug_orig = np.concatenate((X_train_orig, X_pseudo_orig), axis=0)
X_train_aug_scaled = np.concatenate((X_train_scaled, X_pseudo_scaled), axis=0)
y_train_aug = np.concatenate((y_train_encoded, y_pseudo), axis=0)

print(" -> Re-antrenare LightGBM pe setul extins...")
p_lgbm_2 = train_lgbm(X_train_aug_orig, y_train_aug, X_test)

print(" -> Re-antrenare MLP pe setul extins...")
p_mlp_2 = train_mlp(X_train_aug_scaled, y_train_aug, X_test_scaled)

# ---------------------------------------------------------------------------
# Final blend (same 40/60 weights), decode to original labels and write the
# submission file.
# ---------------------------------------------------------------------------
print("\n>>> [5/5] Finalizare...")

final_probs = (0.4 * p_lgbm_2) + (0.6 * p_mlp_2)

final_preds_idx = final_probs.argmax(axis=1)
final_preds_label = le.inverse_transform(final_preds_idx)

# Report how many test rows were assigned to each class.
unique, counts = np.unique(final_preds_label, return_counts=True)
print(f"Distribuția Finală: {dict(zip(unique, counts))}")

# Predictions are paired with test_files by position.
# NOTE(review): the file name says "85" while CONFIDENCE_THRESHOLD is 0.95 —
# confirm whether the name is intentional.
submission = pd.DataFrame({'filename': test_files, 'label': final_preds_label})
submission.to_csv('submission_pseudo_labeling_85.csv', index=False)
print(">>> GATA! 'submission_pseudo_labeling_85.csv' este gata.")