## Import

In [None]:
pip install pandas numpy tensorflow scikit-learn

In [None]:
pip install tensorflow_addons

In [None]:
pip install -U imbalanced-learn

In [None]:
pip install lightkurve

In [None]:
%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
import lightkurve as lk
import pandas as pd
import numpy as np
import os
import concurrent.futures
import tensorflow as tf

from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate, BatchNormalization, Activation, LeakyReLU, Multiply, Permute, Reshape, Lambda, RepeatVector
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, make_scorer, roc_auc_score, precision_recall_curve
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, SMOTE
from sklearn.preprocessing import RobustScaler
from collections import defaultdict

## Adatok

#### A Liang Yu és munkatársai által közzétett adatok TensorFlow Record (TFRecord) formátumúak, így először egy parser segítségével pandas DataFrame-mé konvertálom őket.

In [None]:
def parse_tfrecord(example):
    """Decode one serialized example into a dict of tensors.

    The feature spec lists every field read from a record: scalar
    int64 / float32 / string entries plus three fixed-length float32 vectors
    (``local_view`` and ``secondary_view`` with 61 values, ``global_view``
    with 201 values).

    Args:
        example: A serialized record, as passed in by ``dataset.map``.

    Returns:
        Whatever ``tf.io.parse_single_example`` returns for the spec below.
    """
    def scalar(dtype):
        # Single value of the given dtype.
        return tf.io.FixedLenFeature([], dtype)

    def float_vector(length):
        # Fixed-length float32 vector (one binned light-curve view).
        return tf.io.FixedLenFeature([length], tf.float32)

    # Key order is kept as in the original spec.
    feature_spec = {
        'tic_id': scalar(tf.int64),
        'row_id': scalar(tf.int64),
        'Epoc': scalar(tf.float32),
        'Sectors': scalar(tf.int64),
        'local_view': float_vector(61),
        'secondary_view': float_vector(61),
        'Transit_Depth': scalar(tf.float32),
        'global_view': float_vector(201),
        'Duration': scalar(tf.float32),
        'ccd': scalar(tf.int64),
        'Period': scalar(tf.float32),
        'depth_change': scalar(tf.float32),
        'camera': scalar(tf.int64),
        'Disposition': scalar(tf.string),
    }
    return tf.io.parse_single_example(example, feature_spec)

def read_dataset(file_pattern):
    """Build a parsed TFRecord dataset from one or more file patterns.

    Args:
        file_pattern: Pattern (or list of patterns) handed to ``tf.io.gfile.glob``.

    Returns:
        A ``tf.data`` dataset whose elements are produced by ``parse_tfrecord``.
    """
    matched_files = tf.io.gfile.glob(file_pattern)
    return tf.data.TFRecordDataset(matched_files).map(parse_tfrecord)

def dataset_to_dataframe(dataset):
    """Materialise a dataset of feature dicts as a pandas DataFrame.

    Every element of ``dataset`` must be a mapping whose values expose
    ``.numpy()``; that call's result becomes the cell value.

    Args:
        dataset: Iterable of ``{feature_name: tensor}`` mappings.

    Returns:
        A DataFrame with one row per element and one column per feature.
    """
    rows = [
        {name: tensor.numpy() for name, tensor in record.items()}
        for record in dataset
    ]
    return pd.DataFrame(rows)

# File names of the record shards: eight training shards plus one test and one
# validation file, all looked up relative to the current working directory.
train_pattern = ['train-0000{}-of-00008'.format(i) for i in range(8)]
test_pattern = 'test-00000-of-00001'
val_pattern = 'val-00000-of-00001'

# Parsed tf.data datasets, one per split.
train_dataset = read_dataset(train_pattern)
test_dataset = read_dataset(test_pattern)
val_dataset = read_dataset(val_pattern)

# Materialise each split as a DataFrame. Note that the raw training frame is
# called `df` here; `train_df` is only built from it in a later cell.
df = dataset_to_dataframe(train_dataset)
test_df = dataset_to_dataframe(test_dataset)
val_df = dataset_to_dataframe(val_dataset)

In [None]:
# Combine all three splits into one table (the plotting GUI at the end of the
# notebook looks TIC IDs up in it). The raw training frame is called `df` at
# this point; `train_df` is only created in a later cell, so referencing it
# here raised a NameError on a fresh kernel.
full_df = pd.concat([df, test_df, val_df])

In [None]:
# Labels that are still bytes are decoded to str; anything else is kept as is.
full_df['Disposition'] = full_df['Disposition'].map(
    lambda label: label.decode('utf-8') if isinstance(label, bytes) else label
)

In [None]:
def _as_python(value):
    """Unwrap a tensor, NumPy array or bytes value into a plain Python object.

    `dataset_to_dataframe` already calls `.numpy()` on every cell, so the raw
    frame normally holds NumPy values / bytes; objects that still expose
    `.numpy()` are unwrapped first so both layouts are accepted.
    """
    if hasattr(value, 'numpy'):
        value = value.numpy()
    if isinstance(value, bytes):
        return value.decode('utf-8')
    if isinstance(value, np.ndarray):
        return value.tolist()
    return value


train_columns = ['tic_id', 'global_view', 'local_view', 'secondary_view', 'depth_change', 'Disposition']

# Collect all rows first and build the DataFrame once; concatenating inside the
# loop re-copies the whole frame on every iteration.
train_records = [
    {
        'tic_id': int(_as_python(raw_row['tic_id'])),
        'global_view': _as_python(raw_row['global_view']),
        'local_view': _as_python(raw_row['local_view']),
        'secondary_view': _as_python(raw_row['secondary_view']),
        'depth_change': float(_as_python(raw_row['depth_change'])),
        'Disposition': _as_python(raw_row['Disposition']),
    }
    for _, raw_row in df.iterrows()
]
train_df = pd.DataFrame(train_records, columns=train_columns)

## A transit depth difference adat kiszámítása

#### Először az outlier értékeket szűröm ki a dolgozatomban említett módon.

In [None]:
def handle_outliers(df, column, lower_quantile=0.01, upper_quantile=0.99):
    """Clip ``df[column]`` in place to the range spanned by two of its own quantiles.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        column: Name of the numeric column to clip.
        lower_quantile: Quantile used as the floor.
        upper_quantile: Quantile used as the cap.

    Returns:
        The same ``df`` object that was passed in.
    """
    values = df[column]
    floor = values.quantile(lower_quantile)
    cap = values.quantile(upper_quantile)
    df[column] = values.clip(floor, cap)
    return df

# Clip depth_change in every split. handle_outliers derives the bounds from the
# frame it receives, so each split is clipped with its own quantiles.
train_df = handle_outliers(train_df, 'depth_change')
test_df = handle_outliers(test_df, 'depth_change')
val_df = handle_outliers(val_df, 'depth_change')

# Robust Scaler: fitted on the training split only, then applied to test/val.
# NOTE(review): the next cell writes 'transit_depth_difference' again (z-score
# normalisation), which replaces the values stored here.
scaler = RobustScaler()
train_df['transit_depth_difference'] = scaler.fit_transform(train_df[['depth_change']])
test_df['transit_depth_difference'] = scaler.transform(test_df[['depth_change']])
val_df['transit_depth_difference'] = scaler.transform(val_df[['depth_change']])

# Show the first rows of the raw and scaled column side by side.
print("A normalizált minta a Robust Saler alkalmazása után:")
print(train_df[['depth_change', 'transit_depth_difference']].head())

In [None]:
import pandas as pd
import numpy as np

# Mean and standard deviation of depth_change, taken from the training split
# only; they are reused below to normalise all three splits.
mean_depth_change = train_df['depth_change'].mean()
std_depth_change = train_df['depth_change'].std()

print("Mélységcsökkenés átlaga:", mean_depth_change)
print("Mélységcsökkenés szórása:", std_depth_change)

# Normalise depth_change by subtracting the training-set mean and dividing by
# the training-set standard deviation.
def normalize_depth_change(df, mean, std):
    """Write the standardised ``depth_change`` into ``transit_depth_difference``.

    Args:
        df: DataFrame with a ``depth_change`` column; modified in place.
        mean: Value subtracted from every entry.
        std: Value every centred entry is divided by.

    Returns:
        The same ``df`` object, now carrying ``transit_depth_difference``.
    """
    if std == 0:
        print("A szórás értéke 0, így a kapott értékek egyenlőek lesznek.")
        # Division is impossible here; store a constant so the column always
        # exists (later cells read it) and the message above holds.
        df['transit_depth_difference'] = 0.0
    else:
        df['transit_depth_difference'] = (df['depth_change'] - mean) / std
    return df

# Normalise all three splits with the statistics computed on the training split.
train_df = normalize_depth_change(train_df, mean_depth_change, std_depth_change)
test_df = normalize_depth_change(test_df, mean_depth_change, std_depth_change)
val_df = normalize_depth_change(val_df, mean_depth_change, std_depth_change)


# Report the range (min - max) and the variance of depth_change in the training split.
print("A mélységcsökkenés értékének terjedelme a tanítóhalmazban:", train_df['depth_change'].min(), " - ", train_df['depth_change'].max())
print("A mélységcsökkenés eloszlása a tanítóhalmazban:", train_df['depth_change'].var())



## Beállítom, hogy triage vagy vetting feladatra tanítom a modellt.






In [None]:
# Task selector read by the relabelling cell below; the two handled values are
# 'vetting' and 'triage'.
mode = 'vetting' # vetting / triage

In [None]:
# Collapse the raw dispositions into the label set of the selected task.
#   vetting: 'EB' is relabelled as 'J'
#   triage:  'EB' and 'PC' share one label, 'TE' ("transit event": a common
#            label for planet candidates and eclipsing binary systems)
_label_remap = {
    'vetting': {'EB': 'J'},
    'triage': {'EB': 'TE', 'PC': 'TE'},
}
if mode not in _label_remap:
    # A mistyped mode used to fall through both branches without any relabelling.
    raise ValueError(f"Unknown mode {mode!r}; expected 'vetting' or 'triage'")


def _relabel(label):
    """Decode a bytes label to str, then apply the remap of the active mode."""
    if isinstance(label, bytes):
        # Frames built by dataset_to_dataframe still hold bytes labels, which
        # never compare equal to the str keys of the remap.
        label = label.decode('utf-8')
    return _label_remap[mode].get(label, label)


for _split_df in (train_df, test_df, val_df):
    _split_df['Disposition'] = _split_df['Disposition'].map(_relabel)

## A szűrő eljárás a paraméterek és metaparaméterek gyorsabb beállításához.

In [None]:
import pandas as pd
import numpy as np

# Heuristic filter for the local and secondary views
def filter(flux_array):
    """Label a 61-bin view 'PC' when it has low wings and a dip in the centre.

    The view is cut into bins [0, 20), [20, 41) and [41, end). It is labelled
    'PC' when both outer means are below 0.18 and the centre mean lies strictly
    between -1 and -0.3; otherwise 'Not PC'.

    Args:
        flux_array: Sequence of flux values (indexable and sliceable).

    Returns:
        A one-element NumPy array holding 'PC' or 'Not PC'.
    """
    leading_mean = np.mean(flux_array[:20])
    centre_mean = np.mean(flux_array[20:41])
    trailing_mean = np.mean(flux_array[41:])

    low_wings = leading_mean < 0.18 and trailing_mean < 0.18
    centre_dip = -1 < centre_mean < -0.3
    label = 'PC' if (low_wings and centre_dip) else 'Not PC'
    return np.array([label])

# Heuristic filter for the global views
def filter_global(flux_array):
    """Label a global view 'PC' when it has low wings and a narrow central dip.

    The view is cut into bins [0, 98), [98, 102) and [102, end). It is labelled
    'PC' when both outer means are below 0.05 and the centre mean lies strictly
    between -1 and -0.2; otherwise 'Not PC'.

    Args:
        flux_array: Sequence of flux values (indexable and sliceable).

    Returns:
        A one-element NumPy array holding 'PC' or 'Not PC'.
    """
    leading_mean = np.mean(flux_array[:98])
    centre_mean = np.mean(flux_array[98:102])
    trailing_mean = np.mean(flux_array[102:])

    low_wings = leading_mean < 0.05 and trailing_mean < 0.05
    centre_dip = -1 < centre_mean < -0.2
    label = 'PC' if (low_wings and centre_dip) else 'Not PC'
    return np.array([label])


# Run the three heuristic filters over the validation split and count, per
# disposition, the rows that pass all of them:
#   local view looks like a transit, secondary view does not, global view does.
pc_count = 0
eb_count = 0
j_count = 0
indices = []  # positional row numbers of the validation rows that pass

_val_columns = zip(
    val_df["local_view"],
    val_df["secondary_view"],
    val_df["global_view"],
    val_df["Disposition"],
)
for _position, (_local, _secondary, _global, _disposition) in enumerate(_val_columns):
    _passes = (
        filter(_local)[0] == "PC"
        and filter(_secondary)[0] != "PC"
        and filter_global(_global)[0] == "PC"
    )
    if not _passes:
        continue

    indices.append(_position)

    # Labels read straight from the TFRecords are bytes; decode them so the
    # comparisons with the str literals below can match.
    if isinstance(_disposition, bytes):
        _disposition = _disposition.decode('utf-8')

    # NOTE(review): if the relabelling cell has already rewritten 'EB' labels
    # for the selected mode, no row can be counted as EB here - confirm this
    # is the intended cell order.
    if _disposition == "EB":
        eb_count += 1
    elif _disposition == "J":
        j_count += 1
    elif _disposition == "PC":
        pc_count += 1

print(f"PC szám: {pc_count}")
print(f"EB szám: {eb_count}")
print(f"J szám: {j_count}")

# Tanítás

In [None]:
# Balanced batch generation
class BalancedBatchGenerator:
    """Endless batch sampler that draws the same number of rows from every class.

    Args:
        X: Sequence of input arrays that share their first (sample) axis.
        y: One-hot label matrix; the class of a row is its argmax.
        batch_size: Number of rows per yielded batch.
        classes: Class ids to sample from; defaults to the ids present in ``y``.
    """

    def __init__(self, X, y, batch_size=32, classes=None):
        self.X = list(X[k] for k in range(len(X)))
        self.y = y
        self.batch_size = batch_size

        labels = y.argmax(axis=1)
        self.classes = np.unique(labels) if classes is None else classes
        # Row positions of every class, used as the sampling pools.
        self.class_indices = {}
        for cls in self.classes:
            self.class_indices[cls] = np.where(labels == cls)[0]

    def generate(self):
        """Yield ``(inputs, labels)`` batches forever.

        Every class contributes ``batch_size // n_classes`` rows drawn with
        replacement; any shortfall is filled from the pooled class indices,
        and the batch is shuffled before it is yielded.
        """
        while True:
            quota = self.batch_size // len(self.classes)
            picked = []
            for cls in self.classes:
                picked.extend(np.random.choice(self.class_indices[cls], quota, replace=True))

            shortfall = self.batch_size - len(picked)
            if shortfall > 0:
                pool = np.concatenate(list(self.class_indices.values()))
                picked.extend(np.random.choice(pool, shortfall, replace=True))

            np.random.shuffle(picked)
            yield [inputs[picked] for inputs in self.X], self.y[picked]


def tensor_to_value(tensor):
    """Return a label as a plain ``str`` where possible.

    Objects exposing ``.numpy()`` (such as eager tensors) are unwrapped first,
    and bytes are decoded as UTF-8. The frames built by
    ``dataset_to_dataframe`` already hold bytes rather than tensors, so bytes
    must be decoded here as well. Any other value is returned unchanged.

    Args:
        tensor: A tensor-like object, a bytes label or any other value.

    Returns:
        The decoded string, or the (unwrapped) input when it is not bytes.
    """
    if hasattr(tensor, 'numpy'):
        tensor = tensor.numpy()
    if isinstance(tensor, bytes):
        return tensor.decode('utf-8')
    return tensor

# Normalise the label column of every split. A dedicated loop name is used so
# that the module-level `df` (the raw training frame) is not rebound.
for _label_df in [train_df, test_df, val_df]:
    _label_df['Disposition'] = _label_df['Disposition'].apply(tensor_to_value)

# Encode the labels (one-hot encoding). The encoder is fitted on the training
# labels only and then reused for the test and validation splits.
# NOTE(review): to_categorical is called without num_classes, so each matrix is
# as wide as the largest class id present in that split - confirm that every
# split contains the highest class.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['Disposition'])
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(label_encoder.transform(test_df['Disposition']))
y_val_encoded = to_categorical(label_encoder.transform(val_df['Disposition']))


# Adataugmentációk:
def warp_light_curve(light_curve, warp_factor=0.1):
    """Randomly compress or stretch one contiguous segment of a light curve.

    A segment of ``int(warp_factor * len(light_curve))`` samples is picked at a
    random position, resampled to a random new length by linear interpolation
    and spliced back in. The result is then cut or zero-padded at the end so
    its length equals the input length.

    Args:
        light_curve: 1-D sequence of flux values.
        warp_factor: Fraction of the curve that forms the warped segment.

    Returns:
        A NumPy array with the same length as ``light_curve``. When the segment
        would be empty or would cover the whole curve, an unmodified copy is
        returned.
    """
    length = len(light_curve)
    warp_size = int(warp_factor * length)

    # The segment needs at least one sample and must leave room for a random
    # start position; otherwise the randint calls below get an empty range.
    if warp_size < 1 or warp_size >= length:
        return np.array(light_curve, copy=True)

    start = np.random.randint(0, length - warp_size)
    end = start + warp_size

    # Segment that will be resampled
    segment = light_curve[start:end]

    # Randomly decide between compressing and stretching
    if np.random.rand() > 0.5:
        # Compression
        new_size = np.random.randint(low=warp_size // 2, high=warp_size)
    else:
        # Stretching
        new_size = np.random.randint(low=warp_size, high=min(length, warp_size * 2))
        if start + new_size > length:
            new_size = length - start  # the stretched segment must not run past the end of the curve

    # Resample the segment to its new length by linear interpolation
    warped_segment = np.interp(
        np.linspace(0, warp_size, num=new_size, endpoint=False),
        np.arange(warp_size),
        segment
    )

    # Assemble the new light curve
    new_light_curve = np.concatenate([
        light_curve[:start],
        warped_segment,
        light_curve[start + warp_size:]
    ])

    # Restore the original length: cut the tail or pad it with zeros
    if len(new_light_curve) > length:
        new_light_curve = new_light_curve[:length]
    elif len(new_light_curve) < length:
        new_light_curve = np.pad(new_light_curve, (0, length - len(new_light_curve)), 'constant')

    return new_light_curve


# Two further augmentations were written as well, but so far they have not
# produced better results, so they were not used for the evaluation.
def jitter_light_curve(light_curve, noise_level=0.02):
    """Add zero-mean Gaussian noise to every point of a light curve.

    Args:
        light_curve: 1-D array of flux values.
        noise_level: Standard deviation of the noise.

    Returns:
        ``light_curve`` plus one independent noise sample per point.
    """
    n_points = len(light_curve)
    return light_curve + np.random.normal(0, noise_level, size=n_points)

def randomly_shift_points(series, num_points=61, max_shift=0.2):
    """Shift randomly chosen points of a series vertically.

    ``num_points`` distinct positions are drawn and each one is moved by an
    independent uniform offset from ``[-max_shift, max_shift)``. The intent is
    to simulate measurement noise.

    Args:
        series: 1-D sequence of values; it is copied, not modified.
        num_points: Number of distinct positions to shift.
        max_shift: Bound of the uniform offset.

    Returns:
        A NumPy copy of ``series`` with the offsets applied.
    """
    shifted = np.copy(series)
    positions = np.random.choice(len(series), size=num_points, replace=False)
    offsets = np.random.uniform(-max_shift, max_shift, size=num_points)
    # Positions are unique, so one indexed assignment equals the per-point loop.
    shifted[positions] = shifted[positions] + offsets
    return shifted


def augment_light_curves(df, features, warp_factor=0.08):
    """Append two augmented copies of every row to ``df``.

    For each row one "flip" copy and one "warp" copy are created. Within a
    copy every listed feature is transformed with probability 0.5 and is
    otherwise left as in the source row.

    Args:
        df: Source DataFrame; it is not modified.
        features: Names of the columns holding light-curve sequences.
        warp_factor: Forwarded to ``warp_light_curve``.

    Returns:
        A new DataFrame: the original rows followed by the augmented rows,
        with a fresh integer index.
    """
    def flipped_copy(row):
        # Horizontal mirroring, the augmentation used by Liang Yu et al.
        variant = row.copy()
        for name in features:
            if np.random.rand() < 0.5:
                variant[name] = np.flip(row[name], axis=0).tolist()
        return variant

    def warped_copy(row):
        # Segment stretching / compression, the second augmentation.
        variant = row.copy()
        for name in features:
            if np.random.rand() < 0.5:
                curve = np.array(row[name])
                variant[name] = warp_light_curve(curve, warp_factor=warp_factor).tolist()
        return variant

    extra_rows = []
    for _, row in df.iterrows():
        extra_rows.append(flipped_copy(row))
        extra_rows.append(warped_copy(row))

    return pd.concat([df, pd.DataFrame(extra_rows)], ignore_index=True)

# The global, local and secondary views are all augmented.
features_to_augment = ['global_view', 'local_view', 'secondary_view']

# Apply the augmentation to the training split, shuffle the rows and rebuild
# the one-hot training labels so they line up with the augmented frame.
# NOTE(review): sample() is called without random_state, so the row order
# differs between runs.
augmented_train_df = augment_light_curves(train_df, features_to_augment)
augmented_train_df = augmented_train_df.sample(frac=1).reset_index(drop=True)
y_train_encoded = to_categorical(label_encoder.transform(augmented_train_df['Disposition']))


# Prepare the model inputs
def prepare_data(df):
    """Turn the view columns and the depth feature into model input arrays.

    NOTE(review): the cross-validation cell further down defines
    ``prepare_data`` again with a different return order and container.

    Args:
        df: DataFrame with ``global_view`` (201 values per row), ``local_view``
            and ``secondary_view`` (61 values per row) and
            ``transit_depth_difference``.

    Returns:
        Tuple ``(X_global, X_local, X_secondary, X_depth_change)`` with shapes
        ``(n, 201, 1)``, ``(n, 61, 1)``, ``(n, 61, 1)`` and ``(n, 1)``.
    """
    def as_channels(column, width):
        # Stack the per-row sequences and add a trailing channel axis.
        return np.array(df[column].tolist()).reshape((-1, width, 1))

    X_global = as_channels('global_view', 201)
    X_local = as_channels('local_view', 61)
    X_secondary = as_channels('secondary_view', 61)
    # The scalar feature is multiplied by four, as described in the thesis.
    scaled_depth = df['transit_depth_difference'] * 4
    X_depth_change = np.array(scaled_depth.tolist()).reshape((-1, 1))
    return X_global, X_local, X_secondary, X_depth_change

# Input arrays per split. The unpacking order (global, local, secondary, depth)
# follows the tuple returned by the prepare_data defined directly above.
X_global_train, X_local_train, X_secondary_train, X_depth_change_train = prepare_data(augmented_train_df)
X_global_test, X_local_test, X_secondary_test, X_depth_change_test = prepare_data(test_df)
X_global_val, X_local_val, X_secondary_val, X_depth_change_val = prepare_data(val_df)


# Konvolúciós neurális hálózat

# First CNN branch, used for the local and secondary views
def create_cnn_branch(input_shape):
    """Build a three-stage 1-D convolutional branch.

    Each stage is Conv1D (kernel 5, 'same' padding, L2 0.001) followed by
    BatchNormalization, ReLU, MaxPooling1D(2) and Dropout. The filter counts
    are 16, 32, 64 and the dropout rates 0.2, 0.3, 0.4.

    Args:
        input_shape: Shape passed to the branch's ``Input`` layer.

    Returns:
        Tuple ``(input_layer, flattened_features)``.
    """
    input_layer = Input(shape=input_shape)
    x = input_layer
    # (filters, dropout rate) per convolutional stage, in order
    for n_filters, drop_rate in ((16, 0.2), (32, 0.3), (64, 0.4)):
        x = Conv1D(n_filters, 5, padding='same', kernel_regularizer=l2(0.001))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = MaxPooling1D(2)(x)
        x = Dropout(drop_rate)(x)
    x = Flatten()(x)
    return input_layer, x

# The other, deeper branch, used for the global view
def create_cnn_branch_deep(input_shape):
    """Build a four-stage 1-D convolutional branch.

    Each stage is Conv1D (kernel 5, 'same' padding, L2 0.001) followed by
    BatchNormalization, ReLU, MaxPooling1D(2) and Dropout. The filter counts
    are 32, 128, 128, 256 and the dropout rates 0.3, 0.4, 0.4, 0.5.

    Args:
        input_shape: Shape passed to the branch's ``Input`` layer.

    Returns:
        Tuple ``(input_layer, flattened_features)``.
    """
    input_layer = Input(shape=input_shape)
    x = input_layer
    # (filters, dropout rate) per convolutional stage, in order
    for n_filters, drop_rate in ((32, 0.3), (128, 0.4), (128, 0.4), (256, 0.5)):
        x = Conv1D(n_filters, 5, padding='same', kernel_regularizer=l2(0.001))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = MaxPooling1D(2)(x)
        x = Dropout(drop_rate)(x)
    x = Flatten()(x)
    return input_layer, x

# Simple scheduler used to control the learning rate
def scheduler(epoch, lr):
    """Keep the learning rate for two epochs, then decay it by exp(-0.1) per epoch.

    The result is always a plain Python float: the callback expects a float,
    and the previous ``tf.math.exp`` expression returned a tensor instead.

    Args:
        epoch: Zero-based epoch index supplied by the callback.
        lr: Current learning rate.

    Returns:
        The learning rate to use for this epoch, as ``float``.
    """
    if epoch < 2:
        return float(lr)
    return float(lr * np.exp(-0.1))


# One convolutional branch per view: the shallow branch for the local and
# secondary views, the deep branch for the global view.
local_input, local_features = create_cnn_branch((61, 1))
global_input, global_features = create_cnn_branch_deep((201, 1))
secondary_input, secondary_features = create_cnn_branch((61, 1))

# Scalar side input carrying the (scaled) transit_depth_difference feature.
depth_change_input = Input(shape=(1,))

# Merge the branch features; this is where "transit_depth_difference" enters the model
combined_features = concatenate([local_features, global_features, secondary_features, depth_change_input])

# Fully connected layers
x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(combined_features)
x = Dropout(0.4)(x)
x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(x)
x = Dropout(0.5)(x)
# One softmax unit per column of the one-hot training labels.
output = Dense(y_train_encoded.shape[1], activation='softmax')(x)


# The model. The input order here (local, global, secondary, depth) is the
# order every later fit / predict / evaluate call has to follow.
model = Model(inputs=[local_input, global_input, secondary_input, depth_change_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Early stopping, best-model checkpointing and the learning-rate schedule
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min', verbose=1),
    LearningRateScheduler(scheduler, verbose=1)
]

# Class weights introduced because of the class imbalance. They did not help
# during training and are not passed to model.fit below; the snippet is kept
# because they may become useful after further fine-tuning.
y_train_labels = np.argmax(y_train_encoded, axis=1)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# Balanced batch generation
batch_size = 75
generator = BalancedBatchGenerator([X_local_train, X_global_train, X_secondary_train, X_depth_change_train], y_train_encoded, batch_size=batch_size)
validation_data = ([X_local_val, X_global_val, X_secondary_val, X_depth_change_val], y_val_encoded)

# Train the model. The generator never ends, so steps_per_epoch defines how
# many batches make up one epoch.
history = model.fit(
    generator.generate(),
    steps_per_epoch=len(y_train_encoded) // batch_size,
    validation_data=validation_data,
    epochs=30,
    callbacks=callbacks
)

# Class predictions on the test split (argmax over the softmax outputs).
predictions = model.predict([X_local_test, X_global_test, X_secondary_test, X_depth_change_test])
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test_encoded, axis=1)

# Evaluate the model; results[1] is the accuracy metric requested in compile().
results = model.evaluate([X_local_test, X_global_test, X_secondary_test, X_depth_change_test], y_test_encoded)
print(f'Test Accuracy: {results[1]*100:.2f}%')

report = classification_report(true_classes, predicted_classes, target_names=label_encoder.classes_)
print(report)

## Precision-Recall görbe

### Egy modell kiértékeléséhez

In [None]:
y_test = np.argmax(y_test_encoded, axis=1)
class_of_interest = 1  # index of the class whose precision-recall curve is drawn (here 0 stands for class J and 1 for class PC)
y_test_binary = (y_test == class_of_interest).astype(int)

# Collect the predicted probabilities.
probabilities = model.predict([X_local_test, X_global_test, X_secondary_test, X_depth_change_test])
class_probabilities = probabilities[:, class_of_interest]

precision, recall, thresholds = precision_recall_curve(y_test_binary, class_probabilities)

# Draw the curve. The curve itself was never plotted before (the figure only
# had a title and axis labels), and the title always said "PC" because the
# format string had no placeholder for the class name.
class_name = label_encoder.classes_[class_of_interest]
plt.figure(figsize=(7, 6))
plt.plot(recall, precision, label=str(class_name))
plt.title('Precision-Recall görbe - {} osztály'.format(class_name))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.grid(True)
plt.show()

### Több modell átlagolt kiértékeléséhez

In [None]:
def predict_with_models(model_files, X_inputs):
    """Average the predictions of several saved models.

    Args:
        model_files: Paths handed one by one to ``load_model``.
        X_inputs: Inputs passed unchanged to every model's ``predict``.

    Returns:
        The element-wise mean of the per-model predictions (mean over the
        model axis).
    """
    per_model = [load_model(path).predict(X_inputs) for path in model_files]
    # Average the predictions over the models.
    return np.mean(np.array(per_model), axis=0)

def plot_precision_recall_curve(y_test_encoded, mean_predictions, class_of_interest, class_labels):
    """Plot the one-vs-rest precision-recall curve of a single class.

    Args:
        y_test_encoded: One-hot ground-truth labels.
        mean_predictions: Predicted class probabilities, one column per class.
        class_of_interest: Column / class index treated as the positive class.
        class_labels: Sequence mapping class indices to display names.
    """
    y_test = np.argmax(y_test_encoded, axis=1)
    y_test_binary = (y_test == class_of_interest).astype(int)

    class_probabilities = mean_predictions[:, class_of_interest]

    precision, recall, thresholds = precision_recall_curve(y_test_binary, class_probabilities)

    # Draw the precision-recall curve. The curve was previously never plotted,
    # and the title ignored the class name because the format string had no
    # placeholder.
    class_name = class_labels[class_of_interest]
    plt.figure(figsize=(7, 6))
    plt.plot(recall, precision, label=str(class_name))
    plt.title('Precision-Recall görbe - {} osztály'.format(class_name))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.grid(True)
    plt.show()

# File names of the saved models: model_ogval_1.h5 ... model_ogval_7.h5,
# looked up relative to the current working directory.
model_files = [f'model_ogval_{i}.h5' for i in range(1, 8)]

# Average the models' test predictions and plot the curve for class index 1.
mean_predictions = predict_with_models(model_files, [X_local_test, X_global_test, X_secondary_test, X_depth_change_test])
plot_precision_recall_curve(y_test_encoded, mean_predictions, 1, label_encoder.classes_)

In [None]:
# Save the current model to disk
model.save('model_1.h5')

In [None]:
model_path = 'model_1.h5'
# Load a previously saved model; this rebinds the module-level `model`.
model = load_model(model_path)

## Keresztvalidáció

In [None]:
class BalancedBatchGenerator:
    """Endless batch sampler that draws the same number of rows from every class.

    NOTE(review): this class is also defined in the training cell earlier in
    the notebook; running this cell rebinds the name.

    Args:
        X: Sequence of input arrays that share their first (sample) axis.
        y: One-hot label matrix; the class of a row is its argmax.
        batch_size: Number of rows per yielded batch.
        classes: Class ids to sample from; defaults to the ids present in ``y``.
    """

    def __init__(self, X, y, batch_size=32, classes=None):
        self.X = list(X[k] for k in range(len(X)))
        self.y = y
        self.batch_size = batch_size

        labels = y.argmax(axis=1)
        self.classes = np.unique(labels) if classes is None else classes
        # Row positions of every class, used as the sampling pools.
        self.class_indices = {}
        for cls in self.classes:
            self.class_indices[cls] = np.where(labels == cls)[0]

    def generate(self):
        """Yield ``(inputs, labels)`` batches forever.

        Every class contributes ``batch_size // n_classes`` rows drawn with
        replacement; any shortfall is filled from the pooled class indices,
        and the batch is shuffled before it is yielded.
        """
        while True:
            quota = self.batch_size // len(self.classes)
            picked = []
            for cls in self.classes:
                picked.extend(np.random.choice(self.class_indices[cls], quota, replace=True))

            shortfall = self.batch_size - len(picked)
            if shortfall > 0:
                pool = np.concatenate(list(self.class_indices.values()))
                picked.extend(np.random.choice(pool, shortfall, replace=True))

            np.random.shuffle(picked)
            yield [inputs[picked] for inputs in self.X], self.y[picked]

# Merge the training and validation splits for cross-validation
all_train_df = pd.concat([train_df, val_df], ignore_index=True)

# Encode the labels (one-hot). This fits a new encoder on the merged labels and
# rebinds the module-level `label_encoder`.
label_encoder = LabelEncoder()
all_labels = label_encoder.fit_transform(all_train_df['Disposition'])
all_labels_encoded = to_categorical(all_labels)

def create_model(num_classes=None):
    """Build and compile a fresh copy of the multi-branch classifier.

    The local and secondary views go through ``create_cnn_branch``, the global
    view through ``create_cnn_branch_deep``; their flattened features are
    concatenated with a scalar input and fed through two dense layers into a
    softmax output.

    Args:
        num_classes: Width of the softmax output. When omitted, the number of
            columns of the module-level ``Y_all`` is used, which is what the
            function always did before this parameter existed.

    Returns:
        A compiled Keras ``Model`` whose inputs are ordered
        (local, global, secondary, depth).
    """
    if num_classes is None:
        # Backward-compatible default: size the output from the global labels.
        num_classes = Y_all.shape[1]

    local_input, local_features = create_cnn_branch((61, 1))
    global_input, global_features = create_cnn_branch_deep((201, 1))
    secondary_input, secondary_features = create_cnn_branch((61, 1))
    depth_change_input = Input(shape=(1,))

    combined_features = concatenate([local_features, global_features, secondary_features, depth_change_input])

    x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(combined_features)
    x = Dropout(0.4)(x)
    x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[local_input, global_input, secondary_input, depth_change_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

def augment_light_curves(df, features, warp_factor=0.08):
    """Append two augmented copies of every row to ``df``.

    NOTE(review): this function is also defined in the training cell earlier
    in the notebook; running this cell rebinds the name.

    For each row one "flip" copy and one "warp" copy are created. Within a
    copy every listed feature is transformed with probability 0.5 and is
    otherwise left as in the source row.

    Args:
        df: Source DataFrame; it is not modified.
        features: Names of the columns holding light-curve sequences.
        warp_factor: Forwarded to ``warp_light_curve``.

    Returns:
        A new DataFrame: the original rows followed by the augmented rows,
        with a fresh integer index.
    """
    def flipped_copy(row):
        # Mirror the selected views end to end.
        variant = row.copy()
        for name in features:
            if np.random.rand() < 0.5:
                variant[name] = np.flip(row[name], axis=0).tolist()
        return variant

    def warped_copy(row):
        # Stretch or compress a random segment of the selected views.
        variant = row.copy()
        for name in features:
            if np.random.rand() < 0.5:
                curve = np.array(row[name])
                variant[name] = warp_light_curve(curve, warp_factor=warp_factor).tolist()
        return variant

    extra_rows = []
    for _, row in df.iterrows():
        extra_rows.append(flipped_copy(row))
        extra_rows.append(warped_copy(row))

    return pd.concat([df, pd.DataFrame(extra_rows)], ignore_index=True)


def prepare_data(df):
    """Turn the view columns and the depth feature into a list of model inputs.

    NOTE(review): an earlier cell defines ``prepare_data`` returning a tuple
    ordered (global, local, secondary, depth). This version returns a list
    ordered (local, global, secondary, depth) and replaces the earlier one
    once this cell has run.

    Args:
        df: DataFrame with ``global_view`` (201 values per row), ``local_view``
            and ``secondary_view`` (61 values per row) and
            ``transit_depth_difference``.

    Returns:
        List ``[X_local, X_global, X_secondary, X_depth_change]`` with shapes
        ``(n, 61, 1)``, ``(n, 201, 1)``, ``(n, 61, 1)`` and ``(n, 1)``.
    """
    def as_channels(column, width):
        # Stack the per-row sequences and add a trailing channel axis.
        return np.array(df[column].tolist()).reshape((-1, width, 1))

    X_global = as_channels('global_view', 201)
    X_local = as_channels('local_view', 61)
    X_secondary = as_channels('secondary_view', 61)
    scaled_depth = df['transit_depth_difference'] * 4
    X_depth_change = np.array(scaled_depth.tolist()).reshape((-1, 1))
    return [X_local, X_global, X_secondary, X_depth_change]

# Augment the merged train+validation frame and build the model inputs and
# one-hot labels from it.
# NOTE(review): augmentation happens before the K-fold split below, so
# augmented copies of a row can end up in a different fold than the row itself.
features_to_augment = ['global_view', 'local_view', 'secondary_view']
augmented_train_df = augment_light_curves(all_train_df, features_to_augment)
X_all = prepare_data(augmented_train_df)
Y_all = to_categorical(label_encoder.transform(augmented_train_df['Disposition']))

# Held-out test inputs and labels (not augmented).
X_test = prepare_data(test_df)
Y_test = to_categorical(label_encoder.transform(test_df['Disposition']))

# Five shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# Train one model per fold. Each fold now trains and validates on its own
# slice of X_all / Y_all; previously the slices were computed but the
# generator and validation data were built from the arrays of the earlier
# single-model training cell, so every fold saw the same data.
for fold_no, (train_index, val_index) in enumerate(kf.split(X_all[0])):
    # Slice every model input and the labels down to this fold.
    X_train, X_val = [x[train_index] for x in X_all], [x[val_index] for x in X_all]
    y_train, y_val = Y_all[train_index], Y_all[val_index]

    assert max(train_index) < len(Y_all), "Train index out of bounds"
    assert max(val_index) < len(Y_all), "Validation index out of bounds"

    # Fresh callbacks per fold so early-stopping state is not carried over.
    # NOTE(review): every fold writes its checkpoint to the same
    # 'best_model.h5' file; the per-fold model saved below is what is kept.
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True),
        ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min', verbose=1),
        LearningRateScheduler(scheduler, verbose=1)
    ]

    model = create_model()
    print(f'Training fold {fold_no + 1}...')
    batch_size = 75
    # X_all (and therefore X_train / X_val) is ordered local, global,
    # secondary, depth, which matches the input order of create_model().
    generator = BalancedBatchGenerator(X_train, y_train, batch_size=batch_size)
    validation_data = (X_val, y_val)

    history = model.fit(
        generator.generate(),
        steps_per_epoch=len(y_train) // batch_size,
        validation_data=validation_data,
        epochs=30,
        callbacks=callbacks
    )

    # Save the model of this fold
    model.save(f'model_fold_{fold_no+1}.h5')


# Evaluate on the held-out test split.
# NOTE(review): `model` is whatever the name is bound to at this point - when
# run right after the loop above, that is the model of the last fold only.
results = model.evaluate(X_test, Y_test)
print(f'Test Accuracy: {results[1]*100:.2f}%')

# Per-class report from the argmax of the predicted probabilities.
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(Y_test, axis=1)
report = classification_report(true_classes, predicted_classes, target_names=label_encoder.classes_)
print(report)


## A dolgozat 8.1.-es pontjában említett tanítóhalmaz és teszthalmaz közötti átfedés.

In [None]:
# Convert every local view to a tuple so it can be hashed and compared.
# This adds a 'local_view_tuple' column to train_df and test_df in place.
train_df['local_view_tuple'] = train_df['local_view'].apply(tuple)
test_df['local_view_tuple'] = test_df['local_view'].apply(tuple)

# Set of all distinct local views in the training split.
train_views_set = set(train_df['local_view_tuple'])

# Number of test rows whose local view also occurs in the training split.
count = test_df['local_view_tuple'].apply(lambda x: x in train_views_set).sum()

print(f"Átfedések száma: {count}")

## GUI

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt


# Convert byte literals to string in the Disposition column
# (values that are not bytes are left unchanged)
full_df['Disposition'] = full_df['Disposition'].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

# Create widgets
# Free-text field for the TIC ID to look up.
tic_id_input = widgets.Text(
    value='',
    description='TIC ID:',
    disabled=False
)

# Selector for which of the three view columns to plot.
view_dropdown = widgets.Dropdown(
    options=['global_view', 'local_view', 'secondary_view'],
    value='global_view',
    description='View:',
    disabled=False,
)

submit_button = widgets.Button(
    description='Plot Data'
)

# Output area the click handler prints and plots into. This rebinds the
# module-level name `output`, which the model-building cell also assigns.
output = widgets.Output()

# Define button click event handler
def on_button_clicked(b):
    """Plot the selected view of the entered TIC ID inside the output widget.

    The handler reads the text field and the dropdown, looks the TIC ID up in
    ``full_df`` and plots the first matching row. Problems (a non-numeric ID,
    an unknown ID, an unplottable cell value) are reported as printed messages.

    Args:
        b: The button instance passed in by the click callback (unused).
    """
    with output:
        clear_output(wait=True)
        tic_id = tic_id_input.value.strip()  # drop surrounding whitespace

        # Match the input to the column's type: only int64 columns get an int.
        if full_df['tic_id'].dtype == 'int64':
            try:
                tic_id = int(tic_id)
            except ValueError:
                print("Invalid TIC ID format. Please input a valid number.")
                return

        view = view_dropdown.value
        # Rows belonging to the requested TIC ID
        data = full_df[full_df['tic_id'] == tic_id]
        if data.empty:
            print("No data found for this TIC ID")
            return

        data_view = data[view].iloc[0]  # first matching row only
        if not isinstance(data_view, (list, np.ndarray)):
            print("Data format not suitable for plotting. Please check data structure.")
            return

        plt.figure(figsize=(10, 6))
        plt.plot(data_view)
        plt.title(f"{view.capitalize()} for TIC ID {tic_id} - Disposition: {data['Disposition'].iloc[0]}")
        plt.show()



# Run the handler whenever the button is clicked.
submit_button.on_click(on_button_clicked)

# Display widgets
display(tic_id_input, view_dropdown, submit_button, output)


Text(value='', description='TIC ID:')

Dropdown(description='View:', options=('global_view', 'local_view', 'secondary_view'), value='global_view')

Button(description='Plot Data', style=ButtonStyle())

Output()