# Machine Learning module

### Imports

In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

from data_access.insert.transit import update_transit_windows
from data_access.read.flux import get_m_a_trend_flux_for_star_in_sector
from data_access.read.transit import get_transits_for_sector

### Variable Selection

In [6]:
# Sector whose transits and flux series are read and processed below.
sector = 5

# Scan parameters (time values share the units of the flux `time` column):
#   time_window - width of the window a regression line is fitted over
#   step_size   - how far the scan moves away from the minimum per iteration
#   threshold   - absolute slope below which a window counts as flat
time_window, step_size, threshold = 0.3, 5e-3, 1e-5

### Process

In [9]:
def slope_in_window(data: pd.DataFrame, time_point, window=time_window, direction='left'):
    """Return the slope of a straight line fitted to the flux next to ``time_point``.

    The fit uses the rows of ``data`` whose ``time`` lies in
    ``[time_point - window, time_point]`` when ``direction == 'left'`` and in
    ``[time_point, time_point + window]`` for any other ``direction`` value.
    Both bounds are inclusive.

    Parameters
    ----------
    data : pd.DataFrame
        Frame providing the ``time`` and ``m_a_trend_flux`` columns.
    time_point : float
        Time at which the window is anchored.
    window : float, optional
        Width of the window. The default is the value ``time_window`` held
        when this cell was executed; later changes to ``time_window`` do not
        affect it unless the cell is re-run.
    direction : str, optional
        ``'left'`` selects the window ending at ``time_point``; anything else
        selects the window starting at ``time_point``.

    Returns
    -------
    float
        Linear-regression coefficient of ``m_a_trend_flux`` against ``time``,
        or NaN when the window contains fewer than two samples.
    """
    if direction == 'left':
        start_time, end_time = time_point - window, time_point
    else:  # right direction
        start_time, end_time = time_point, time_point + window

    # Select data in the window
    window_data = data[data.time.between(start_time, end_time)]

    # A slope needs at least two samples. Returning 0 here would make a data
    # gap look like a perfectly flat stretch to callers that compare
    # abs(slope) against a threshold; NaN fails every such comparison instead.
    if len(window_data) < 2:
        return float('nan')

    # Fit linear regression model (scikit-learn expects a 2-D feature matrix)
    X = window_data.time.values.reshape(-1, 1)
    y = window_data.m_a_trend_flux.values

    model = LinearRegression()
    model.fit(X, y)

    return model.coef_[0]

In [10]:
# For every transit in the sector, walk outwards from the flux minimum in both
# directions until the regression slope over a `time_window`-wide window is
# flatter than `threshold`, then store the resulting transit boundaries.
transits = get_transits_for_sector(sector)

# `total` is passed explicitly because the iterrows() generator has no length,
# so tqdm could otherwise only show a running count instead of real progress.
for _, row in tqdm(transits.iterrows(), total=len(transits)):
    df = get_m_a_trend_flux_for_star_in_sector(sector, row.star_tic)
    minimum_flux_id = row.minimum_flux_id
    flux_at_minimum = df.at[minimum_flux_id, 'm_a_trend_flux']
    minimum_time = df.at[minimum_flux_id, 'time']

    # Initialize variables to store ingress and egress times
    ingress_time = None
    egress_time = None
    ingress_idx = None
    egress_idx = None

    # Scan limits do not change while scanning, so compute them once per star.
    earliest_scan_time = df.time.min() + time_window
    latest_scan_time = df.time.max() - time_window

    # Scan to the left of the minimum for the ingress point
    current_time = minimum_time
    while current_time >= earliest_scan_time:
        current_time -= step_size
        if abs(slope_in_window(df, current_time, time_window, 'left')) < threshold:
            # Use the centre of the flat window as the ingress time and snap
            # it to the index label of the closest sample.
            ingress_time = current_time - time_window/2
            ingress_idx = (df.time - ingress_time).abs().idxmin()
            break

    # Scan to the right of the minimum for the egress point
    current_time = minimum_time
    while current_time <= latest_scan_time:
        current_time += step_size
        if abs(slope_in_window(df, current_time, time_window, 'right')) < threshold:
            egress_time = current_time + time_window/2
            egress_idx = (df.time - egress_time).abs().idxmin()
            break

    # Compare against None explicitly: an index label of 0 is a valid result
    # but is falsy, so a plain truthiness test would silently drop the transit.
    if ingress_idx is not None and egress_idx is not None:
        new_transit_data = pd.Series({
            'minimum_flux_id': minimum_flux_id,
            'transit_start_flux_id': ingress_idx,
            'transit_end_flux_id': egress_idx,
            'width_of_biggest_neg_trend_to_left': minimum_time-ingress_time,
            'width_of_biggest_pos_trend_to_right': egress_time-minimum_time,
            'transit_depth_left': df.at[ingress_idx, 'm_a_trend_flux'] - flux_at_minimum,
            'transit_depth_right': df.at[egress_idx, 'm_a_trend_flux'] - flux_at_minimum,
        })
        update_transit_windows(new_transit_data)

145it [01:25,  1.69it/s]
