In [64]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import pickle

def handle_missing_data(data, method='ignore', threshold=0.1):
    """Handle missing data in a given time series stretch.
    Arguments:
    - data: pandas DataFrame with glucose values.
    - method: strategy for handling missing data ('ignore', 'linear', 'polynomial', 'knn').
    - threshold: fraction of missing data points to consider a stretch invalid.
    
    Returns:
    - data: cleaned data based on the selected method.
    """
    missing_fraction = data.isna().mean().mean()
    if missing_fraction > threshold:
        return None

    if method == 'linear':
        return data.interpolate(method='linear')

    if method == 'polynomial':
        return data.interpolate(method='polynomial', order=2)

    if method == 'knn':
        if len(data.dropna()) > 0:
            imputer = KNNImputer(n_neighbors=3)
            return pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
        else:
            # "Insufficient data for KNN imputation, skipping..."
            return None
    return data.dropna()

def extract_base_data():
    insulin_file = './InsulinData.csv'
    cgm_file = './CGMData.csv'
    insulin_file_two = './Insulin_patient2.csv'
    cgm_file_two = './CGM_patient2.csv'
    insulin_data_one = pd.read_csv(insulin_file, low_memory=False)
    cgm_data_one = pd.read_csv(cgm_file, low_memory=False)
    insulin_data_two = pd.read_csv(insulin_file_two, low_memory=False)
    cgm_data_two = pd.read_csv(cgm_file_two, low_memory=False)

    insulin_data = pd.concat([insulin_data_one, insulin_data_two], ignore_index=True)
    cgm_data = pd.concat([cgm_data_one, cgm_data_two], ignore_index=True)
    
    cgm_data['Timestamp'] = pd.to_datetime(cgm_data['Date'] + ' ' + cgm_data['Time'])
    insulin_data['Timestamp'] = pd.to_datetime(insulin_data['Date'] + ' ' + insulin_data['Time'])
    return insulin_data, cgm_data 

def extract_meal_data(insulin_data, cgm_data, missing_data_method='ignore', threshold=0.1):
    # COLUMNS
    TIME = 'Timestamp'
    CARB_INPUT = 'BWZ Carb Input (grams)'
    GLUCOSE = 'Sensor Glucose (mg/dL)'
    
    meal_times = insulin_data[insulin_data[CARB_INPUT].notna() & (insulin_data[CARB_INPUT] > 0)]

    meal_data_matrix = []
    for _, meal in meal_times.iterrows():
        tm = meal[TIME]
        start_time = tm - pd.Timedelta(minutes=30)
        end_time = tm + pd.Timedelta(hours=2)
        cgm_time_series = cgm_data[(cgm_data[TIME] >= start_time) & (cgm_data[TIME] <= end_time)]

        # Check for conditions described:
        # 4a. No other meal in the tm to tm + 2 hours period
        later_meals = meal_times[(meal_times[TIME] > tm) & (meal_times[TIME] < end_time)]
        
        if later_meals.empty:
            # Condition 4a: No meal within the next 2 hours, use this stretch as meal data
            meal_data = cgm_time_series
        else:
            # 4b: If there is a meal in between tm and tm + 2 hours, consider that instead
            tp = later_meals.iloc[0][TIME]
            cgm_time_series = cgm_data[(cgm_data[TIME] >= start_time) & (cgm_data[TIME] <= end_time)]
            meal_data = cgm_time_series
        
        # 4c: If there is a meal exactly at tm + 2 hours, adjust the time window
        if not later_meals.empty and later_meals.iloc[0][TIME] == end_time:
            cgm_time_series = cgm_data[(cgm_data[TIME] >= tm + pd.Timedelta(hours=1.5)) & (cgm_data[TIME] <= tm + pd.Timedelta(hours=4))]
            meal_data = cgm_time_series
        
        meal_data_cleaned = handle_missing_data(meal_data[[GLUCOSE]], method=missing_data_method, threshold=threshold)

        if meal_data_cleaned is not None:
            # Ensure each meal stretch has 30 columns (2hr 30 min of data at 5-min intervals)
            if len(meal_data_cleaned) >= 30:
                meal_data_cleaned = meal_data_cleaned.iloc[:30]  # Truncate to 30 rows if there are more than 30
                meal_data_matrix.append(meal_data_cleaned.values.flatten())

    meal_data_matrix = pd.DataFrame(meal_data_matrix)
    return meal_data_matrix


def extract_no_meal_data(insulin_data, cgm_data, missing_data_method='ignore', threshold=0.1):
    # COLUMNS
    TIME = 'Timestamp'
    CARB_INPUT = 'BWZ Carb Input (grams)'
    GLUCOSE = 'Sensor Glucose (mg/dL)'
    
    meal_times = insulin_data[insulin_data[CARB_INPUT].notna() & (insulin_data[CARB_INPUT] > 0)]
    no_meal_data_matrix = []

    for _, meal in meal_times.iterrows():
        tm = meal[TIME] 
        post_absorptive_start = tm + pd.Timedelta(hours=2) 
        post_absorptive_end = post_absorptive_start + pd.Timedelta(hours=2)
        post_absorptive_meals = insulin_data[(insulin_data[CARB_INPUT].notna()) &
                                             (insulin_data[CARB_INPUT] > 0) &
                                             (insulin_data[TIME] > post_absorptive_start) &
                                             (insulin_data[TIME] <= post_absorptive_end)]
        
        if post_absorptive_meals.empty:
            no_meal_data = cgm_data[(cgm_data[TIME] >= post_absorptive_start) & (cgm_data[TIME] < post_absorptive_end)]
            no_meal_data_cleaned = handle_missing_data(no_meal_data[[GLUCOSE]], method=missing_data_method, threshold=threshold)
    
            if no_meal_data_cleaned is not None and len(no_meal_data_cleaned) >= 24:
                no_meal_data_cleaned = no_meal_data_cleaned.iloc[:24]
                no_meal_data_matrix.append(no_meal_data_cleaned.values.flatten())
    
        # Condition 2b: If a meal found in post-absorptive period has 0 carbs, ignore this meal stretch and use no meal data
        elif post_absorptive_meals.iloc[0][CARB_INPUT] == 0:
            no_meal_data = cgm_data[(cgm_data[TIME] >= post_absorptive_start) & (cgm_data[TIME] < post_absorptive_end)]
            no_meal_data_cleaned = handle_missing_data(no_meal_data[[GLUCOSE]], method=missing_data_method, threshold=threshold)
    
            if no_meal_data_cleaned is not None and len(no_meal_data_cleaned) >= 24:
                no_meal_data_cleaned = no_meal_data_cleaned.iloc[:24] 
                no_meal_data_matrix.append(no_meal_data_cleaned.values.flatten())

    no_meal_data_matrix = pd.DataFrame(no_meal_data_matrix)
    return no_meal_data_matrix

def save_data_to_pickle(data, filename):
    """Save data to a pickle file."""
    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    print(f"Data saved to {filename}")

def load_data_from_pickle(filename):
    """Load data from a pickle file."""
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    print(f"Data loaded from {filename}")
    return data

def save_method(method):
    insulin_data, cgm_data = extract_base_data()
    insulin_data = insulin_data.copy()
    cgm_data = cgm_data.copy()
    
    meal_data_matrix = extract_meal_data(insulin_data, cgm_data, missing_data_method=method)
    meal_filename = f"meal_data_{method}.pkl"
    save_data_to_pickle(meal_data_matrix, meal_filename)
    
    no_meal_data_matrix = extract_no_meal_data(insulin_data, cgm_data, missing_data_method=method)
    no_meal_filename = f"no_meal_data_{method}.pkl"
    save_data_to_pickle(no_meal_data_matrix, no_meal_filename)
    
    print(f"Saved meal and no-meal data for method: {method}")

def iterate_and_save_methods(method='all'):
    """
    Iterate over missing data methods, extract meal and no-meal data,
    and save to pickle files for later use.
    """
    if method == 'all':
        for method in ['knn', 'linear', 'polynomial', 'ignore']:
            save_method(method)
    else:
        save_method(method)

iterate_and_save_methods('knn')

Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Insufficient data for KNN imputation, skipping...
Data saved to meal_data_knn.pkl
Insufficient data for KNN imputation, skipping...
Insufficient data 

In [91]:
from scipy.stats import skew, kurtosis
from scipy.fft import fft

def load_data_matrices(method):
    meal_filename = f"meal_data_{method}.pkl"
    with open(meal_filename, 'rb') as file:
        meal_data_matrix = pickle.load(file)

    no_meal_filename = f"no_meal_data_{method}.pkl"
    with open(no_meal_filename, 'rb') as file:
        no_meal_data_matrix = pickle.load(file)

    meal_data_df = pd.DataFrame(meal_data_matrix)
    no_meal_data_df = pd.DataFrame(no_meal_data_matrix)

    return meal_data_df, no_meal_data_df

def feature_extractor(data_matrix, time_interval=5):
    feature_matrix = pd.DataFrame()
    
    feature_matrix['mean'] = data_matrix.mean(axis=1)
    feature_matrix['std'] = data_matrix.std(axis=1)
    feature_matrix['rate_of_change'] = data_matrix.diff(axis=1).mean(axis=1)
    feature_matrix['time_to_peak'] = data_matrix.idxmax(axis=1) * time_interval
    feature_matrix['auc'] = np.trapz(data_matrix, axis=1)
    feature_matrix['fft_energy'] = np.abs(np.fft.fft(data_matrix, axis=1)).mean(axis=1)
    feature_matrix['cv_glucose'] = data_matrix.std(axis=1) / data_matrix.mean(axis=1)
    feature_matrix['peak_amplitude'] = data_matrix.max(axis=1) - data_matrix.min(axis=1)
    feature_matrix['time_above_180'] = (data_matrix > 180).mean(axis=1)
    feature_matrix['skewness'] = data_matrix.apply(lambda x: skew(x, axis=0), axis=1)
    feature_matrix['kurtosis'] = data_matrix.apply(lambda x: kurtosis(x, axis=0), axis=1)
    feature_matrix['energy'] = (data_matrix ** 2).sum(axis=1)
    return feature_matrix



In [104]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

def train_machine_with_kfold(meal_feature_matrix, no_meal_feature_matrix, missing_data_method='mean', k=5):
    X = np.vstack([meal_feature_matrix, no_meal_feature_matrix])
    y = np.hstack([np.ones(len(meal_feature_matrix)), np.zeros(len(no_meal_feature_matrix))])

    if missing_data_method == 'mean':
        imputer = SimpleImputer(strategy='mean')
        X_imputed = imputer.fit_transform(X)
    elif missing_data_method == 'median':
        imputer = SimpleImputer(strategy='median')
        X_imputed = imputer.fit_transform(X)
    elif missing_data_method == 'most_frequent':
        imputer = SimpleImputer(strategy='most_frequent')
        X_imputed = imputer.fit_transform(X)
    elif missing_data_method == 'remove':
        X_imputed = X[~np.isnan(X).any(axis=1)]
        y = y[~np.isnan(X).any(axis=1)] 
    else:
        raise ValueError("Invalid missing data method. Choose from 'mean', 'median', 'most_frequent', or 'remove'.")

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled , y, test_size=0.2, random_state=42)
    param_grid = {'C': [0.001,0.005,0.009,0.01]}
    
    model = SVC(kernel='linear', class_weight='balanced')
    
    grid_search = GridSearchCV(model, param_grid, cv=k, scoring='accuracy')
    grid_search.fit(X_scaled, y)
    
    best_model = grid_search.best_estimator_
    best_C = grid_search.best_params_['C']
    
    print(f"Best C: {best_C}")
    print(f"Best cross-validation score: {grid_search.best_score_}")
    best_model.fit(X_train, y_train)
    final_accuracy = best_model.score(X_test, y_test)
    print(f"Test set accuracy with best C: {final_accuracy:.4f}")
    
    return best_model

def train_with_missing_method(method, missing_data_method):
    print(f"processing {method}")
    meal_data_df, no_meal_data_df = load_data_matrices(method)
    meal_features = feature_extractor(meal_data_df)
    no_meal_features = feature_extractor(no_meal_data_df)
    model = train_machine_with_kfold(meal_features, no_meal_features, missing_data_method='remove')
    with open(f'trained_model_{method}.pkl', 'wb') as file:
        pickle.dump(model, file)
    print(f"######## ending {method}")

#missing_data_methods = ['ignore', 'linear', 'knn', 'polynomial']
#missing_data_method_two = ['remove', 'mean', 'median', 'most_frequent']
train_with_missing_method('knn', missing_data_method='remove')

processing knn
Best C: 0.01
Best cross-validation score: 0.9258431141127359
Test set accuracy with best C: 0.9406
######## ending knn


In [93]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from train import feature_extractor

def process_test_csv(test_csv_path, model_pickle_file, result_csv_path):
    with open(model_pickle_file, "rb") as file:
        model = pickle.load(file)
    
    test_data_frame = pd.read_csv(test_csv_path, header=None)
    
    test_features = feature_extractor(test_data_frame)
    
    pd.DataFrame(model.predict(test_features)).to_csv(result_csv_path, header=None, index=False)


process_test_csv('./test.csv', './trained_model_knn.pkl', 'Result.csv')

ModuleNotFoundError: No module named 'train'