# Model v1: Early Detection of Tomato Bacterial Leaf Spot

In [None]:
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from smart_agriculture.features import pick_band_idx, ndvi, pri, ndwi

## 1. Load Data and Compute Features

In [None]:
# Directory layout, relative to the notebook's working directory.
DATA_PROC_DIR = Path('../data_proc')     # searched below for '*_spectrum.csv' inputs
REPORTS_DIR = Path('../reports')         # receives the trace log
DASHBOARDS_DIR = Path('../dashboards')   # receives the exported dashboard CSV

# Make sure both output directories exist; directories already present are left untouched.
for out_dir in (REPORTS_DIR, DASHBOARDS_DIR):
    out_dir.mkdir(exist_ok=True)

# Route INFO-and-above records to trace_log.txt inside REPORTS_DIR, one timestamped line each.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=REPORTS_DIR / 'trace_log.txt',
)
# Collect every processed spectrum. The list is sorted because filesystem enumeration
# order is not guaranteed, and the row order of feature_df decides which samples end up
# in the train/test split later on, even with a fixed random_state.
all_spectra = sorted(DATA_PROC_DIR.glob('*_spectrum.csv'))
if not all_spectra:
    logging.warning('No *_spectrum.csv files found in %s.', DATA_PROC_DIR)

data = []
for spectrum_file in all_spectra:
    # Strip only the trailing '_spectrum' suffix (guaranteed by the glob pattern);
    # str.replace would also remove the token if it occurred elsewhere in the name.
    sample_id = spectrum_file.stem[:-len('_spectrum')]

    # The timepoint is the second-to-last underscore-separated token of the sample id.
    id_parts = sample_id.split('_')
    if len(id_parts) < 2:
        logging.warning(
            'Skipping %s: cannot infer a timepoint from the file name.',
            spectrum_file.name,
        )
        continue
    timepoint = id_parts[-2]  # Infer from filename

    df = pd.read_csv(spectrum_file)
    wavelengths = df['wavelength'].values
    reflectance = df['reflectance'].values

    # Reflectance at the bands each index needs; pick_band_idx is called with the
    # wavelength array and a target wavelength and its result indexes `reflectance`.
    nir = reflectance[pick_band_idx(wavelengths, 800)]
    red = reflectance[pick_band_idx(wavelengths, 670)]
    b531 = reflectance[pick_band_idx(wavelengths, 531)]
    b570 = reflectance[pick_band_idx(wavelengths, 570)]
    swir = reflectance[pick_band_idx(wavelengths, 1650)]

    # Compute indices
    ndvi_val = ndvi(nir, red)
    pri_val = pri(b531, b570)
    ndwi_val = ndwi(nir, swir)

    data.append([sample_id, timepoint, ndvi_val, pri_val, ndwi_val])

# One row per spectrum file: identifiers plus the three spectral indices.
feature_df = pd.DataFrame(data, columns=['sample_id', 'timepoint', 'ndvi', 'pri', 'ndwi'])
logging.info('Features computed for all samples.')
print('Features computed.')

## 2. Label Mapping and Data Splitting

In [None]:
def map_label(timepoint: str) -> str:
    """Map a timepoint token to a class label.

    Rules, checked in this order:
      * contains 'before'                      -> 'Healthy'
      * contains '2hr'                         -> 'Early'
      * 'D' followed by an integer day >= 1    -> 'Infected'
      * anything else                          -> 'Unknown'

    Tokens that contain a 'D' but are not of the form 'D<int>' (e.g. 'Dx', 'D',
    '1D') are reported as 'Unknown' instead of raising ValueError, so a single
    oddly named sample cannot abort labelling of the whole table.
    """
    if 'before' in timepoint:
        return 'Healthy'
    if '2hr' in timepoint:
        return 'Early'
    if timepoint.startswith('D'):
        try:
            day = int(timepoint[1:])
        except ValueError:
            # Not a day number after the leading 'D'.
            return 'Unknown'
        if day >= 1:
            return 'Infected'
    return 'Unknown'

# Attach a class label to every sample, then discard rows whose timepoint
# could not be mapped to a known class.
feature_df['label'] = [map_label(tp) for tp in feature_df['timepoint']]
is_known = feature_df['label'].ne('Unknown')
feature_df = feature_df[is_known]

# Model inputs are the three spectral indices; the target is the class label.
feature_cols = ['ndvi', 'pri', 'ndwi']
X = feature_df[feature_cols]
y = feature_df['label']

# 70/30 split with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
logging.info(f'Data split into training and testing sets. Train size: {len(X_train)}, Test size: {len(X_test)}')
print('Data split.')

## 3. Train and Evaluate Linear SVM Model

In [None]:
# Fit a linear-kernel SVM on the training split. probability=True is required for
# predict_proba, which feeds both the ROC-AUC score and the 'Infected' probabilities.
model = SVC(kernel='linear', probability=True, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# predict_proba columns follow model.classes_ (labels in sorted order), so the
# 'Infected' column must be looked up by name: with the labels 'Early', 'Healthy'
# and 'Infected', column 1 is 'Healthy', not 'Infected'.
class_labels = list(model.classes_)
if 'Infected' not in class_labels:
    raise ValueError(f"Model was trained without an 'Infected' class; classes seen: {class_labels}")
infected_idx = class_labels.index('Infected')

test_proba = model.predict_proba(X_test)  # computed once, reused below
y_prob = test_proba[:, infected_idx]  # Prob of 'Infected' class

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
if len(class_labels) == 2:
    # Binary target: roc_auc_score expects a 1-D score for the greater label (classes_[1]).
    roc_auc = roc_auc_score(y_test, test_proba[:, 1])
else:
    # Multiclass target: one-vs-rest AUC over the full probability matrix, with the
    # column order stated explicitly.
    roc_auc = roc_auc_score(y_test, test_proba, multi_class='ovr', labels=model.classes_)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'ROC-AUC Score: {roc_auc:.4f}')

logging.info(f'Model evaluation complete. Accuracy: {accuracy:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}')

## 4. Export Results

In [None]:
# Build the dashboard table: one row per labelled sample with its 'Infected' probability.
dashboard_df = feature_df[['sample_id', 'timepoint']].copy()

# predict_proba columns follow model.classes_ (sorted labels), so select the
# 'Infected' column by name rather than by a fixed position.
infected_col = list(model.classes_).index('Infected')

# Probabilities for all data. NOTE: X also contains the rows the model was trained on,
# so those values are in-sample estimates.
dashboard_df['prob_infected'] = model.predict_proba(X)[:, infected_col]

output_path = DASHBOARDS_DIR / 'bls_lab_view.csv'
dashboard_df.to_csv(output_path, index=False)

logging.info(f'Dashboard data exported to {output_path}')
print(f'Dashboard data exported to {output_path}')