# Logistic regression predictor based on STFT
Averaging the short-term Fourier transforms for each PSG-labeled time segment, we build a logistic regression model with extremely high precision and recall: approximately 94% and 99%, respectively. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

import glob

# My set of tools for this project
import sleep_an_tools as st
from avg_tfrms import make_avg_tfrms as mat

### 0. Collect data from files and compute averaged STFTs. 

In [2]:
def get_subjects():
    subject_data = [f.split("/")[-1] for f in glob.glob("data/motion/*.txt")]
    subject_data = [f[:f.find("_")] for f in subject_data]
    
    return subject_data

In [3]:
subs = get_subjects()

In [4]:
# Compute STFT for each time segment classified as asleep or awake, then average over that window.
# Thus, if for subject S there are N blocks of 30 seconds labeled as awake, and M asleep, mat(S) 
# will contain two arrays of length N and M, respectively.
stft_avgs = [mat(n) for n in subs]

In [5]:
# This should be 31, the number of subjects in the study
len(stft_avgs)

31

In [6]:
# The data we currently have is organized into a nested list of depth 3.
# Unpack this now, so the classifier can be built.
all_subjects_X = []
all_subjects_y = []
for subject in stft_avgs:
    for klass in [0, 1]:
        for ft in subject[klass]:
            all_subjects_X.append(ft)
            all_subjects_y.append(klass)

### 1. Build the logistic regression model.

In [7]:
# We will use stochastic gradient descent to fit the logistic regression model. 
# Per Scikit-Learn's recommendation, this works best when the data has mean 0 and variance 1.
standardizer = StandardScaler()
all_subjects_X_std = standardizer.fit_transform(np.array(all_subjects_X))

In [8]:
# Do an 80/20 train/test split to evaluate our model
X_train, X_test, y_train, y_test = train_test_split(all_subjects_X_std, all_subjects_y, test_size=0.2)

In [9]:
# Log-loss for logistic regression
sgdc = SGDClassifier(loss='log')
sgdc.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

### 2. Evaluate the model's precision and recall.

In [10]:
y_test_pred = sgdc.predict(X_test)

In [11]:
prec, rec = precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred)

In [12]:
prec

0.9370302660979077

In [13]:
rec

0.9863160145392346