diff --git a/.gitignore b/.gitignore
index 6f7b489..3942a71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,3 +101,6 @@ ENV/
 .mypy_cache/
 
 *.pth
+
+# vim
+*.swp
diff --git a/bbox.py b/bbox.py
index 901ba62..d4abe60 100644
--- a/bbox.py
+++ b/bbox.py
@@ -30,27 +30,40 @@ def bboxloginv(dx,dy,dw,dh,axc,ayc,aww,ahh):
     x1,x2,y1,y2 = xc-ww/2,xc+ww/2,yc-hh/2,yc+hh/2
     return x1,y1,x2,y2
 
-def nms(dets, thresh):
-    if 0==len(dets): return []
-    x1,y1,x2,y2,scores = dets[:, 0],dets[:, 1],dets[:, 2],dets[:, 3],dets[:, 4]
+def nms(bboxlist:torch.Tensor, thresh:float) -> list:
+    """Given an Nx5 tensor of bounding boxes and an overlap threshold,
+    return a list of the indexes of the bounding boxes to keep.
+    """
+    if len(bboxlist) == 0:
+        return []
+    x1 = bboxlist[:,0]
+    y1 = bboxlist[:,1]
+    x2 = bboxlist[:,2]
+    y2 = bboxlist[:,3]
+    scores = bboxlist[:,4]
     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-    order = scores.argsort()[::-1]
+    # argsort is ascending, so reverse it to visit boxes in order of decreasing score.
+    scores = np.asarray(scores)
+    order = scores.argsort()
+    order = np.asarray(list(order[::-1]))
 
     keep = []
-    while order.size > 0:
+    while len(order) > 0:
         i = order[0]
-        keep.append(i)
+        keep.append(i) # Keep the highest-scoring box still in play.
+
+        # Compute the IoU overlap between this box and each remaining (lower-scored) box.
         xx1,yy1 = np.maximum(x1[i], x1[order[1:]]),np.maximum(y1[i], y1[order[1:]])
         xx2,yy2 = np.minimum(x2[i], x2[order[1:]]),np.minimum(y2[i], y2[order[1:]])
 
         w,h = np.maximum(0.0, xx2 - xx1 + 1),np.maximum(0.0, yy2 - yy1 + 1)
-        ovr = w*h / (areas[i] + areas[order[1:]] - w*h)
-
+        ovr = w*h / (areas[i] + areas[order[1:]] - w*h) # intersection over union
         inds = np.where(ovr <= thresh)[0]
-        order = order[inds + 1]
+        order = order[inds + 1] # Drop the boxes whose overlap exceeds the threshold.
 
     return keep
 
+
 def encode(matched, priors, variances):
     """Encode the variances from the priorbox layers into the ground truth
     boxes we have matched (based on jaccard overlap) with the prior boxes.
@@ -92,4 +105,4 @@ def decode(loc, priors, variances):
         priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
     boxes[:, :2] -= boxes[:, 2:] / 2
     boxes[:, 2:] += boxes[:, :2]
-    return boxes
\ No newline at end of file
+    return boxes
diff --git a/data/test01_output.png b/data/test01_output.png
index 565a57d..eee7032 100644
Binary files a/data/test01_output.png and b/data/test01_output.png differ
diff --git a/detect_faces.py b/detect_faces.py
new file mode 100755
index 0000000..04a53a9
--- /dev/null
+++ b/detect_faces.py
@@ -0,0 +1,80 @@
+import time
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.autograd import Variable
+from typing import List, Tuple
+torch.backends.cudnn.benchmark = True
+
+import os,sys,cv2,random,datetime,time,math
+import argparse
+import numpy as np
+
+from bbox import decode, nms
+
+def detect_faces(net:nn.Module, img:np.ndarray, minscale:int=3, ovr_threshold:float=0.3,
+                 score_threshold:float=0.5) -> List[Tuple]:
+    """Returns a list of tuples describing bounding boxes: (x1,y1,x2,y2,score).
+    Setting minscale to 0 finds the smallest faces, but takes the longest.
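+
+    A minimal usage sketch (illustrative; assumes `net` is a loaded s3fd model
+    and `img` is a BGR image as returned by cv2.imread):
+
+        boxes = detect_faces(net, img)
+        for x1, y1, x2, y2, score in boxes:
+            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 1)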
+    """
+    bboxlist = detect(net, img, minscale)
+    keep_idx = nms(bboxlist, ovr_threshold)
+    bboxlist = bboxlist[keep_idx,:]
+    out = []
+    for b in bboxlist:
+        x1,y1,x2,y2,s = b
+        if s < score_threshold:
+            continue
+        out.append((int(x1),int(y1),int(x2),int(y2),s))
+    return out
+
+
+def detect(net:nn.Module, img:np.ndarray, minscale:int=3) -> torch.Tensor:
+    """Returns an Nx5 tensor describing bounding boxes: [x1,y1,x2,y2,score].
+    This will have LOTS of similar/overlapping regions; call bbox.nms to reconcile them.
+    Setting minscale to 0 finds the smallest faces, but takes the longest.
+    """
+    img = img - np.array([104,117,123])
+    img = img.transpose(2, 0, 1)
+    img = img.reshape((1,)+img.shape)
+
+    img = Variable(torch.from_numpy(img).float()).cuda()
+    BB,CC,HH,WW = img.size()
+    olist = net(img)
+
+    bboxlist = []
+    for i in range(minscale, len(olist)//2):
+        ocls = F.softmax(olist[i*2], dim=1).data
+        oreg = olist[i*2+1].data
+        FB,FC,FH,FW = ocls.size() # feature map size
+        stride = 2**(i+2) # 4,8,16,32,64,128
+        anchor = stride*4
+        # This workload is small enough that it's faster on CPU than GPU (~55ms vs ~65ms),
+        # but most of that time (40ms) is spent moving the data from GPU to CPU.
+        all_scores = ocls[0,1,:,:].cpu()
+        oreg = oreg.cpu()
+        # Instead of running a sliding window, first find the places where the score is big enough to bother.
+        bigenough = torch.nonzero(all_scores > 0.05)
+        for hindex, windex in bigenough:
+            score = all_scores[hindex,windex]
+            loc = oreg[0,:,hindex,windex].contiguous().view(1,4)
+            axc,ayc = stride/2+windex*stride,stride/2+hindex*stride
+            priors = torch.Tensor([[axc/1.0,ayc/1.0,stride*4/1.0,stride*4/1.0]])
+            variances = [0.1,0.2]
+            box = decode(loc,priors,variances)
+            x1,y1,x2,y2 = box[0]*1.0
+            # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
+            bboxlist.append([x1,y1,x2,y2,score])
+    if len(bboxlist) == 0:
+        bboxlist=torch.zeros((1, 5))
+    bboxlist = torch.Tensor(bboxlist)
+    return bboxlist
+
diff --git a/livecam.py b/livecam.py
new file mode 100755
index 0000000..6033339
--- /dev/null
+++ b/livecam.py
@@ -0,0 +1,53 @@
+import time
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.autograd import Variable
+torch.backends.cudnn.benchmark = True
+
+import os,sys,cv2,random,datetime,time,math
+import argparse
+import numpy as np
+
+import net_s3fd
+from detect_faces import detect_faces
+
+parser = argparse.ArgumentParser(description='PyTorch face detect')
+parser.add_argument('--net','-n', default='s3fd', type=str)
+parser.add_argument('--model', required=True, type=str)
+parser.add_argument('--path', default='CAMERA', type=str)
+
+args = parser.parse_args()
+use_cuda = torch.cuda.is_available()
+
+
+net = getattr(net_s3fd,args.net)()
+net.load_state_dict(torch.load(args.model))
+net.cuda()
+net.eval()
+
+
+if args.path=='CAMERA':
+    cap = cv2.VideoCapture(0)
+with torch.no_grad():
+    while True:
+        if args.path=='CAMERA':
+            ret, img = cap.read()
+        else:
+            img = cv2.imread(args.path)
+
+        imgshow = np.copy(img)
+        start_time = time.time()
+        bboxlist = detect_faces(net, img, 3)
+        print(f"Running detect_faces took {1000*(time.time() - start_time):.1f}ms. Found {len(bboxlist)} faces.")
+        for b in bboxlist:
+            x1,y1,x2,y2,s = b
+            cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,255,0),1)
+        cv2.imshow('test',imgshow)
+
+        if args.path=='CAMERA':
+            if cv2.waitKey(1) & 0xFF == ord('q'): break
+        else:
+            cv2.imwrite(args.path[:-4]+'_output.png',imgshow)
+            if cv2.waitKey(0) or True: break
diff --git a/net_s3fd.py b/net_s3fd.py
index 88e77ee..67983da 100755
--- a/net_s3fd.py
+++ b/net_s3fd.py
@@ -18,7 +18,7 @@ def forward(self, x):
         x = x / norm * self.weight.view(1,-1,1,1)
         return x
 
-class s3fd(nn.Module):
+class S3fd_Model(nn.Module):
     def __init__(self):
-        super(s3fd, self).__init__()
+        super(S3fd_Model, self).__init__()
         self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
@@ -120,3 +120,6 @@ def forward(self, x):
         cls1 = torch.cat([bmax,chunk[3]],dim=1)
 
         return [cls1,reg1,cls2,reg2,cls3,reg3,cls4,reg4,cls5,reg5,cls6,reg6]
+
+# Backwards-compatible alias so getattr(net_s3fd, 's3fd') keeps working (see livecam.py).
+s3fd = S3fd_Model
diff --git a/test.py b/test.py
deleted file mode 100755
index 2729657..0000000
--- a/test.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from __future__ import print_function
-
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from torch.autograd import Variable
-torch.backends.cudnn.bencmark = True
-
-import os,sys,cv2,random,datetime,time,math
-import argparse
-import numpy as np
-
-import net_s3fd
-from bbox import *
-
-def detect(net,img):
-    img = img - np.array([104,117,123])
-    img = img.transpose(2, 0, 1)
-    img = img.reshape((1,)+img.shape)
-
-    img = Variable(torch.from_numpy(img).float(),volatile=True).cuda()
-    BB,CC,HH,WW = img.size()
-    olist = net(img)
-
-    bboxlist = []
-    for i in range(len(olist)/2): olist[i*2] = F.softmax(olist[i*2])
-    for i in range(len(olist)/2):
-        ocls,oreg = olist[i*2].data.cpu(),olist[i*2+1].data.cpu()
-        FB,FC,FH,FW = ocls.size() # feature map size
-        stride = 2**(i+2) # 4,8,16,32,64,128
-        anchor = stride*4
-        for Findex in range(FH*FW):
-            windex,hindex = Findex%FW,Findex//FW
-            axc,ayc = stride/2+windex*stride,stride/2+hindex*stride
-            score = ocls[0,1,hindex,windex]
-            loc = oreg[0,:,hindex,windex].contiguous().view(1,4)
-            if score<0.05: continue
-            priors = torch.Tensor([[axc/1.0,ayc/1.0,stride*4/1.0,stride*4/1.0]])
-            variances = [0.1,0.2]
-            box = decode(loc,priors,variances)
-            x1,y1,x2,y2 = box[0]*1.0
-            # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
-            bboxlist.append([x1,y1,x2,y2,score])
-    bboxlist = np.array(bboxlist)
-    if 0==len(bboxlist): bboxlist=np.zeros((1, 5))
-    return bboxlist
-
-parser = argparse.ArgumentParser(description='PyTorch face detect')
-parser.add_argument('--net','-n', default='s3fd', type=str)
-parser.add_argument('--model', default='', type=str)
-parser.add_argument('--path', default='CAMERA', type=str)
-
-args = parser.parse_args()
-use_cuda = torch.cuda.is_available()
-
-
-net = getattr(net_s3fd,args.net)()
-if args.model!='' :net.load_state_dict(torch.load(args.model))
-else: print('Please set --model parameter!')
-net.cuda()
-net.eval()
-
-
-if args.path=='CAMERA': cap = cv2.VideoCapture(0)
-while(True):
-    if args.path=='CAMERA': ret, img = cap.read()
-    else: img = cv2.imread(args.path)
-
-    imgshow = np.copy(img)
-    bboxlist = detect(net,img)
-
-    keep = nms(bboxlist,0.3)
-    bboxlist = bboxlist[keep,:]
-    for b in bboxlist:
-        x1,y1,x2,y2,s = b
-        if s<0.5: continue
-        cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,255,0),1)
-    cv2.imshow('test',imgshow)
-
-    if args.path=='CAMERA':
-        if cv2.waitKey(1) & 0xFF == ord('q'): break
-    else:
-        cv2.imwrite(args.path[:-4]+'_output.png',imgshow)
-        if cv2.waitKey(0) or True: break
\ No newline at end of file
diff --git a/test_integration.py b/test_integration.py
new file mode 100644
index 0000000..4cd7a45
--- /dev/null
+++ b/test_integration.py
@@ -0,0 +1,22 @@
+import cv2
+import numpy as np
+import pytest
+import torch
+
+from detect_faces import detect_faces
+from net_s3fd import S3fd_Model
+
+def test_ellen_selfie():
+    model = S3fd_Model()
+    try:
+        state_dict = torch.load("s3fd_convert.pth")
+        model.load_state_dict(state_dict)
+    except Exception:
+        print("Failed to load pre-trained model for test")
+        raise
+    model.cuda()
+    model.eval()
+    with torch.no_grad():
+        img = cv2.imread('data/test01.jpg')
+        faces = detect_faces(model, img)
+        assert len(faces) == 11
diff --git a/wider_eval_pytorch.py b/wider_eval_pytorch.py
index b984299..be4bfc6 100644
--- a/wider_eval_pytorch.py
+++ b/wider_eval_pytorch.py
@@ -16,38 +16,7 @@
 from net_s3fd import s3fd
 from bbox import *
 
-
-def detect(net,img):
-    img = img - np.array([104,117,123])
-    img = img.transpose(2, 0, 1)
-    img = img.reshape((1,)+img.shape)
-
-    img = Variable(torch.from_numpy(img).float(),volatile=True).cuda()
-    BB,CC,HH,WW = img.size()
-    olist = net(img)
-
-    bboxlist = []
-    for i in range(len(olist)/2): olist[i*2] = F.softmax(olist[i*2])
-    for i in range(len(olist)/2):
-        ocls,oreg = olist[i*2].data.cpu(),olist[i*2+1].data.cpu()
-        FB,FC,FH,FW = ocls.size() # feature map size
-        stride = 2**(i+2) # 4,8,16,32,64,128
-        anchor = stride*4
-        for Findex in range(FH*FW):
-            windex,hindex = Findex%FW,Findex//FW
-            axc,ayc = stride/2+windex*stride,stride/2+hindex*stride
-            score = ocls[0,1,hindex,windex]
-            loc = oreg[0,:,hindex,windex].contiguous().view(1,4)
-            if score<0.05: continue
-            priors = torch.Tensor([[axc/1.0,ayc/1.0,stride*4/1.0,stride*4/1.0]])
-            variances = [0.1,0.2]
-            box = decode(loc,priors,variances)
-            x1,y1,x2,y2 = box[0]*1.0
-            # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
-            bboxlist.append([x1,y1,x2,y2,score])
-    bboxlist = np.array(bboxlist)
-    if 0==len(bboxlist): bboxlist=np.zeros((1, 5))
-    return bboxlist
+from detect_faces import detect
 
 def flip_detect(net,img):
     img = cv2.flip(img, 1)
@@ -134,4 +103,4 @@ def scale_detect(net,img,scale=2.0,facesize=None):
             x1,y1,x2,y2,s = b
             f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(x1,y1,(x2-x1+1),(y2-y1+1),s))
         f.close()
-        print('event:%d num:%d' % (index + 1, num + 1))
\ No newline at end of file
+        print('event:%d num:%d' % (index + 1, num + 1))
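
For reference, a minimal sketch of how the refactored pieces fit together (it mirrors
test_integration.py and livecam.py above, and assumes the pre-trained s3fd_convert.pth
weights and a CUDA-capable device):

    import cv2
    import torch

    from detect_faces import detect_faces
    from net_s3fd import S3fd_Model

    net = S3fd_Model()  # the old lowercase name `s3fd` still works as an alias
    net.load_state_dict(torch.load("s3fd_convert.pth"))
    net.cuda()
    net.eval()

    with torch.no_grad():
        img = cv2.imread("data/test01.jpg")  # any BGR image from cv2 works
        faces = detect_faces(net, img)       # list of (x1, y1, x2, y2, score) tuples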