Resolve NaN issues for #127 (hopefully)

cmusatyalab · Jun 13, 2016 · fea2fb4 · fea2fb4
1 parent aaab8fe
commit fea2fb4
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 11 deletions.
diff --git a/training/main.lua b/training/main.lua
@@ -17,11 +17,6 @@ if opt.cuda then
    cutorch.setDevice(1)
 end
 
-if opt.cudnn then
-   print("\nThere is a known issue with cudnn training! See:")
-   print("  https://github.com/cmusatyalab/openface/issues/127\n\n")
-end
-
 torch.save(paths.concat(opt.save, 'opts.t7'), opt, 'ascii')
 print('Saving everything to: ' .. opt.save)
 

diff --git a/training/model.lua b/training/model.lua
@@ -38,7 +38,7 @@ if opt.cuda then
    criterion:cuda()
 end
 
--- optimizeNet(model, opt.imgDim)
+optimizeNet(model, imgDim)
 
 print('=> Model')
 print(model)

diff --git a/training/train.lua b/training/train.lua
@@ -87,13 +87,34 @@ function train()
 
    collectgarbage()
 
-   local nnModel = model:float():clearState()
+   -- Fix nans from https://github.com/cmusatyalab/openface/issues/127
+   local function fixNans(x, tag)
+      local I = torch.ne(x,x)
+      if torch.any(I) then
+         print("Correcting NaNs in: ", tag)
+         x[I] = 0.0
+      end
+   end
+
+   for i, mod in ipairs(model:listModules()) do
+      if torch.typename(mod) == 'nn.SpatialBatchNormalization' then
+         fixNans(mod.running_mean, string.format("%d-%s-%s", i, mod, 'running_mean'))
+         fixNans(mod.running_var, string.format("%d-%s-%s", i, mod, 'running_var'))
+      end
+   end
+
    if opt.cudnn then
-      cudnn.convert(nnModel, nn)
+      cudnn.convert(model, nn)
    end
+   model = model:float():clearState()
 
-   torch.save(paths.concat(opt.save, 'model_' .. epoch .. '.t7'), nnModel)
+   torch.save(paths.concat(opt.save, 'model_' .. epoch .. '.t7'), model)
    torch.save(paths.concat(opt.save, 'optimState_' .. epoch .. '.t7'), optimState)
+
+   model = model:cuda()
+   if opt.cudnn then
+      cudnn.convert(model, cudnn)
+   end
    collectgarbage()
 end -- of train()
 

diff --git a/training/util.lua b/training/util.lua
@@ -58,14 +58,15 @@ end
 --Reduce the memory consumption by model by sharing the buffers
 function optimizeNet( model, inputSize )
    local optnet_loaded, optnet = pcall(require,'optnet')
-   if  optnet_loaded then
+   if optnet_loaded then
       local opts   = {inplace=true, mode='training', removeGradParams=false}
       local input  = torch.Tensor(1,3,inputSize,inputSize)
       if opt.cuda then
           input = input:cuda()
       end
       optnet.optimizeMemory(model, input, opts)
    else
-      print("'optnet' package not found, please install it to reduce the memory consumption, repo https://github.com/fmassa/optimize-net")
+      print("'optnet' package not found, install it to reduce the memory consumption.")
+      print("Repo: https://github.com/fmassa/optimize-net")
    end
 end