In [1]:
! tar -xf DEMO1_Resume_Images.zip

In [2]:
import torch
import torch.nn as nn
from torch.optim import ASGD, SGD
from torch.optim.lr_scheduler import CyclicLR, MultiStepLR, StepLR
from torch.utils.data import DataLoader

import torchvision.models as vision_models

from torchvision import datasets, transforms


# to convert image to PIL format
import cv2
import PIL.Image as Image

import time
import copy
import pandas as pd
import numpy as np
import os



# from pyspark import SparkContext, SQLContext
# from pyspark.sql.functions import UserDefinedFunction, lit, col,\
# monotonically_increasing_id,size, split, udf, when,  lower
# from pyspark.sql.types import StringType, LongType


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv("Resume_Image_Classifier_Data - Sheet3.csv")

In [5]:
df.head(5)

Unnamed: 0,ID,file_name,Format,Content
0,0,0-Cardiovascular_Nurse-Adult_Nurse_Practitione...,fancy,resume
1,1,0-Cardiovascular_Nurse-Adult_Nurse_Practitione...,fancy,resume
2,2,0-Cardiovascular_Nurse-Adult_Nurse_Practitione...,non_fancy,not_resume
3,3,1-Cardiovascular_Nurse-Advanced_Registered_Nur...,fancy,resume
4,4,1-Cardiovascular_Nurse-Advanced_Registered_Nur...,non_fancy,not_resume


In [6]:
train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 1234)

In [7]:
# train_df.loc[:, 'file_name'] = train_df['file_name'].apply(lambda x: x.lower() )
# test_df.loc[:, 'file_name'] = test_df['file_name'].apply(lambda x: x.lower() )

Cardiovascular_nurses_train_filter = train_df[train_df['file_name'].apply(lambda x: x.lower() ).str.contains('cardiovascular_nurse')]
Cardiovascular_nurses_train = Cardiovascular_nurses_train_filter.to_dict('records')

Certified_nurses_train_filter = train_df[train_df['file_name'].apply(lambda x: x.lower() ).str.contains('certified_registered_nurse')]
Certified_nurses_train = Certified_nurses_train_filter.to_dict('records')

Clinical_nurses_train_filter = train_df[train_df['file_name'].apply(lambda x: x.lower() ).str.contains('clinical_nurse_specialist')]
Clinical_nurses_train = Clinical_nurses_train_filter.to_dict('records')


Cardiovascular_nurses_test_filter = test_df[test_df['file_name'].apply(lambda x: x.lower() ).str.contains('cardiovascular_nurse')]
Cardiovascular_nurses_test = Cardiovascular_nurses_test_filter.to_dict('records')

Certified_nurses_test_filter = test_df[test_df['file_name'].apply(lambda x: x.lower() ).str.contains('certified_registered_nurse')]
Certified_nurses_test = Certified_nurses_test_filter.to_dict('records')

Clinical_nurses_test_filter = test_df[test_df['file_name'].apply(lambda x: x.lower() ).str.contains('clinical_nurse_specialist')]
Clinical_nurses_test = Clinical_nurses_test_filter.to_dict('records')

In [8]:
train_df.head()

Unnamed: 0,ID,file_name,Format,Content
281,281,216-Clinical_Nurse_specialist-Clinical_Instruc...,non_fancy,resume
73,73,127-Certified_Registered_Nurse_Anesthetist-Cer...,non_fancy,resume
547,547,48-Cardiovascular_Nurse-Nurse_Practitioner(2)_...,non_fancy,resume
62,62,122-Certified_Registered_Nurse_Anesthetist-Cer...,non_fancy,not_resume
414,414,262-Clinical_Nurse_specialist-Medical_Manageme...,fancy,resume


In [9]:
cardiovascular_img_path = 'DEMO1_Resume_Images/Cardiovascular_Nurse/'

certified_img_path = 'DEMO1_Resume_Images/Certified_Registered_Nurse_Anesthetist/'

clinical_img_path = 'DEMO1_Resume_Images/Clinical_Nurse_specialist/'



train_data_structure = [(cardiovascular_img_path, Cardiovascular_nurses_train),
                 (certified_img_path, Certified_nurses_train),
                 (clinical_img_path, Clinical_nurses_train)]



test_data_structure = [(cardiovascular_img_path, Cardiovascular_nurses_test),
                 (certified_img_path, Certified_nurses_test),
                 (clinical_img_path, Clinical_nurses_test)]






data_transforms = {
    
    'train': transforms.Compose([
        transforms.Resize((512,256)),
        
        transforms.ToTensor()       
        ,
        transforms.Normalize([0.4122], [0.3845], [0.3541])
        ,
        
    ]),
    'val': transforms.Compose([

        transforms.Resize((512,256)),
        transforms.ToTensor()
        ,
        transforms.Normalize([0.2254], [0.1984], [0.1752])
        
    ]),
}

actual_labels = [("fancy","resume"),
                ("fancy", "not_resume"),
                ("non_fancy", "resume"),
                ("non_fancy", "not_resume")]


mlb = MultiLabelBinarizer()

mlb.fit(actual_labels)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [10]:
mlb.classes_

array(['fancy', 'non_fancy', 'not_resume', 'resume'], dtype=object)

In [11]:


def get_array_rep(data_structure, mode = 'train'):
    
    if mode is 'train':
      
      nrows = len(train_df)
      
      features = torch.empty(*(nrows, 3, 512, 256), dtype = torch.float)

      labels = torch.empty(*(nrows,4), dtype = torch.float)
      
      
    elif mode is 'val':
        
        nrows = len(test_df)
        
        features = torch.empty(*(nrows, 3, 512, 256), dtype = torch.float)
        
        labels = torch.empty(*(nrows, 4), dtype = torch.float)
        

    ind = 0
    
    for (file_path, records) in data_structure:
      
      
        for rec in records:
            
            if os.path.exists(rec['file_name']):
            
                img = Image.open(file_path  + rec['file_name'] ).convert('RGB')

                img = data_transforms[mode](img)

                classes = ( rec['Format'], rec['Content'] )

                features[ind] = img

                labels[ind] = torch.from_numpy( mlb.transform([classes])  )

                ind +=1
            
            
    
    
    return (features,labels  )

In [12]:
train_features, train_targets = get_array_rep(train_data_structure)

In [13]:
len(train_features), type(train_features)

(533, torch.Tensor)

In [14]:
test_features, test_targets  = get_array_rep(test_data_structure, mode  = 'val')

In [15]:
train_df.count()

ID           533
file_name    533
Format       533
Content      533
dtype: int64

In [16]:
train_data = [( train_features[ind], train_targets[ind] ) for ind in range(len(train_df) )]

test_data = [( test_features[ind], test_targets[ind]   ) for ind in range(len(test_df) )]

In [17]:
train_loader = DataLoader(train_data, batch_size = 24, shuffle  = True)


test_loader = DataLoader(test_data, batch_size = 5, shuffle = True)

dataloaders= {'train': train_loader, 'val': test_loader }

In [18]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

#     best_model_wts = copy.deepcopy(model.state_dict())
    
    best_loss = 0.0
    
    
    for phase in dataloaders.keys():
      
        if phase is 'train':

            model.train()

        else:

            model.eval()

            for epoch in range(num_epochs):
                running_loss = 0.0
                running_acc = 0

                for (inputs, labels) in dataloaders[phase]:

                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    optimizer.zero_grad()

                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        
                        
                        loss = criterion(outputs, labels)


                        if phase is 'train':

                            loss.backward()
                            optimizer.step()

                            scheduler.step()
                       
                        running_loss += loss.item() 


                        epoch_loss = running_loss / len( dataloaders[phase] )


                print('{} Loss: {:.4f} '.format(
                      phase, epoch_loss))

                  # deep copy the model
                if phase == 'val' and epoch_loss < best_loss:
                    best_loss = epoch_loss





    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    

    # load best model weights
#     model.load_state_dict(best_model_wts)
    return model
      
      
      
    
    

In [19]:
model_ft = vision_models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 4)

model_ft = model_ft.to(device)


criterion = nn.BCEWithLogitsLoss()



# Observe that all parameters are being optimized
optimizer_ft = SGD(model_ft.parameters(),
                         lr=1e-3, momentum=0.8,
                  nesterov = True)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = StepLR(optimizer_ft,
                                       step_size=4, gamma=0.1)

In [20]:
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=15)

val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
val Loss: nan 
Training complete in 0m 13s
