# Importing the Required Modules

In [None]:
# Standard Libraries
import re
import pickle

# Data Libraries
import pandas as pd
import numpy as np

# Functionality
from typing import List, Dict, Union

In [None]:
from torch.utils.data.dataset import Dataset
from torchvision import transforms
import torchvision.models as models
import torch.optim as optim
import torch.nn as nn
import torch
import time
import os
import copy
import argparse

In [None]:
from pathlib import Path
# Base directory under which this notebook reads the dataset CSV and writes
# the trained model artifacts.
PATH=Path("drive/MyDrive/ACS_AI_A1/") 

In [None]:
# Shared research folder; `data_dir` further down is built from it.
PATH_SHARED=Path("drive/MyDrive/research/") 
# IPython shell escape (notebook-only syntax): list the folder's contents.
!ls $PATH_SHARED

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Loading the Dataset

In [None]:
# Load the combined image index; the following cells read its `photo_path`
# and `sarcastic` columns.
data_news_all = pd.read_csv(f"{PATH}/Research/Combined/sarcastic_nonsarcastic_img.csv")

In [None]:
# Extract the image paths and their sarcasm labels as plain arrays.
images, label = (
    data_news_all[column].values for column in ("photo_path", "sarcastic")
)

In [None]:
# Keep a reproducible random 13% subsample (seed 42) and rebind `images`/`label`
# to it. The discarded 87% lands in image_rem/label_rem, which the next split
# overwrites.
images, image_rem, label, label_rem = train_test_split(images, label, train_size=0.13, random_state=42)

In [None]:
# First split: 80% of the subsample for training, 20% held out.
image_train, image_rem, label_train, label_rem = train_test_split(images, label, train_size=0.8, random_state=42) 
# Second split: the held-out 20% is divided 40/60 into validation and test,
# i.e. roughly 8% validation and 12% test of the subsample.
image_valid, image_test, label_valid, label_test = train_test_split(image_rem, label_rem, test_size=0.6, random_state=42) 

In [None]:
# Report the array shapes of every split (images first, then labels).
for split_title, split_images, split_labels in (
    ("training", image_train, label_train),
    ("val", image_valid, label_valid),
    ("test", image_test, label_test),
):
    print('Shape of {} data: '.format(split_title))
    print(split_images.shape)
    print(split_labels.shape)

In [None]:
# Two-column frame for the training split: image path first, label second
# (CustomDatasetFromCSV reads the label from column position 1).
dataset_train = dict(photo_path=image_train, image_label=label_train)
dataframe_train = pd.DataFrame(data=dataset_train)

In [None]:
# Two-column frame for the test split: image path first, label second
# (CustomDatasetFromCSV reads the label from column position 1).
dataset_test = dict(photo_path=image_test, image_label=label_test)
dataframe_test = pd.DataFrame(data=dataset_test)

In [None]:
# Two-column frame for the validation split: image path first, label second
# (CustomDatasetFromCSV reads the label from column position 1).
dataset_valid = dict(photo_path=image_valid, image_label=label_valid)
dataframe_valid = pd.DataFrame(data=dataset_valid)

In [None]:
from PIL import Image
class CustomDatasetFromCSV(Dataset):
    """Image-classification dataset backed by a DataFrame of paths and labels.

    The frame must expose a ``photo_path`` column with image file paths and
    carry the labels in its second column (position 1).
    """

    def __init__(self, csv, transforms=None):
        """Store the frame and cache its labels.

        Args:
            csv: An already-loaded DataFrame (not a file path) laid out as
                described in the class docstring.
            transforms: Optional callable applied to each RGB PIL image.
        """
        self.data = csv
        self.labels = np.asarray(self.data.iloc[:, 1])
        self.transforms = transforms

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at position ``index``.

        ``image`` is the output of ``transforms`` when one was supplied,
        otherwise the RGB PIL image itself.
        """
        single_image_label = self.labels[index]
        # Positional lookup, matching how the labels are indexed, so path and
        # label stay aligned even if the frame has a non-default index.
        single_image_path = self.data["photo_path"].iloc[index]

        # The context manager closes the file handle; convert() loads the
        # pixels first and handles palette/grayscale/CMYK sources correctly,
        # which a raw uint8 array round trip does not.
        with Image.open(single_image_path) as source_image:
            image = source_image.convert('RGB')

        if self.transforms is not None:
            image = self.transforms(image)
        return (image, single_image_label)

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.data.index)

In [None]:
# Side length (pixels) of the square crops fed to the network. Defined here
# because this cell runs before the model-setup cell that also assigns it, so
# a top-to-bottom run no longer hits a NameError.
input_size = 224

# Training pipeline: random crop/flip augmentation, tensor conversion, then
# per-channel normalisation with the ImageNet mean/std that torchvision's
# pretrained models expect.
transformations = transforms.Compose([
    transforms.RandomResizedCrop(input_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])


In [None]:
# Deterministic preprocessing for validation/test: a fixed resize + centre
# crop replaces the random augmentations so repeated evaluations agree. The
# last two steps of the training pipeline (ToTensor, Normalize) are reused so
# both pipelines always share the same scaling.
eval_transformations = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    *transformations.transforms[-2:],
])

# Training data is reshuffled every epoch; evaluation order stays fixed.
train_dataset = CustomDatasetFromCSV(dataframe_train, transformations)
loader_train = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=10,
                                           shuffle=True)
test_dataset = CustomDatasetFromCSV(dataframe_test, eval_transformations)
loader_test = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=10,
                                          shuffle=False)
valid_dataset = CustomDatasetFromCSV(dataframe_valid, eval_transformations)
loader_valid = torch.utils.data.DataLoader(dataset=valid_dataset,
                                           batch_size=10,
                                           shuffle=False)

# Resnet

In [None]:
# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [None]:
# Top-level image directory, built from PATH_SHARED. The original note states
# it is assumed to follow the ImageFolder directory structure.
# NOTE(review): nothing in the visible part of this file reads data_dir.
data_dir = f"{PATH_SHARED}/images"


# Architecture label. Options listed by the original author:
# [resnet, alexnet, vgg, squeezenet, densenet, inception]
# NOTE(review): not referenced elsewhere in the visible code.
model_name = "resnet"

# Number of classes in the dataset
num_classes = 2

# Batch size for training (change depending on how much memory you have).
# NOTE(review): the DataLoaders built earlier in this file pass batch_size=10
# literally and do not read this value.
batch_size = 64

# Number of epochs to train for.
# NOTE(review): the train_model call further down passes num_epochs=3 literally.
num_epochs = 3

# Flag for feature extracting. When False, we finetune the whole model,
#   when True we only update the reshaped layer params
feature_extract = True

In [None]:
# Start from the ImageNet-pretrained ResNet-50.
resnet50 = models.resnet50(pretrained=True)

# In feature-extraction mode freeze the whole backbone so that only the new
# classification head (created below) receives gradient updates.
if feature_extract:
    for param in resnet50.parameters():
        param.requires_grad = False

# Replace the final fully connected layer with one sized for this dataset.
# A freshly constructed layer has requires_grad=True by default.
num_ftrs = resnet50.fc.in_features
resnet50.fc = nn.Linear(num_ftrs, num_classes)

# Place the model on the selected device; on a GPU runtime it would otherwise
# stay on the CPU while the input batches are moved to the GPU.
resnet50 = resnet50.to(device)

input_size = 224

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [None]:
# Cross-entropy loss on the classifier logits, optimised with momentum SGD.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=resnet50.parameters(), lr=1e-3, momentum=0.9)

In [None]:
def _module_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def _run_epoch(model, loader, criterion, run_device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    The pass is a training pass when ``optimizer`` is given, otherwise a
    gradient-free evaluation pass in eval mode.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    total_correct = 0
    total_seen = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(run_device)
            labels = labels.to(run_device)

            if training:
                # zero the parameter gradients
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            preds = outputs.argmax(dim=1)
            batch_count = inputs.size(0)
            # criterion returns the batch mean, so weight it by the batch size.
            total_loss += loss.item() * batch_count
            total_correct += (preds == labels).sum().item()
            total_seen += batch_count

    if total_seen == 0:
        # Empty loader: report zeros instead of dividing by zero.
        return 0.0, 0.0
    return total_loss / total_seen, total_correct / total_seen


def train_model(model, dataloaders, criterion, optimizer, num_epochs=2, phase = "train", verbose=True, val_dataloader=None):
    """Train ``model`` and return it together with a per-epoch accuracy history.

    Args:
        model: Network to train; batches are moved to the device its
            parameters live on.
        dataloaders: Iterable of ``(inputs, labels)`` training batches.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimizer stepping the model's trainable parameters.
        num_epochs: Number of passes over ``dataloaders``.
        phase: Label used only in the per-epoch log line.
        verbose: When False, suppress all progress output.
        val_dataloader: Optional loader evaluated after every epoch. When
            given, the weights of the epoch with the best validation accuracy
            are restored before returning.

    Returns:
        ``(model, history)`` where ``history`` holds one accuracy (float) per
        epoch: validation accuracy when ``val_dataloader`` is given, training
        accuracy otherwise. Without a validation loader the model keeps the
        weights of the final epoch.
    """
    since = time.time()
    run_device = _module_device(model)

    val_acc_history = []
    tracked_phase = "train" if val_dataloader is None else "val"

    # Only populated when a validation loader allows a meaningful comparison;
    # restoring a pre-training snapshot would throw the training away.
    best_model_wts = None
    best_acc = 0.0

    for epoch in range(num_epochs):
        if verbose:
            print('Epoch {}/{}'.format(epoch, num_epochs - 1))
            print('-' * 10)

        epoch_loss, epoch_acc = _run_epoch(model, dataloaders, criterion, run_device, optimizer)
        if verbose:
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

        if val_dataloader is None:
            tracked_acc = epoch_acc
            best_acc = max(best_acc, epoch_acc)
        else:
            val_loss, tracked_acc = _run_epoch(model, val_dataloader, criterion, run_device)
            if verbose:
                print('{} Loss: {:.4f} Acc: {:.4f}'.format(tracked_phase, val_loss, tracked_acc))
            if best_model_wts is None or tracked_acc > best_acc:
                best_acc = tracked_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        val_acc_history.append(tracked_acc)

        if verbose:
            print()

    if verbose:
        # Integer divmod avoids outputs such as "67m 60s" caused by rounding
        # the fractional seconds up.
        minutes, seconds = divmod(int(time.time() - since), 60)
        print('Training complete in {:.0f}m {:.0f}s'.format(minutes, seconds))
        print('Best {} Acc: {:4f}'.format(tracked_phase, best_acc))

    # load best model weights (only when a validation snapshot exists)
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [None]:
def evaluate_loop(model, test_dataset):
    """Classify every sample of ``test_dataset`` and compute summary metrics.

    Args:
        model: Classifier returning one row of class scores per input image.
        test_dataset: Indexable dataset with ``len()`` whose items are
            ``(image_tensor, label)`` pairs; each image tensor is fed to the
            model as a batch of one.

    Returns:
        ``{'label': {...}}`` with accuracy, macro-averaged F1/precision/recall,
        the confusion matrix and the text classification report.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    label_pred = []
    label_true = []

    # Evaluate in eval mode (fixed batch-norm statistics, no dropout) and
    # without autograd bookkeeping; restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in range(len(test_dataset)):
                # Fetch each sample once so the image is loaded a single time.
                image, label = test_dataset[i]
                # unsqueeze adds the batch dimension for any image size.
                label_outputs = model(image.unsqueeze(0).to(run_device))
                label_pred.append(int(label_outputs.argmax(dim=1).item()))
                label_true.append(label)
    finally:
        model.train(was_training)

    results = {
        'label': {
            'accuracy': metrics.accuracy_score(label_true, label_pred),
            'f1_score': metrics.f1_score(label_true, label_pred, average='macro'),
            'precision': metrics.precision_score(label_true, label_pred, average='macro'),
            'recall': metrics.recall_score(label_true, label_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(label_true, label_pred),
            'report': metrics.classification_report(label_true, label_pred),
        }
    }

    return results

In [None]:
def predict(model, test_image_tensor):
    """Return the index of the highest-scoring class for each input row.

    Puts ``model`` in eval mode, runs it without gradient tracking and prints
    the predicted class of the first sample.

    Args:
        model: Classifier returning a ``(batch, classes)`` score tensor.
        test_image_tensor: Batched input accepted by ``model``.

    Returns:
        Integer tensor of shape ``(batch, 1)`` holding the predicted class
        index per sample.
    """
    model.eval()
    with torch.no_grad():
        out = model(test_image_tensor)
        # Rank the raw scores directly: exponentiating them first does not
        # change the ordering but overflows to inf for large values, turning
        # distinct scores into ties.
        _, topclass = out.topk(1, dim=1)
    print("Output class :  ", topclass[0, 0].item())
    return topclass

In [None]:
# Train on the training loader for three epochs; only the training loader is
# passed, so no validation data is involved in this call.
model, val_acc_history = train_model(resnet50, loader_train, criterion, optimizer, num_epochs=3)

Epoch 0/2
----------
train Loss: 0.5079 Acc: 0.7545

Epoch 1/2
----------
train Loss: 0.4196 Acc: 0.8115

Epoch 2/2
----------
train Loss: 0.3918 Acc: 0.8237

Training complete in 67m 60s
Best val Acc: 0.000000


## Saving the model

In [None]:
# Persist only the parameters (state_dict).
# NOTE(review): the file is written by torch.save, so despite the .h5
# extension it is not an HDF5 file.
torch.save(model.state_dict(), f"{PATH}/Research/Resnet/weights_cpu.h5")

In [None]:
# Persist the whole module object (architecture + weights) in one file.
torch.save(model, f"{PATH}/Research/Resnet/model_cpu.pth")

In [None]:
# Additionally export a TorchScript version of the model.
model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save(f"{PATH}/Research/Resnet/model_cpu.pt") # Save


In [None]:
# Reload the fully pickled model from the file written by the save cell above.
# PATH on its own is the project directory, not a model file.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules; confirm the installed version before
# relying on this call.
model = torch.load(f"{PATH}/Research/Resnet/model_cpu.pth")
# Switch to inference mode (fixed batch-norm statistics, no dropout).
model.eval()

## Evaluate results

In [None]:
# Score the model on the held-out test dataset and display the metrics dict.
results = evaluate_loop(model, test_dataset)
results

{'label': {'accuracy': 0.4751066856330014,
  'confusion_matrix': array([[119, 238],
         [131, 215]]),
  'f1_score': 0.465132486448258,
  'precision': 0.47530684326710815,
  'recall': 0.47736030828516374,
  'report': '              precision    recall  f1-score   support\n\n           0       0.48      0.33      0.39       357\n           1       0.47      0.62      0.54       346\n\n    accuracy                           0.48       703\n   macro avg       0.48      0.48      0.47       703\nweighted avg       0.48      0.48      0.46       703\n'}}