In [118]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import os
import glob
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

%matplotlib inline

In [113]:
train_path = 'aerial-cactus-identification/train/'
classes = (0,1)
num_classes = len(classes)

In [114]:
df_train = pd.read_csv('aerial-cactus-identification/train.csv')

In [115]:
df_train.head(2)

Unnamed: 0,id,has_cactus
0,0004be2cfeaba1c0361d39e2b000257b.jpg,1
1,000c8a36845c0208e833c79c1bffedd1.jpg,1


In [178]:
class cactusTrainData(Dataset): 
    def __init__(self, df, train_path = 'aerial-cactus-identification/train/', transform=None):
        self.data = df
        self.train_path = train_path
        self.transform = transform
        
    def __getitem__(self, index):
        img_path = os.path.join(self.train_path,self.data['id'].values[index])
        img = Image.open(img_path)
        label = self.data['has_cactus'].values[index].tolist()
        if self.transform is not None:
            img = self.transform(img)
        else:
            img_tensor = transforms.ToTensor()
            img = img_tensor(img)
        return img, label
        
    def __len__(self):
        return len(self.data)

In [179]:
class cactusTestData(Dataset): 
    def __init__(self, img_path = 'aerial-cactus-identification/test/*', transform=None):
        self.img_path = glob.glob(img_path)
        self.transform = transform
        
    def __getitem__(self, index):
        img = Image.open(self.img_path[index])
        *_, label = self.img_path[index].split('/')
        if self.transform is not None:
            img = self.transform(img)
        else:
            img_tensor = transforms.ToTensor()
            img = img_tensor(img)
        return img, label
        
    def __len__(self):
        return len(self.img_path)

In [180]:
trainset = cactusTrainData(df_train)
testset = cactusTestData()

In [102]:
#trainset, valset = train_test_split(trainset, test_size=0.2, random_state=42)

In [146]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=0)

In [181]:
#testloader = torch.utils.data.DataLoader(valset, batch_size=4, shuffle=True, num_workers=0)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=0)

In [148]:
class NeuNet(nn.Module):
    def __init__(self):
        super(NeuNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=12, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=12, out_channels=24, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.fc1 = nn.Linear(in_features= 8 * 8 * 24, out_features=10)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=10, out_features=2)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool(x)
        x = x.view(-1, 8 * 8 * 24)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x

In [149]:
net = NeuNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [150]:
for epoch in range(2):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        img, label = data
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(img)
        #print(outputs.type())
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

[1,  2000] loss: 0.458
[1,  4000] loss: 0.215
[2,  2000] loss: 0.164
[2,  4000] loss: 0.140
Finished Training


In [None]:
class_correct = list(0. for i in range(num_classes))
class_total = list(0. for i in range(num_classes))
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        c = (predicted == labels).squeeze()
        for i in range(4):
            label = labels[i]
            class_correct[label] += c[i].item()
            class_total[label] += 1


for i in range(num_classes):
    print('Accuracy of %5s : %2d %%' % (classes[i], 100 * class_correct[i] / class_total[i]))

In [349]:
pre_sub = []
file_sub = []
with torch.no_grad():
    for data in testloader:
        images, label = data
        outputs = net(images)
        _, predicted = torch.min(outputs, 1)
        for file, pre in zip(label, predicted.tolist()):
            pre_sub.append(pre)
            file_sub.append(file)

In [350]:
submission_dict = {'id':file_sub,'has_cactus':pre_sub}

In [351]:
df_sub = pd.DataFrame(submission_dict)
df_sub.set_index('id', inplace=True)
#df_sub.to_csv('first_submission.csv')

In [352]:
#iter(testloader).next()[0].shape

In [353]:
#iter(testloader).next()[0].detach().numpy()

In [354]:
#plt.imshow(iter(testloader).next()[0].permute(1, 2, 0))

In [355]:
df_sub.sort_index(inplace=True)

In [356]:
df_compare = pd.read_csv('alex_submission.csv')

In [357]:
df_compare.loc[df_compare['has_cactus'] >= 0.5,'has_cactus'] = 1
df_compare.loc[df_compare['has_cactus'] < 0.5,'has_cactus'] = 0

In [358]:
df_compare.set_index('id', inplace=True)
df_compare.sort_index(inplace=True)

In [365]:
df_tmp = df_sub == df_compare

In [360]:
#df_sub.equals(df_compare)

False

In [369]:
df_tmp[df_tmp['has_cactus']==False]

Unnamed: 0_level_0,has_cactus
id,Unnamed: 1_level_1
0127044dfc88dfaed0118c8764909800.jpg,False
02e8d60f2699f2fbf1611edbe1657c10.jpg,False
02f33fce768ce764cea1a14c5ef0fe1b.jpg,False
03575f3071e381afa7ce6cee4aed9193.jpg,False
036feabe3bb928071be0308c61f387f3.jpg,False
...,...
fabca50fa61ab8561038974fd383bca9.jpg,False
fb96e5cef979ba23010129acfc05a29a.jpg,False
fd6fc9105684632afc853c4c8542026d.jpg,False
fdd85be2ebe14d32613e48dd3af2dbf1.jpg,False


In [378]:
print(str(100 -len(df_tmp[df_tmp['has_cactus']==False])/len(df_sub)*100) + '%')

95.2%
