# ResNet Transfer for NHTS Images

This script practices the following things in PyTorch. \
1) data preprocessing with data loader. \
2) Simple but quick visualization tools. \
3) ResNet transfer learning for RGB images. \
Shenhao completed these tasks. He also found that the RGB satellite images look like random noise...

Finding: 
ResNet with pretraining (finetuning the last layer vs. training the whole network). Performance: baseline + 1% \
ResNet with pretraining and allowing the training of the whole network. Severe overfitting in 20 epoches: baseline - 4%. Perfect in-sample performance, but bad out-of-sample performance. \
Naive CNN models for the RGB and BW images, the performance improve by about 2% (from 37% to 39%). \


In [1]:
# ! pip3 list

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import util
from scipy import stats
import copy

# torch model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torchvision
from torchvision import datasets, models, transforms
from PIL import Image

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

In [4]:
import statsmodels.api as sm
# import statsmodels

In [5]:
# ALWAYS choose devise first.
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Helper Functions

In [11]:
def initialize_data(image_type, output_var, output_type, input_var, BE_var, num_categories, size):
    # outputs: randonmized training and testing sets for NHTS, BE, images, and y.
    
    ### read image array
    if image_type == 'rgb':
        image_array_ = np.load("data_shenhao/nhts/image_array_rgb_tract_large.npy", mmap_mode='r')
        image_array = image_array_[:size,]
    elif image_type == 'bw':
        image_array_ = np.load("data_shenhao/nhts/image_array_bw_tract_large.npy", mmap_mode='r')
        image_array = image_array_[:size,]        
    elif image_type == 'merge':
        bw_image_array_ = np.load("data_shenhao/nhts/image_array_bw_tract_large.npy", mmap_mode='r')
        rgb_image_array_ = np.load("data_shenhao/nhts/image_array_rgb_tract_large.npy", mmap_mode='r')
        bw_image_array = bw_image_array_[:size,]
        rgb_image_array = rgb_image_array_[:size,]
        image_array = np.concatenate([rgb_image_array, bw_image_array], axis=1)
    
    ### create output array
    df_ = pd.read_csv("data_shenhao/nhts/df_merged_tract_large.csv")
    df = df_.iloc[:size,]
    y_ = df[output_var].values 
    # cut y into categories for discrete variables
    if output_type == 'continuous':
        y = copy.deepcopy(y_)
    elif output_type == 'discrete':
        y = np.array(pd.qcut(y_, q = num_categories, labels=np.arange(num_categories))) 
    x = df[input_var]
    BE = df[BE_var]
            
    ### randomization
    shuffle_idx = np.arange(size)
    np.random.seed(0) # important: don't change the seed number, unless the seed number across scripts are all changed.
    np.random.shuffle(shuffle_idx)
    train_ratio = 0.8

    ###
    # y
    if output_type == 'discrete':
        y_train = y[shuffle_idx[:int(train_ratio*size)]].astype("int")
        y_test = y[shuffle_idx[int(train_ratio*size):]].astype("int")
    elif output_type == 'continuous':
        y_train = y[shuffle_idx[:int(train_ratio*size)]].astype("float32")
        y_test = y[shuffle_idx[int(train_ratio*size):]].astype("float32")
    # BE
    BE_train = BE.values[shuffle_idx[:int(train_ratio*size)]].astype("float32")
    BE_test = BE.values[shuffle_idx[int(train_ratio*size):]].astype("float32")        
    # image array
    x_train_images = image_array[shuffle_idx[:int(train_ratio*size)],].astype("float32")
    x_test_images = image_array[shuffle_idx[int(train_ratio*size):],].astype("float32")
    # NHTS
    x_train = x.values[shuffle_idx[:int(train_ratio*size)]].astype("float32")
    x_test = x.values[shuffle_idx[int(train_ratio*size):]].astype("float32")
    
    return y_train,y_test,BE_train,BE_test,x_train,x_test,x_train_images,x_test_images

# # test 
# image_type = 'bw'
# output_var = 'HHFAMINC_mean'
# output_type = 'continuous'
# input_var=['R_AGE_IMP_mean', 'HHSIZE_mean', 'HHFAMINC_mean', 'HBHTNRNT_mean', 'HBPPOPDN_mean', 'HBRESDN_mean', 
#            'R_SEX_IMP_2_mean', 'EDUC_2_mean', 'HH_RACE_2_mean', 'HOMEOWN_1_mean', 'HOMEOWN_2_mean',
#            'HBHUR_R_mean', 'HBHUR_S_mean', 'HBHUR_T_mean','HBHUR_U_mean']
# BE_var = ['density', 'diversity', 'design']
# num_categories = 1 # (1) certain category values can cause errors. (2) when output_type = 'continuous', this value needs to be 1.
# size = 10000 # size needs to be smaller than the max
# # 
# y_train,y_test,BE_train,BE_test,x_train,x_test,x_train_images,x_test_images = \
#     initialize_data(image_type, output_var, output_type, input_var, BE_var, num_categories, size)


In [7]:
size = 1000
# Important trick: directly reading files from dropbox - Change the default "dl=0" to "dl=1"!
# e.g. initial link: https://www.dropbox.com/s/sxakfluvdol2pwl/df_merged_tract_large.csv?dl=0
df_ = pd.read_csv("https://www.dropbox.com/s/sxakfluvdol2pwl/df_merged_tract_large.csv?dl=1")
df = df_.iloc[:size,]

In [15]:
image_array_ = np.load("https://www.dropbox.com/s/ujssiypek3ur7n4/image_array_bw_tract_large.npy")

FileNotFoundError: [Errno 2] No such file or directory: 'https://www.dropbox.com/s/ujssiypek3ur7n4/image_array_bw_tract_large.npy'

In [2]:
import os

In [3]:
import dropbox

In [4]:
# try dropbox
dbx = dropbox.Dropbox("sl.AdZzc_dGfmP76w5Jo2rILedhehyQLx0wy-tstsCGRgVrS1k-_szPoqVn1EQ0rgsQ178-aNV-pTvk2bViMaB2SfL25dy6Gya_nS6K2Nw1Dy-euKEDmbjKJVhx5lkHUhribdK_yg4")


In [5]:
dbx.users_get_current_account()

FullAccount(account_id='dbid:AACzRDQxkCMv7GDjiddxbH2YEBZZHGPwJi4', name=Name(given_name='shenhao', surname='Wang', familiar_name='shenhao', display_name='shenhao Wang', abbreviated_name='SW'), email='cjsyzwsh@gmail.com', email_verified=True, disabled=False, locale='en', referral_link='https://www.dropbox.com/referrals/AAB8SrgZtaMmkPZZfvPgywyvECi01CO0RmA?src=app9-8130080', is_paired=True, account_type=AccountType('pro', None), root_info=UserRootInfo(root_namespace_id='184231950', home_namespace_id='184231950'), profile_photo_url='https://dl-web.dropbox.com/account_photo/get/dbaphid%3AAAAH0tyx42fNkYkz35TnY8FA8-hrKhzBBms?size=128x128&vers=1510322698555', country='US', team=None, team_member_id=None)

In [6]:
for entry in dbx.files_list_folder('').entries:
    print(entry.name)

ESSAYS
BOOKS
MPlus
Books Temp for 2015
My ESSAYS
People
Camera Uploads
Endnote0
16.0
OneNote_backup
2014_3_fall
2014_4_winter
2015_1_Spring
2015_2_Summer
2015_3_Fall
2015_4_Winter
2016_1_Spring
2016_2_Summer
2016_3_Fall
2016_4_Winter
2018_3_fall
2018_3_fall_recording
0_official_doc_shenhao
0_summary_reading_class_topics
2019_1_spring
Pictures_Personal
Pictures_WorkAndProject
2018_1_Spring
2019_3_fall
2019_3_fall_records
2020_1_spring
3_career
9_travel_process
5_books_dailybooks
5_books_textbook
3_teaching
2_personal
4_networking_connections
0_datasets
3_paper_reviews
0_English
Ventilator_certificates_nyc
2020_2_summer
3_project_0_archive
2017_3_fall_records
2017_3_fall
2017_1_spring
2017_1_Spring_Records
2018_1_spring_records
2014_3_fall_Records
2014_4_winter_records
2015_3_Fall_Records
2015_1_Spring_records
2015_2_Summer_records
2019_1_spring_records
coding_latex
coding_general
coding_R
2015 English
2017_2_Summer
2014 English
2017_2_Summer Records
2016_3_Fall_Records
2016_1_Spring_Rec

In [None]:
dbx.files_download_to_file()

In [19]:
os.chdir("https://www.dropbox.com/sh/37ctvcpqpnh0pfp/AABBxCo0rXfQjFqhiFJpzPNia?")

FileNotFoundError: [Errno 2] No such file or directory: 'https://www.dropbox.com/sh/37ctvcpqpnh0pfp/AABBxCo0rXfQjFqhiFJpzPNia?'

In [21]:
import pickle
a = {"tmp": df}

with open('tmp.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
"https://www.dropbox.com/s/ujssiypek3ur7n4/image_array_bw_tract_large.npy?dl=0"

In [14]:
pd.read_table("https://www.dropbox.com/s/ujssiypek3ur7n4/image_array_bw_tract_large.npy?dl=1")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 0: invalid start byte

In [12]:
# test 
image_type = 'bw'
output_var = 'HHFAMINC_mean'
output_type = 'continuous'
input_var=['R_AGE_IMP_mean', 'HHSIZE_mean', 'HHFAMINC_mean', 'HBHTNRNT_mean', 'HBPPOPDN_mean', 'HBRESDN_mean', 
           'R_SEX_IMP_2_mean', 'EDUC_2_mean', 'HH_RACE_2_mean', 'HOMEOWN_1_mean', 'HOMEOWN_2_mean',
           'HBHUR_R_mean', 'HBHUR_S_mean', 'HBHUR_T_mean','HBHUR_U_mean']
BE_var = ['density', 'diversity', 'design']
num_categories = 1 # (1) certain category values can cause errors. (2) when output_type = 'continuous', this value needs to be 1.
size = 3000 # size needs to be smaller than the max
# 
y_train,y_test,BE_train,BE_test,x_train,x_test,x_train_images,x_test_images = \
    initialize_data(image_type, output_var, output_type, input_var, BE_var, num_categories, size)


KeyError: "['HBHTNRNT_mean', 'HBRESDN_mean', 'R_AGE_IMP_mean', 'HBPPOPDN_mean'] not in index"

In [None]:
# read data; It takes 20 minutes without mmap_mode(!!!).
# this mmap_mode is super useful.
x_train_images_ = np.load("data_shenhao/nhts/image_array_rgb_tract_large.npy", mmap_mode = 'r')


# x_train_images_ = np.load("data_shenhao/nhts/x_train_rgb_tract_large.npy", mmap_mode = 'r')
# x_test_images_ = np.load("data_shenhao/nhts/x_test_rgb_tract_large.npy", mmap_mode = 'r')

# x_train_images_ = np.load("data_shenhao/nhts/x_train_bw_images.npy", mmap_mode = 'r')
# x_test_images_ = np.load("data_shenhao/nhts/x_test_bw_images.npy", mmap_mode = 'r')

In [None]:
n = x_train_images_.shape[0]

In [None]:
print(n)

In [None]:
# read y.
y_train_ = np.load("data_shenhao/nhts/y_train.npy")
y_test_ = np.load("data_shenhao/nhts/y_test.npy")

In [None]:
# print("The sample size of training set is: ", x_train_nhts.shape[0])
# print("The sample size of testing set is: ", x_test_nhts.shape[0])
print("Training image shape: ", x_train_images_.shape)
print("Testing image shape: ", x_test_images_.shape)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Data Preparation 

In [None]:
# you need to use only a subset here. The full set is REALLY SLOW for debugging...
size = 2000
x_train_images = x_train_images_[:size, :, :, :]
x_test_images = x_test_images_[:size, :, :, :]
y_train = y_train_[:size]
y_test = y_test_[:size]

In [None]:
y_train[y_train > 2] = 3
y_test[y_test > 2] = 3

In [None]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')
print(np.unique(y_train, return_counts=True)[1]/len(y_train))
print(np.unique(y_test, return_counts=True)[1]/len(y_test))

## Transform & Visualization
Two ways to transform. \
1) Potentially use the torchvision.transforms. However, then you have to iteratre over each image with PIL Image. It can be very slow. 
2) Customize the transformation by yourself in numpy...


In [None]:
# specify data type. 
x_train_images=x_train_images.astype("float32")
x_test_images=x_test_images.astype("float32")
y_train=y_train.astype("int")
y_test=y_test.astype("int")

In [None]:
# transform.
# normalize to [0,1]
x_train_images_norm = x_train_images/255
x_test_images_norm = x_test_images/255

# to torch
x_train_torch = torch.from_numpy(x_train_images_norm)
x_test_torch = torch.from_numpy(x_test_images_norm)
y_train_torch = torch.from_numpy(y_train)
y_test_torch = torch.from_numpy(y_test)

print(x_train_torch.size())
print(x_test_torch.size())
print(y_train_torch.size())
print(y_test_torch.size())

In [None]:
# use data loader: train and test. 
train_ds = TensorDataset(x_train_torch, y_train_torch)
batch_size = 200
train_dl = DataLoader(train_ds, batch_size, shuffle = True)

test_ds = TensorDataset(x_test_torch, y_test_torch)
batch_size = 200
test_dl = DataLoader(test_ds, batch_size, shuffle = True)

In [None]:
# visualize with torchvision.
images_, labels_ = iter(train_dl).next()
images = images_[:5,] # visualize five images
labels = labels_[:5] # visualize five images

def imshow(img):
#     img = img * 255.0     # unnormalize
    npimg = img.numpy()
    plt.figure(figsize = (15,3))
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()
    
imshow(torchvision.utils.make_grid(images))
print(labels.numpy())

In [None]:
# 1. Use naive CNN model
class Net(nn.Module):
    # sw: again. It is critical to understand the dimension transformation.
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5) # sw: change the input channel for data set.
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 47 * 47, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 4)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 47 * 47)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# initialize
net = Net().float().to(device)

# # test
# images, labels = iter(train_dl).next()
# output = net(images)
# print(output.size())

In [None]:
# 2. Use ResNet.
model_ft = models.resnet18(pretrained=False)
for param in model_ft.parameters():
    param.requires_grad = True # Shenhao might change it to True. Train the full set.

# The following line allows us to edit the input channels.
# model_ft.conv1 = nn.Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)    
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 4)
net = model_ft.to(device)

In [None]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
n_epoch = 100

In [None]:
for epoch in range(n_epoch):# loop over the dataset multiple times
    # sw: learn the way of printing out the total loss for each batch.
    running_loss_train = 0.0
    running_loss_test = 0.0
    correct_train = 0
    total_train = 0
    correct_test = 0
    total_test = 0
    
    # training    
    for inputs, labels in train_dl:
        # to device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forward + backward
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        
        # evaluate prediction
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        # optimize
        optimizer.step()
        optimizer.zero_grad()
        
        # statistics
        running_loss_train += loss.item()
        
    # testing
    for inputs, labels in test_dl:
        # to device
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        running_loss_test += loss.item()
        
        # evaluate prediction
        _, predicted = torch.max(outputs.data, 1)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()
        
    print("Epoch {}: Training Loss {}; Testing Loss {}".format(epoch, running_loss_train, running_loss_test))
    print("Epoch {}: Training Accuracy {}; Testing Accuracy {}".format(epoch, correct_train/total_train, correct_test/total_test))

#         if i % 2000 == 1999:    # print every 2000 mini-batches
#             print('[%d, %5d] loss: %.3f' %
#                   (epoch + 1, i + 1, running_loss / 2000))
#             running_loss = 0.0
# print('Finished Training')