# Predictive coding framework 
##Including sequence modeling, action environment interaction and mutiagent language learning


In [83]:
import torch
import numpy as np
from torch.functional import F
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torchvision import transforms
import torchvision.datasets as dst
from torchvision.utils import save_image
import pandas as pd

img_x=240
img_y=320

def conv2D_output_size(img_size, padding, kernel_size, stride):
    # compute output shape of conv2D
    outshape = (np.floor((img_size[0] + 2 * padding[0] - (kernel_size[0] - 1) - 1) / stride[0] + 1).astype(int),
                np.floor((img_size[1] + 2 * padding[1] - (kernel_size[1] - 1) - 1) / stride[1] + 1).astype(int))
    return outshape

#二维卷积
class EncoderCNN(nn.Module):
    def __init__(self, img_x=img_x, img_y=img_y, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        super(EncoderCNN, self).__init__()

        self.img_x = img_x
        self.img_y = img_y
        self.CNN_embed_dim = CNN_embed_dim

        # CNN architechtures
        self.ch1, self.ch2, self.ch3, self.ch4 = 32, 64, 128, 256
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)      # 2d kernal size
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)      # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding

        # conv2D output shapes
        self.conv1_outshape = conv2D_output_size((self.img_x, self.img_y), self.pd1, self.k1, self.s1)  # Conv1 output shape
        self.conv2_outshape = conv2D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)
        self.conv3_outshape = conv2D_output_size(self.conv2_outshape, self.pd3, self.k3, self.s3)
        self.conv4_outshape = conv2D_output_size(self.conv3_outshape, self.pd4, self.k4, self.s4)

        # fully connected layer hidden nodes
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1, padding=self.pd1),
            nn.BatchNorm2d(self.ch1, momentum=0.01),
            nn.ReLU(inplace=True),                      
            # nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2, padding=self.pd2),
            nn.BatchNorm2d(self.ch2, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, stride=self.s3, padding=self.pd3),
            nn.BatchNorm2d(self.ch3, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch3, out_channels=self.ch4, kernel_size=self.k4, stride=self.s4, padding=self.pd4),
            nn.BatchNorm2d(self.ch4, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.drop = nn.Dropout2d(self.drop_p)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(self.ch4 * self.conv4_outshape[0] * self.conv4_outshape[1], self.fc_hidden1)   # fully connected layer, output k classes
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)   # output = CNN embedding latent variables

    def forward(self, x):
        
            # CNNs
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)           # flatten the output of conv

        # FC layers
        x = F.relu(self.fc1(x))
        # x = F.dropout(x, p=self.drop_p, training=self.training)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, p=self.drop_p, training=self.training)
        out = self.fc3(x)
            
        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return out

#class: sequence coding module
#惰性编码单元
class inertiaCodingCell(nn.Module):
    '''
    input:(batch,seq_len,channel,witdth,height)
    output:(batch,code_len,code_dim)
    
    '''
    def __init__(self,n_channel=1,width=img_x,height=img_y):
        super(inertiaCodingCell,self).__init__()
        self.encoder_l=EncoderCNN(CNN_embed_dim=1024)
        self.d_f1=nn.Linear(in_features=1024,out_features=1024)
        self.d_fE=nn.Linear(in_features=1024,out_features=256)
        self.d_fS=nn.Linear(in_features=1024,out_features=256)
        
        self.pred_f1=nn.Linear(in_features=512,out_features=512)
        self.pred_f2=nn.Linear(in_features=512,out_features=256)
        
        self.decoder_fl=nn.Linear(in_features=512,out_features=56*75*3)
        self.decoder_f2=nn.ConvTranspose2d(in_channels=3,out_channels=64,kernel_size=(7,7),stride=2)
        self.decoder_f3=nn.ConvTranspose2d(in_channels=64,out_channels=3,kernel_size=(8,12),stride=2)

                
    def encoder(self,x):
        h=self.encoder_l(x)
        h=self.d_f1(h)
        h=F.relu(h)
        E=self.d_fE(h)
        E=F.relu(E)
        S=self.d_fS(h)
        S=F.relu(S)
        return E,S
    
    def predictor(self,E,S):
        temp=torch.cat([E,S],1)
        temp=self.pred_f1(temp)
        temp=F.relu(temp)
        out=self.pred_f2(temp)
        out=F.relu(out)
        return(out)

    def decoder(self,E,S):
        temp=torch.cat([E,S],1)
        temp=self.decoder_fl(temp)
        temp=temp.view(-1,3,56,75)
        temp=self.decoder_f2(temp)
        x_rec=self.decoder_f3(temp)
        return(x_rec)
        
    def forward(self,x,pre_E=None,pred_S=None):
        
        if (pre_E is None) or (pred_S is None):
            
            x_in=x
            E_plus,S_plus=self.encoder(x_in)
            E=E_plus
            S=S_plus
            x_rec=self.decoder(E,S)
            S_pred=self.predictor(E,S)
            
        else:
            
            x_pred_rec=self.decoder(pre_E,pred_S)
            x_in=x-x_pred_rec
            E_plus,S_plus=self.encoder(x_in)
            E=pre_E+E_plus
            S=pred_S+S_plus
            x_rec=self.decoder(E,S)
            S_pred=self.predictor(E,S)
        
        return(E,S,S_pred,E_plus,x_rec)
    
#惰性编码预测网络
class inertiaCodingNet(nn.Module):
    def __init__(self,seq_len=10,n_channel=3,w=img_x,h=img_y):
        super(inertiaCodingNet,self).__init__()
        self.seq_len=seq_len
        self.n_channel=n_channel
        self.w=w
        self.h=h
        self.inertiaCell=inertiaCodingCell()
        
        
    
    #input (batch_size,seq_len,n_channel,w,h)
    def forward(self,x):
        b,s_len,c_len,w,h=x.shape
        E=[]
        S=[]
        S_pred=[]
        E_plus=[]
        x_rec=[]
        for i in range(s_len):
            x_t=x[:,i,:,:,:]
            
            if i==0:
                E_t,S_t,S_pred_t,E_plus_t,x_rec_t=self.inertiaCell(x_t)
            else:
                E_t,S_t,S_pred_t,E_plus_t,x_rec_t=self.inertiaCell(x_t,pre_E=E_t,pred_S=S_pred_t)
            
            E.append(E_t)
            S.append(S_t)
            S_pred.append(S_pred_t)
            E_plus.append(E_plus_t)
            x_rec.append(x_rec_t)
            
        E=torch.stack(E).transpose(0,1)
        S=torch.stack(S).transpose(0,1)
        E_plus=torch.stack(E_plus).transpose(0,1)
        x_rec=torch.stack(x_rec).transpose(0,1)
            
        #(batch_size,seq_len,hidden_dim)
        return(E,S,E_plus,x_rec)
        
        
#方差损失

def var_loss(tensor):
    temp=tensor
    means=torch.mean(temp,axis=1).view(tensor.shape[0],1,-1)
    return torch.sum(torch.pow(temp-means,2)) 
        
        
        
        
        
        

        
        
        
        
        
        

In [14]:
#dataset
import os
import numpy as np
from PIL import Image
from torch.utils import data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm

## ------------------- label conversion tools ------------------ ##
def labels2cat(label_encoder, list):
    return label_encoder.transform(list)

def labels2onehot(OneHotEncoder, label_encoder, list):
    return OneHotEncoder.transform(label_encoder.transform(list).reshape(-1, 1)).toarray()

def onehot2labels(label_encoder, y_onehot):
    return label_encoder.inverse_transform(np.where(y_onehot == 1)[1]).tolist()

def cat2labels(label_encoder, y_cat):
    return label_encoder.inverse_transform(y_cat).tolist()


class Dataset_CRNN(data.Dataset):
    "Characterizes a dataset for PyTorch"
    def __init__(self, data_path, frames,labels=None,transform=None,load_all=False):
        "Initialization"
        self.data_path = data_path
        #self.labels = labels
        folders=[]
        for f in os.listdir(data_path):
            if f!='.DS_Store':
                folders.extend(list(map(lambda x:f+'/'+x,os.listdir(data_path+f))))
                
        for i in range(len(folders)-1,-1,-1):
            if '.DS_Store' in folders[i]:
                folders.pop(i)
        
        self.folders = folders
        self.load_all=load_all
        self.transform = transform
        self.frames = frames
        
        if self.load_all:
            temp=[]
            print('loading all images')
            for f in self.folders:
                temp.append(self.read_images(self.data_path, f, self.transform) )
            self.dataset=temp
        
        

    def __len__(self):
        "Denotes the total number of samples"
        return len(self.folders)

    def read_images(self, path, selected_folder, use_transform):
        
        X = []
        for i in self.frames:
            image = Image.open(os.path.join(path, selected_folder, 'frame'+str(i)+'.jpg'))

            if use_transform is not None:
                image = use_transform(image)

            X.append(image)
        X = torch.stack(X, dim=0)

        return X


    def __getitem__(self, index):
        "Generates one sample of data"
        # Select sample
        if not self.load_all:
            folder = self.folders[index]

        # Load data
            X = self.read_images(self.data_path, folder, self.transform)     # (input) spatial images
        #y = torch.LongTensor([self.labels[index]])                  # (labels) LongTensor are for int64 instead of FloatTensor
        else:
            X=self.dataset[index]
        return X

    


In [None]:
%%time
d_path='/Users/lekang/anaconda/tests/Review/Torch/predictiveCoding/ucf101-jpg/'

transform = transforms.Compose([transforms.Resize([240, 320]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])


datatest=Dataset_CRNN(data_path=d_path,
                      frames=list(range(1,21)),
                      transform=transform,
                      load_all=True
                     )

use_cuda=False
batch_size=20

all_data_params = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 12}#, 'pin_memory': True}# if use_cuda else {}
all_data_loader = data.DataLoader(datatest, 
                                  **all_data_params)


loading all images


In [None]:
for i,datai in enumerate(all_data_loader):
    print(datai.shape)


In [97]:
%%time

code_net=inertiaCodingNet()#.to_device(device)

a=torch.tensor(np.random.rand(10*240*320*3*10).reshape(10,10,3,240,320),dtype=torch.float32)#.to_device(device)
E,H,E_plus,x_rec=code_net(a)
print(E.shape)
print(H.shape)
print(E_plus.shape)
print(x_rec.shape)

torch.Size([10, 10, 256])
torch.Size([10, 10, 256])
torch.Size([10, 10, 256])
torch.Size([10, 10, 3, 240, 320])
CPU times: user 39.5 s, sys: 1.29 s, total: 40.8 s
Wall time: 7.85 s


In [29]:
import os
import numpy as np
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import matplotlib.pyplot as plt
from functions import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import pickle

# set path
data_path = "ucf101-jpg/"    # define UCF-101 RGB data path
action_name_path = "./UCF101actions.pkl"
save_model_path = "./checkpoints/"

# use same encoder CNN saved!
CNN_fc_hidden1, CNN_fc_hidden2 = 1024, 768
CNN_embed_dim = 512   # latent dim extracted by 2D CNN
img_x, img_y = 256, 342  # resize video 2d frame size
dropout_p = 0.0       # dropout probability

# use same decoder RNN saved!
RNN_hidden_layers = 3
RNN_hidden_nodes = 512
RNN_FC_dim = 256

# training parameters
k = 101             # number of target category
batch_size = 40
# Select which frame to begin & end in videos
begin_frame, end_frame, skip_frame = 1, 29, 1


with open(action_name_path, 'rb') as f:
    action_names = pickle.load(f)   # load UCF101 actions names

# convert labels -> category
le = LabelEncoder()
le.fit(action_names)

# show how many classes there are
list(le.classes_)

# convert category -> 1-hot
action_category = le.transform(action_names).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(action_category)

# # example
# y = ['HorseRace', 'YoYo', 'WalkingWithDog']
# y_onehot = labels2onehot(enc, le, y)
# y2 = onehot2labels(le, y_onehot)

actions = []
fnames = os.listdir(data_path)

all_names = []
for f in fnames:
    loc1 = f.find('v_')
    loc2 = f.find('_g')
    actions.append(f[(loc1 + 2): loc2])

    all_names.append(f)


# list all data files
all_X_list = all_names              # all video file names
all_y_list = labels2cat(le, actions)    # all video labels

# data loading parameters
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}

transform = transforms.Compose([transforms.Resize([img_x, img_y]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()

# reset data loader
all_data_params = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}
all_data_loader = data.DataLoader(Dataset_CRNN(data_path, all_X_list, all_y_list, selected_frames, transform=transform), **all_data_params)


tensor(434.7973, grad_fn=<SumBackward0>)

In [None]:
import imageio
import torchvision.transforms as transforms
from torchvision.utils import save_image
from pathos.multiprocessing import ProcessingPool as P
import os
import scipy
import scipy.misc


#Image.open(os.path.join(path, selected_folder, 'frame{:06d}.jpg'.format(i))).convert('L')

def convert_v2imag(inpath,outpath):
    files=os.listdir(inpath)
    for f in files:
        
        if not os.path.isdir(outpath):
            os.mkdir(outpath)
        
        outfolder=outpath+'/'+f.split('.')[0]
        os.chdir(outpath)
        
        os.mkdir(outfolder)
        file=imageio.get_reader(inpath+'/'+f, "ffmpeg")
        print(f)
        for i in range(len(list(file))):
            im_array=list(file)[i]
            scipy.misc.imsave(outfolder+'/'+'frame'+str(i)+'.jpg', im_array)

inpath_header='/Users/lekang/anaconda/tests/Review/Torch/predictiveCoding/UCF-101/'
outpath_header='/Users/lekang/anaconda/tests/Review/Torch/predictiveCoding/ucf101-jpg/'

def convert_folder(inpath_h,outpath_h,threads=6):
    
    p=P(threads)
    
    folders=os.listdir(inpath_h)
    
    p.map(lambda x:convert_v2imag(inpath_h+'/'+x,outpath_h+'/'+x),folders)
    
    


convert_folder(inpath_h='/Users/lekang/anaconda/tests/Review/Torch/predictiveCoding/UCF-101/',
               outpath_h='/Users/lekang/anaconda/tests/Review/Torch/predictiveCoding/ucf101-jpg/',threads=12)
        


In [16]:
from PIL import Image
import numpy as np
import torch
#x=np.stack(list(vd)[:100],0).transpose(0,3,1,2).reshape(10,10,3,240,-1)
#x=torch.tensor(x,dtype=torch.float32)
im1=Image.open('/Users/lekang/anaconda/tests/Review/Torch/predictiveCoding/ucf101-jpg/BlowDryHair/v_BlowDryHair_g01_c03/frame3.jpg')
transform = transforms.Compose([transforms.Resize([240, 320]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
#transforms.ToTensor()ba
transform(im1).shape
save_image(transform(im1),'/Users/lekang/Desktop/tewsting.jpg')

In [135]:
from torchvision.utils import save_image
import scipy.misc

scipy.misc.imsave('/Users/lekang/Desktop/testingsave2.jpg', np.array(list(vd)[60]),)

image_output = Image.fromarray(np.array(list(vd)[60].transpose(2,0,1)[0,:,:]))
image_output.save('/Users/lekang/Desktop/testingsave1.jpg')

save_image(torch.tensor(list(vd)[60].transpose(2,0,1),dtype=torch.float32),'/Users/lekang/Desktop/testingsave.jpg')

`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.
  after removing the cwd from sys.path.


In [94]:
%%time
E,H,E_plus,x_rec=code_net(x)


CPU times: user 39.6 s, sys: 673 ms, total: 40.3 s
Wall time: 6.86 s


In [122]:
torch.to

torch.Size([3, 240, 320])

In [113]:

class Dataset_CRNN(data.Dataset):
    "Characterizes a dataset for PyTorch"
    def __init__(self, data_path, folders, labels, frames, transform=None):
        "Initialization"
        self.data_path = data_path
        self.labels = labels
        self.folders = folders
        self.transform = transform
        self.frames = frames

    def __len__(self):
        "Denotes the total number of samples"
        return len(self.folders)

    def read_images(self, path, selected_folder, use_transform):
        X = []
        for i in self.frames:
            image = Image.open(os.path.join(path, selected_folder, 'frame{:06d}.jpg'.format(i)))

            if use_transform is not None:
                image = use_transform(image)

            X.append(image)
        X = torch.stack(X, dim=0)

        return X

    def __getitem__(self, index):
        "Generates one sample of data"
        # Select sample
        folder = self.folders[index]

        # Load data
        X = self.read_images(self.data_path, folder, self.transform)     # (input) spatial images
        y = torch.LongTensor([self.labels[index]])                  # (labels) LongTensor are for int64 instead of FloatTensor

        # print(X.shape)
        return X, y
    
d1=Dataset_CRNN(data_path='/Users/lekang/Downloads/UCF101 - Action Recognition Data Set/UCF-101/',
                folders=['Archery'],
                labels=['Archery'],
                frames=range(29),
                
               )
d1[0]


FileNotFoundError: [Errno 2] No such file or directory: '/Users/lekang/Downloads/UCF101 - Action Recognition Data Set/UCF-101/Archery/frame000000.jpg'

'frame000003.jpg'