<a href="https://colab.research.google.com/github/donghuna/AI-Expert/blob/main/timesformer/TimeSformer-huggingface-example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install av

Collecting av
  Downloading av-12.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-12.1.0


In [13]:
import json
import os
import random
from ftplib import FTP
import io
import numpy as np
import av
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.utils import save_image
from torchvision import transforms
from transformers import TimesformerForVideoClassification, get_linear_schedule_with_warmup, AdamW

# FTP 서버 정보
ftp_server = "121.136.96.223"
ftp_port = 21
ftp_user = "donghuna_ftp"
ftp_password = "Dlehdgns0892!@!?n"
folder_path = "homes/donghuna/database/Diving48_rgb/rgb/"

# FTP 연결 설정
ftp = FTP()
ftp.connect(ftp_server, ftp_port)
ftp.login(user=ftp_user, passwd=ftp_password)
ftp.set_pasv(True)

# 동영상 데이터셋 경로
train_json_path = "Diving48_V2_train.json"
test_json_path = "Diving48_V2_test.json"

with open(train_json_path, 'wb') as local_file:
    ftp.retrbinary(f'RETR {"homes/donghuna/database/Diving48_rgb/Diving48_V2_train.json"}', local_file.write)

with open(test_json_path, 'wb') as local_file:
    ftp.retrbinary(f'RETR {"homes/donghuna/database/Diving48_rgb/Diving48_V2_test.json"}', local_file.write)

# 동영상 데이터를 읽어오기 위한 함수
def read_video_from_ftp(ftp, file_path, start_frame, end_frame):
    video_data = io.BytesIO()
    ftp.retrbinary(f'RETR {file_path}', video_data.write)
    video_data.seek(0)
    container = av.open(video_data, format='mp4')
    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_frame:
            break
        if i >= start_frame:
            frame_np = frame.to_ndarray(format="rgb24")
            frames.append(frame_np.astype(np.uint8))
    return np.stack(frames, axis=0)

def sample_frames(frames, num_frames):
    total_frames = len(frames)
    sampled_frames = list(frames)
    if total_frames <= num_frames:
        # sampled_frames = frames
        if total_frames < num_frames:
            padding = [np.zeros_like(frames[0]) for _ in range(num_frames - total_frames)]
            sampled_frames.extend(padding)
            # sampled_frames = np.concatenate([sampled_frames, padding], axis=0)
    else:
        indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
        sampled_frames = [frames[i] for i in indices]

    return np.array(sampled_frames)

# 변환 함수 정의
def pad_and_resize(frames, target_size):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(target_size),
        transforms.ToTensor()
    ])
    processed_frames = [transform(frame) for frame in frames]

    return torch.stack(processed_frames)

def read_and_process_video(ftp, file_path, start_frame, end_frame, target_size, num_frames):
    frames = read_video_from_ftp(ftp, file_path, start_frame, end_frame)
    frames = sample_frames(frames, num_frames=num_frames)
    processed_frames = pad_and_resize(frames, target_size=target_size)
    processed_frames = processed_frames.permute(1, 0, 2, 3)  # (T, C, H, W) -> (C, T, H, W)

    return processed_frames

# Diving48 데이터셋 클래스 정의
class Diving48Dataset(Dataset):
    def __init__(self, json_path, ftp, folder_path, target_size=(224, 224), num_frames=48):
        with open(json_path, 'r') as f:
            self.data = json.load(f)
        self.ftp = ftp
        self.folder_path = folder_path
        self.target_size = target_size
        self.num_frames = num_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        vid_info = self.data[idx]
        vid_name = vid_info['vid_name']
        label = vid_info['label']
        start_frame = vid_info['start_frame']
        end_frame = vid_info['end_frame']
        file_path = os.path.join(self.folder_path, f"{vid_name}.mp4")

        video = read_and_process_video(self.ftp, file_path, start_frame, end_frame, target_size=self.target_size, num_frames=self.num_frames)

        return video, label

# 모델 로드
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

# 데이터셋 및 데이터로더 생성
full_train_dataset = Diving48Dataset(train_json_path, ftp, folder_path)

# Train-validation split
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

test_dataset = Diving48Dataset(test_json_path, ftp, folder_path)


def collate_fn(batch):
    videos, labels = zip(*batch)
    videos = torch.stack(videos)
    videos = videos.permute(0, 2, 1, 3, 4)  # (B, T, C, H, W) -> (B, C, T, H, W)
    labels = torch.tensor(labels)
    return videos, labels

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)



In [3]:
from google.colab import drive
drive.mount('/content/drive')

# # 모델 저장
# # torch.save(model.state_dict(), '/content/drive/MyDrive/timesformer_weight/model_epoch_{epoch+1}.pt')

# # 변환된 동영상 저장 함수
# def save_transformed_video(video_tensor, filename):
#     # (C, T, H, W) -> (T, C, H, W)
#     # video_tensor = video_tensor.permute(1, 0, 2, 3)
#     for i, frame in enumerate(video_tensor):
#         save_image(frame, f"{filename}_frame_{i}.png")

# # 변환된 동영상 저장 (테스트 데이터셋의 첫 번째 비디오)
# video, label = train_dataset[1]
# save_transformed_video(video, '/content/drive/MyDrive/transfomed_video/transformed_video')

# ftp.quit()

Mounted at /content/drive


In [15]:
# 학습 루프 구현
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# 손실 함수 정의
loss_fn = torch.nn.CrossEntropyLoss()

# 학습 루프
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    loop_time = 0

    for batch in train_loader:
        videos, labels = batch
        videos = videos.to(device)
        labels = labels.to(device)

        outputs = model(videos)
        loss = loss_fn(outputs.logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # 학습 정확도 계산
        _, predicted = torch.max(outputs.logits, 1)
        correct_train += (predicted == labels).sum().item()
        total_train += labels.size(0)

        loop_time += 1
        if loop_time % 10 == 0:
            break

    avg_loss = total_loss / len(train_loader)
    train_accuracy = correct_train / total_train
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")


    # validation 수행
    model.eval()
    correct_val = 0
    total_val = 0
    val_loss = 0

    validate_dim_lengthloop_time = 0

    with torch.no_grad():
        for batch in val_loader:
            videos, labels = batch
            videos = videos.to(device)
            labels = labels.to(device)

            outputs = model(videos)
            loss = loss_fn(outputs.logits, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            correct_val += (predicted == labels).sum().item()
            total_val += labels.size(0)

            validate_dim_lengthloop_time += 1
            if validate_dim_lengthloop_time % 10 == 0:
                break

    val_loss /= len(val_loader)
    val_accuracy = correct_val / total_val
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # 모델 파라미터 저장
    torch.save(model.state_dict(), f'/content/drive/MyDrive/timesformer_weight/model_epoch_{epoch+1}.pt')


# 평가 루프
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_loader:
        videos, labels = batch
        videos = videos.to(device)
        labels = labels.to(device)

        outputs = model(videos)
        _, predicted = torch.max(outputs.logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")

# 연결 종료
ftp.quit()




Epoch [1/5], Loss: 0.0059, Train Accuracy: 0.0000
Validation Loss: 0.0230, Validation Accuracy: 0.0000
Epoch [2/5], Loss: 0.0051, Train Accuracy: 0.1000
Validation Loss: 0.0181, Validation Accuracy: 0.1000
Epoch [3/5], Loss: 0.0049, Train Accuracy: 0.1000
Validation Loss: 0.0168, Validation Accuracy: 0.2000
Epoch [4/5], Loss: 0.0047, Train Accuracy: 0.0000
Validation Loss: 0.0165, Validation Accuracy: 0.1000
Epoch [5/5], Loss: 0.0044, Train Accuracy: 0.0000
Validation Loss: 0.0157, Validation Accuracy: 0.0000


KeyboardInterrupt: 

In [None]:
!ls