# 구글 드라이브 마운트

In [1]:
from google.colab import drive

# 구글 드라이브와 연결
drive.mount('/content/drive')

mount_directory = "drive/MyDrive"

Mounted at /content/drive


# 기본 라이브러리 import

In [2]:
import pandas as pd
import numpy as np
import requests
import re
import os

from tqdm import tqdm

# 데이터셋 불러오기(MovieLens)

In [3]:
# 데이터 크기 설정 ex)latest-small, 32m, ...
dataset_volume = "32m"

# CSV 데이터 로드
df_ratings = pd.read_csv(f"{mount_directory}/ml-{dataset_volume}/ratings.csv")
df_movies = pd.read_csv(f"{mount_directory}/ml-{dataset_volume}/movies.csv")
df_tags = pd.read_csv(f"{mount_directory}/ml-{dataset_volume}/tags.csv")

In [4]:
# Dataset의 User, Movie 수 확인
n_users = df_ratings.userId.unique().shape[0]
n_items = df_ratings.movieId.unique().shape[0]
print("num users: {}, num items:{}".format(n_users, n_items))

num users: 200948, num items:84432


In [5]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama
87581,292737,Shelter in Solitude (2023),Comedy|Drama
87582,292753,Orca (2023),Drama
87583,292755,The Angry Breed (1968),Drama


# 데이터 전처리

In [6]:
# ratings의 timestamp 제거
df_ratings.drop(['timestamp'], axis=1, inplace=True)

In [7]:
# 데이터 전처리
# user id, movie id의 범위를 (0 ~ 사용자 수 -1), (0 ~ 영화 수 -1) 사이로 맞춰줌.

from scipy.sparse import coo_matrix

# user_id와 movie_id를 고유 인덱스로 매핑
user_dict = {user_id: idx for idx, user_id in enumerate(df_ratings['userId'].unique())}
movie_dict = {movie_id: idx for idx, movie_id in enumerate(df_ratings['movieId'].unique())}

# user_idx와 movie_idx로 새로운 컬럼 추가
df_ratings['user_idx'] = df_ratings['userId'].map(user_dict)
df_ratings['movie_idx'] = df_ratings['movieId'].map(movie_dict)

# 희소 행렬 생성
ratings_sparse = coo_matrix(
    (df_ratings['rating'], (df_ratings['user_idx'], df_ratings['movie_idx'])),
    shape=(len(user_dict), len(movie_dict))
)

# 역 매핑 딕셔너리 생성
user_idx_to_id = {v: k for k, v in user_dict.items()}
movie_idx_to_id = {v: k for k, v in movie_dict.items()}

In [8]:
# movie의 index로 부터 title과 genre 값으로 변환을 위한 dict 생성
movie_idx_to_name=dict()
movie_idx_to_genre=dict()
for row in df_movies.itertuples(index=False):
    movie_id, movie_name, movie_genre = row
    if movie_id not in movie_dict:              # 어떤 영화가 rating data에 없는 경우 skip
        continue
    movie_idx_to_name[movie_dict[movie_id]] = movie_name
    movie_idx_to_genre[movie_dict[movie_id]] = movie_genre

df_movies['genres'] = df_movies['genres'].apply(lambda x : x.split('|')).apply(lambda x : " ".join(x))

In [9]:
# 마지막 괄호 안의 연도를 추출하는 함수
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)  # 문자열 끝에 있는 괄호 안의 숫자 찾기
    return int(match.group(1)) if match else None

In [10]:
# title에서 제목과 년도를 분리
df_movies['year'] = df_movies['title'].apply(extract_year)

# title에서 "( )"를 포함한 부연 설명 부분 제거
df_movies['title'] = df_movies['title'].str.split(" \(").str.get(0)

In [11]:
df_movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995.0
1,2,Jumanji,Adventure Children Fantasy,1995.0
2,3,Grumpier Old Men,Comedy Romance,1995.0
3,4,Waiting to Exhale,Comedy Drama Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0
...,...,...,...,...
87580,292731,The Monroy Affaire,Drama,2022.0
87581,292737,Shelter in Solitude,Comedy Drama,2023.0
87582,292753,Orca,Drama,2023.0
87583,292755,The Angry Breed,Drama,1968.0


# TMDB api를 통한 웹크롤링

감독, 배우 3명의 정보를 TMDB에 대한 api 요청을 통해 크롤링하여 Dataframe에 저장

이를 위해선 TMDB 홈페이지에 회원가입 및 api key를 할당 받아야 한다.

https://www.themoviedb.org/settings/api

In [12]:
# TMDb API 키 설정
API_KEY = "f2e3dc4096954042cfd1c618f01883ce"
BASE_URL = "https://api.themoviedb.org/3"

In [13]:
# 영화 제목으로 검색하여 감독과 배우 정보를 가져오는 함수
def get_movie_credits(title, year):
    # 1. 영화 ID 검색
    search_url = f"{BASE_URL}/search/movie"
    params = {"api_key": API_KEY, "query": title}
    if not np.isnan(year): # year 값이 없으면 title로만 요청
        params["year"] = int(year)

    response = requests.get(search_url, params=params)
    if response.status_code != 200 or not response.json()['results']:
        return None, None  # 데이터가 없으면 None 반환

    movie_id = response.json()['results'][0]['id']

    # 2. 영화 크레딧(감독/배우) 가져오기
    credits_url = f"{BASE_URL}/movie/{movie_id}/credits"
    credits_response = requests.get(credits_url, params={"api_key": API_KEY})
    if credits_response.status_code != 200:
        return None, None

    credits = credits_response.json()

    # 감독 정보 추출
    director = next((person['name'].replace(" ", "") for person in credits['crew'] if person['job'] == 'Director'), None)

    # 배우 정보 추출 (상위 3명)
    actors = " ".join(person['name'].replace(" ", "") for person in credits['cast'][:3])

    return director, actors

In [14]:
# 체크포인트 파일 경로 설정
save_folder = 'movies_processed_data'
save_file_name = f'movies_checkpoint.csv'

checkpoint_file = f"{mount_directory}/{save_folder}/{save_file_name}"

# 폴더 경로 생성
full_save_path = f"{mount_directory}/{save_folder}"

# 폴더가 없는 경우 생성
if not os.path.exists(full_save_path):
    os.makedirs(full_save_path)
    print(f"폴더를 생성했습니다: {full_save_path}")

# 체크포인트 데이터 로드
if os.path.exists(checkpoint_file):
    print(f"체크포인트 파일 로드 중: {checkpoint_file}")
    df_movies = pd.read_csv(checkpoint_file)
    print("체크포인트에서 이어서 작업을 진행합니다.")
else:
    print("새로운 작업을 시작합니다.")
    # df_movies 초기화: 원본 DataFrame을 로드하세요.
    # 예: df_movies = pd.read_csv('/path/to/original/movies.csv')

    # 'director'와 'actors' 열이 없으면 추가
    if 'director' not in df_movies.columns:
        df_movies['director'] = None
    if 'actors' not in df_movies.columns:
        df_movies['actors'] = None

        # 'check' 열 추가: 크롤링이 완료된 영화에는 'done' 값을 설정
    if 'check' not in df_movies.columns:
        df_movies['check'] = None

체크포인트 파일 로드 중: drive/MyDrive/movies_processed_data/movies_checkpoint.csv
체크포인트에서 이어서 작업을 진행합니다.


In [15]:
# 크롤링 작업
batch_size = 1000
total_batches = (len(df_movies) + batch_size - 1) // batch_size  # 정확히 1000개 단위로 분할

for batch_num in range(total_batches):
    start_idx = batch_num * batch_size
    end_idx = min((batch_num + 1) * batch_size, len(df_movies))

    # 처리해야 할 데이터 필터링: 'check'가 None인 데이터만 처리
    batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]

    if batch_to_process.empty:
        print(f"Batch {batch_num + 1}/{total_batches} 스킵 (이미 처리됨)")
        continue

    print(f"Batch {batch_num + 1}/{total_batches} 진행 중 ({start_idx} ~ {end_idx - 1})")

    for idx, row in tqdm(batch_to_process.iterrows(), total=batch_to_process.shape[0]):
        try:
            # 감독, 배우 정보 크롤링
            director, actor_list = get_movie_credits(row['title'], row['year'])
            df_movies.at[idx, 'director'] = director
            df_movies.at[idx, 'actors'] = actor_list
            df_movies.at[idx, 'check'] = 'done'  # 크롤링 완료된 항목은 'done'으로 표시
        except Exception as e:
            print(f"오류 발생 (index={idx}): {e}")
            df_movies.at[idx, 'check'] = 'error'  # 오류가 발생하면 'error'로 표시

    # 체크포인트 저장
    df_movies.to_csv(checkpoint_file, index=False, encoding='utf-8')
    print(f"Batch {batch_num + 1} 완료: 체크포인트 저장됨 ({checkpoint_file})")

print("모든 작업 완료!")

  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]

Batch 1/88 스킵 (이미 처리됨)
Batch 2/88 스킵 (이미 처리됨)
Batch 3/88 스킵 (이미 처리됨)
Batch 4/88 스킵 (이미 처리됨)
Batch 5/88 스킵 (이미 처리됨)
Batch 6/88 스킵 (이미 처리됨)
Batch 7/88 스킵 (이미 처리됨)
Batch 8/88 스킵 (이미 처리됨)
Batch 9/88 스킵 (이미 처리됨)
Batch 10/88 스킵 (이미 처리됨)
Batch 11/88 스킵 (이미 처리됨)
Batch 12/88 스킵 (이미 처리됨)
Batch 13/88 스킵 (이미 처리됨)
Batch 14/88 스킵 (이미 처리됨)
Batch 15/88 스킵 (이미 처리됨)
Batch 16/88 스킵 (이미 처리됨)
Batch 17/88 스킵 (이미 처리됨)
Batch 18/88 스킵 (이미 처리됨)
Batch 19/88 스킵 (이미 처리됨)
Batch 20/88 스킵 (이미 처리됨)
Batch 21/88 스킵 (이미 처리됨)
Batch 22/88 스킵 (이미 처리됨)
Batch 23/88 스킵 (이미 처리됨)
Batch 24/88 스킵 (이미 처리됨)
Batch 25/88 스킵 (이미 처리됨)
Batch 26/88 진행 중 (25000 ~ 25999)


100%|██████████| 1000/1000 [05:06<00:00,  3.26it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 26 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 27/88 진행 중 (26000 ~ 26999)


100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 27 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 28/88 진행 중 (27000 ~ 27999)


100%|██████████| 1000/1000 [05:02<00:00,  3.31it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 28 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 29/88 진행 중 (28000 ~ 28999)


100%|██████████| 1000/1000 [04:56<00:00,  3.37it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 29 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 30/88 진행 중 (29000 ~ 29999)


100%|██████████| 1000/1000 [04:56<00:00,  3.37it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 30 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 31/88 진행 중 (30000 ~ 30999)


100%|██████████| 1000/1000 [05:10<00:00,  3.22it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 31 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 32/88 진행 중 (31000 ~ 31999)


100%|██████████| 1000/1000 [04:56<00:00,  3.38it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 32 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 33/88 진행 중 (32000 ~ 32999)


100%|██████████| 1000/1000 [05:00<00:00,  3.33it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 33 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 34/88 진행 중 (33000 ~ 33999)


100%|██████████| 1000/1000 [04:56<00:00,  3.37it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 34 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 35/88 진행 중 (34000 ~ 34999)


100%|██████████| 1000/1000 [04:56<00:00,  3.37it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 35 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 36/88 진행 중 (35000 ~ 35999)


100%|██████████| 1000/1000 [04:55<00:00,  3.39it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 36 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 37/88 진행 중 (36000 ~ 36999)


100%|██████████| 1000/1000 [04:57<00:00,  3.36it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 37 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 38/88 진행 중 (37000 ~ 37999)


100%|██████████| 1000/1000 [04:56<00:00,  3.37it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 38 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 39/88 진행 중 (38000 ~ 38999)


100%|██████████| 1000/1000 [05:11<00:00,  3.21it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 39 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 40/88 진행 중 (39000 ~ 39999)


100%|██████████| 1000/1000 [05:04<00:00,  3.28it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 40 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 41/88 진행 중 (40000 ~ 40999)


100%|██████████| 1000/1000 [05:08<00:00,  3.24it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 41 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 42/88 진행 중 (41000 ~ 41999)


100%|██████████| 1000/1000 [05:13<00:00,  3.19it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 42 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 43/88 진행 중 (42000 ~ 42999)


100%|██████████| 1000/1000 [05:15<00:00,  3.17it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 43 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 44/88 진행 중 (43000 ~ 43999)


100%|██████████| 1000/1000 [04:59<00:00,  3.33it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 44 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 45/88 진행 중 (44000 ~ 44999)


100%|██████████| 1000/1000 [04:59<00:00,  3.34it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 45 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 46/88 진행 중 (45000 ~ 45999)


100%|██████████| 1000/1000 [04:59<00:00,  3.34it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 46 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 47/88 진행 중 (46000 ~ 46999)


100%|██████████| 1000/1000 [06:12<00:00,  2.68it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 47 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 48/88 진행 중 (47000 ~ 47999)


100%|██████████| 1000/1000 [05:15<00:00,  3.17it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 48 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 49/88 진행 중 (48000 ~ 48999)


100%|██████████| 1000/1000 [05:02<00:00,  3.31it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 49 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 50/88 진행 중 (49000 ~ 49999)


100%|██████████| 1000/1000 [05:10<00:00,  3.22it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 50 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 51/88 진행 중 (50000 ~ 50999)


100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 51 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 52/88 진행 중 (51000 ~ 51999)


100%|██████████| 1000/1000 [04:53<00:00,  3.41it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 52 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 53/88 진행 중 (52000 ~ 52999)


100%|██████████| 1000/1000 [05:04<00:00,  3.28it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 53 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 54/88 진행 중 (53000 ~ 53999)


100%|██████████| 1000/1000 [05:10<00:00,  3.22it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 54 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 55/88 진행 중 (54000 ~ 54999)


100%|██████████| 1000/1000 [05:14<00:00,  3.17it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 55 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 56/88 진행 중 (55000 ~ 55999)


100%|██████████| 1000/1000 [05:11<00:00,  3.21it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 56 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 57/88 진행 중 (56000 ~ 56999)


100%|██████████| 1000/1000 [04:59<00:00,  3.33it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 57 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 58/88 진행 중 (57000 ~ 57999)


100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 58 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 59/88 진행 중 (58000 ~ 58999)


100%|██████████| 1000/1000 [05:07<00:00,  3.25it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 59 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 60/88 진행 중 (59000 ~ 59999)


100%|██████████| 1000/1000 [05:15<00:00,  3.17it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 60 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 61/88 진행 중 (60000 ~ 60999)


100%|██████████| 1000/1000 [05:25<00:00,  3.07it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 61 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 62/88 진행 중 (61000 ~ 61999)


100%|██████████| 1000/1000 [05:11<00:00,  3.21it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 62 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 63/88 진행 중 (62000 ~ 62999)


100%|██████████| 1000/1000 [04:51<00:00,  3.43it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 63 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 64/88 진행 중 (63000 ~ 63999)


100%|██████████| 1000/1000 [05:10<00:00,  3.22it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 64 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 65/88 진행 중 (64000 ~ 64999)


100%|██████████| 1000/1000 [05:07<00:00,  3.26it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 65 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 66/88 진행 중 (65000 ~ 65999)


100%|██████████| 1000/1000 [05:05<00:00,  3.28it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 66 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 67/88 진행 중 (66000 ~ 66999)


100%|██████████| 1000/1000 [05:18<00:00,  3.14it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 67 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 68/88 진행 중 (67000 ~ 67999)


100%|██████████| 1000/1000 [04:53<00:00,  3.41it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 68 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 69/88 진행 중 (68000 ~ 68999)


100%|██████████| 1000/1000 [05:02<00:00,  3.31it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 69 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 70/88 진행 중 (69000 ~ 69999)


100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 70 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 71/88 진행 중 (70000 ~ 70999)


100%|██████████| 1000/1000 [04:50<00:00,  3.44it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 71 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 72/88 진행 중 (71000 ~ 71999)


100%|██████████| 1000/1000 [05:00<00:00,  3.33it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 72 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 73/88 진행 중 (72000 ~ 72999)


100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 73 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 74/88 진행 중 (73000 ~ 73999)


100%|██████████| 1000/1000 [05:03<00:00,  3.29it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 74 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 75/88 진행 중 (74000 ~ 74999)


100%|██████████| 1000/1000 [05:06<00:00,  3.26it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 75 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 76/88 진행 중 (75000 ~ 75999)


100%|██████████| 1000/1000 [04:58<00:00,  3.35it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 76 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 77/88 진행 중 (76000 ~ 76999)


100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 77 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 78/88 진행 중 (77000 ~ 77999)


100%|██████████| 1000/1000 [05:35<00:00,  2.98it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 78 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 79/88 진행 중 (78000 ~ 78999)


100%|██████████| 1000/1000 [05:16<00:00,  3.16it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 79 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 80/88 진행 중 (79000 ~ 79999)


100%|██████████| 1000/1000 [05:15<00:00,  3.17it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 80 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 81/88 진행 중 (80000 ~ 80999)


100%|██████████| 1000/1000 [05:07<00:00,  3.25it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 81 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 82/88 진행 중 (81000 ~ 81999)


100%|██████████| 1000/1000 [05:32<00:00,  3.01it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 82 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 83/88 진행 중 (82000 ~ 82999)


100%|██████████| 1000/1000 [05:08<00:00,  3.24it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 83 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 84/88 진행 중 (83000 ~ 83999)


100%|██████████| 1000/1000 [05:17<00:00,  3.15it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 84 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 85/88 진행 중 (84000 ~ 84999)


100%|██████████| 1000/1000 [05:06<00:00,  3.26it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 85 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 86/88 진행 중 (85000 ~ 85999)


100%|██████████| 1000/1000 [05:08<00:00,  3.24it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 86 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 87/88 진행 중 (86000 ~ 86999)


100%|██████████| 1000/1000 [05:22<00:00,  3.10it/s]
  batch_to_process = df_movies[start_idx:end_idx][df_movies['check'].isna()]


Batch 87 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
Batch 88/88 진행 중 (87000 ~ 87584)


100%|██████████| 585/585 [03:05<00:00,  3.16it/s]


Batch 88 완료: 체크포인트 저장됨 (drive/MyDrive/movies_processed_data/movies_checkpoint.csv)
모든 작업 완료!


In [18]:
df_movies.drop(['check'], axis=1, inplace=True)
df_movies

Unnamed: 0,movieId,title,genres,year,director,actors
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995.0,JohnLasseter,TomHanks TimAllen DonRickles
1,2,Jumanji,Adventure Children Fantasy,1995.0,JoeJohnston,RobinWilliams KirstenDunst BradleyPierce
2,3,Grumpier Old Men,Comedy Romance,1995.0,HowardDeutch,WalterMatthau JackLemmon Ann-Margret
3,4,Waiting to Exhale,Comedy Drama Romance,1995.0,ForestWhitaker,WhitneyHouston AngelaBassett LorettaDevine
4,5,Father of the Bride Part II,Comedy,1995.0,CharlesShyer,SteveMartin DianeKeaton MartinShort
...,...,...,...,...,...,...
87580,292731,The Monroy Affaire,Drama,2022.0,JosuéMéndez,DamiánAlcázar GrapaPaola MaríaZubiri
87581,292737,Shelter in Solitude,Comedy Drama,2023.0,VibekeMuasya,SiobhanFallonHogan PeterMacon RobertPatrick
87582,292753,Orca,Drama,2023.0,SaharMosayebi,TaranehAlidoosti MahtabKeramati MasoudKaramati
87583,292755,The Angry Breed,Drama,1968.0,DavidCommons,JanSterling JamesMacArthur WilliamWindom


# 크롤링한 데이터 csv 형식으로 Google Drive에 저장

In [19]:
# Google Drive 저장 경로 설정
save_folder = 'movies_processed_data'
save_file_name = f'movies_processed_{dataset_volume}.csv'

# 폴더 경로 생성
full_save_path = f"{mount_directory}/{save_folder}"

# 폴더가 없는 경우 생성
if not os.path.exists(full_save_path):
    os.makedirs(full_save_path)
    print(f"폴더를 생성했습니다: {full_save_path}")

# DataFrame을 CSV로 저장
df_movies.to_csv(f"{full_save_path}/{save_file_name}", index=False, encoding='utf-8')

print(f"CSV 파일이 저장되었습니다: {full_save_path}/{save_file_name}")

CSV 파일이 저장되었습니다: drive/MyDrive/movies_processed_data/movies_processed_32m.csv
