In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Content-based analysis

## 데이터 불러오기

In [2]:
# Read the book dataset and preview its first three rows.
books_data = pd.read_csv(filepath_or_buffer='books_with_genres.csv')
books_data.head(n=3)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,genre
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"Young Adult, Fiction, Fantasy, Science Fiction..."
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,"Young Adult, Magic, Childrens, Middle Grade, A..."
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,"Fantasy, Young Adult, Romance, Fiction, Vampir..."


In [3]:
import os

# Read the movie table; the CSV path is built relative to the current working directory.
current_dir = os.getcwd()
data_path = os.path.join(current_dir, "../project_dataset/movies/movie.csv")
movies_data = pd.read_csv(filepath_or_buffer=data_path)

# The tag table is deliberately not merged in: including it exceeded the available memory.
# The disabled load/merge is kept below for reference.
# data_path = os.path.join(current_dir, "../project_dataset/movies/tag.csv")
# movies_tag = pd.read_csv(data_path)
# movies_data = pd.merge(movies_data, movies_tag, on='movieId', how='inner')
movies_data.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Handle missing values: drop every row that has a NaN in any column.
# The frames are rebound instead of mutated in place, and the index is reset so
# that row labels equal row positions. Later cells build TF-IDF matrices and a
# similarity matrix whose rows/columns are positional; index labels left with
# gaps by dropna would otherwise point at the wrong row (or past the end) when
# they are used to look up a row of those matrices.
books_data = books_data.dropna().reset_index(drop=True)
movies_data = movies_data.dropna().reset_index(drop=True)

## content 결합

In [6]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' resource (used for stopword removal in later cells)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rin02\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# English stopwords, stored as a set; the preprocessing functions below filter tokens against it
stop_words = set(stopwords.words('english'))

def preprocess_text(title, genres):
    """Build the normalized content string for one movie.

    The title and the '|'-separated genre list are joined, lower-cased,
    stripped of every character that is not a-z or whitespace, and filtered
    against the module-level ``stop_words`` set. Tag data is not used.

    Args:
        title: Movie title. None/NaN is treated as an empty string.
        genres: Genres separated by '|'. None/NaN is treated as an empty string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    # Missing values must become "" before formatting; otherwise the f-string
    # would inject the literal token "nan" / "none" into the content.
    # pd.isna already covers None, so no separate `is None` check is needed.
    if pd.isna(title):
        title = ""
    if pd.isna(genres):
        genres = ""

    # Lower-case first, then turn the '|' genre separator into a space so each
    # genre becomes its own token.
    text = f"{title} {genres}".lower().replace('|', ' ')

    # Drop digits and punctuation (characters are deleted, not replaced by a space).
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove stopwords and collapse whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Build the 'content' column for movies from each row's title and genres.
movies_data['content'] = [
    preprocess_text(title, genres)
    for title, genres in zip(movies_data['title'], movies_data['genres'])
]

In [8]:
def preprocess_book_text(title, authors, genres):
    """Build the normalized content string for one book.

    Title, authors and the comma-separated genre list are joined, lower-cased,
    stripped of every character that is not a-z or whitespace, and filtered
    against the module-level ``stop_words`` set.

    Args:
        title: Book title. None/NaN is treated as an empty string.
        authors: Author name(s). None/NaN is treated as an empty string.
        genres: Genres separated by commas. None/NaN is treated as an empty string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    # Missing values must become "" before formatting; otherwise the f-string
    # would inject the literal token "nan" / "none" into the content.
    # pd.isna already covers None, so no separate `is None` check is needed.
    if pd.isna(title):
        title = ""
    if pd.isna(authors):
        authors = ""
    if pd.isna(genres):
        genres = ""

    # Commas in the genre list become spaces so that "a,b" still yields two
    # tokens once punctuation is deleted below.
    genres = genres.replace(',', ' ')

    text = f"{title} {authors} {genres}".lower()

    # Drop digits and punctuation (characters are deleted, not replaced by a space).
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove stopwords and collapse whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Build the 'content' column for books from each row's title, authors and genre.
books_data['content'] = [
    preprocess_book_text(title, authors, genre)
    for title, authors, genre in zip(books_data['title'], books_data['authors'], books_data['genre'])
]

### TF-IDF Vectorization

In [9]:
# TF-IDF features for movies and books, produced by one shared vectorizer.
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the movie corpus only; the book corpus is then
# transformed with that already-fitted vectorizer, so both matrices share the
# same columns.
movie_tfidf = vectorizer.fit_transform(raw_documents=movies_data['content'])
book_tfidf = vectorizer.transform(raw_documents=books_data['content'].fillna(''))

# Sanity check of the matrix shapes: (number of documents, number of terms).
print(movie_tfidf.shape, book_tfidf.shape, sep='\n')

(27278, 21947)
(7643, 21947)


### Calculate cosine similarity

In [10]:
# Cosine similarity between the movie TF-IDF matrix and the book TF-IDF matrix.
# Later code indexes the result as cosine_sim[movie_row] to get one score per book.
# NOTE(review): the result is presumably a dense array; at the shapes printed above
# (27278 x 7643) that would be roughly 1.7 GB as float64 — confirm memory headroom.
cosine_sim = cosine_similarity(movie_tfidf, book_tfidf)

### Implementing the recommendation function
추천함수 구현

In [11]:
def recommend_books_for_movie_by_name(movie_name, num_recommendations=5):
    """Recommend the books most similar to the first movie matching a name.

    The movie is located by a case-insensitive, literal substring match on
    ``movies_data['title']``; when several titles match, the first one in row
    order is used. Books are ranked by that movie's row of ``cosine_sim``.

    Args:
        movie_name: Text to search for in the movie titles. It is matched
            literally, so characters such as '(', ')', '*' or '?' have no
            special meaning.
        num_recommendations: Number of book titles to return.

    Returns:
        A pandas Series with the titles of the top-ranked books, or a message
        string when no movie title contains ``movie_name``.
    """
    # regex=False: titles routinely contain regex metacharacters
    # (e.g. "Toy Story (1995)"), which must be matched as plain text.
    is_match = movies_data['title'].str.contains(
        movie_name, case=False, na=False, regex=False
    )

    # Work with row positions, not index labels: cosine_sim is positional, and
    # the labels of movies_data no longer equal positions once rows were dropped.
    match_positions = is_match.to_numpy(dtype=bool).nonzero()[0]

    if len(match_positions) == 0:
        return f"Cannot find the movie '{movie_name}'."

    movie_pos = match_positions[0]  # only the first matching movie is used

    # Similarity of this movie to every book.
    scores = cosine_sim[movie_pos]

    # Descending order of similarity; the stable sort keeps ties in book order.
    ranked_books = (-scores).argsort(kind='stable')
    book_indices = ranked_books[:num_recommendations]

    # book_indices are positions into books_data, hence iloc.
    return books_data['title'].iloc[book_indices]

In [12]:
# Example: look up a movie by (partial) title and print the recommended book titles.
movie_name = "World of Apu"
print(f"Books similar to the movie '{movie_name}':")
recommended_books = recommend_books_for_movie_by_name(movie_name)
print(recommended_books)

Books similar to the movie 'World of Apu':
1804               Night World, No. 3 (Night World, #7-9)
1080         Winter of the World (The Century Trilogy #2)
3674      Genghis Khan and the Making of the Modern World
143     Unbroken: A World War II Story of Survival, Re...
2049               Night World, No. 2 (Night World, #4-6)
Name: title, dtype: object
