In [12]:
# General libraries
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import linear_kernel


# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.data import path as nltk_data_path
from nltk.corpus import wordnet
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Korean NLP library
from konlpy.tag import Okt

# Visualization and miscellaneous
import operator
import statistics

# Configuration for NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk_data_path.append('C:/nltk_data')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mycom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mycom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mycom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mycom\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
import pandas as pd

# 데이터 불러오기
students_books_df = pd.read_csv('data/book_rent.csv')
books_df = pd.read_csv('data/book_list(2).csv')

In [3]:
students_books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   student_id  20000 non-null  int64
 1   book_id     20000 non-null  int64
dtypes: int64(2)
memory usage: 312.6 KB


In [4]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9862 entries, 0 to 9861
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   서지번호    9862 non-null   int64 
 1   서명      9862 non-null   object
 2   저자      9829 non-null   object
 3   발행처     9859 non-null   object
 4   출판년도    9859 non-null   object
 5   ISBN    7894 non-null   object
 6   청구기호    9862 non-null   int64 
 7   언어      9862 non-null   object
 8   서지유형    9862 non-null   object
dtypes: int64(2), object(7)
memory usage: 693.6+ KB


In [5]:
books_df['청구기호'] = books_df['청구기호'].astype(str)

books_df['대분류'] = books_df['청구기호'].str[0]
books_df['소분류'] = books_df['청구기호'].str[:2]

In [6]:
books_df['대분류'] = books_df['대분류'].astype(str)
books_df['소분류'] = books_df['소분류'].astype(str)

books_df = books_df.drop(labels='청구기호',axis=1)

In [7]:
import re  # 정규 표현식 모듈

In [8]:
# 컬럼 이름 변경 및 데이터 병합
books_df = books_df.rename(columns={'서지번호': 'book_id'})
data = students_books_df.merge(books_df, how='inner', on='book_id')

# 책 정보 텍스트 전처리 함수
def preprocess_text_multilingual(text, language='en'):
    if language == 'en':
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(tokens)
    elif language == 'ko':
        text = re.sub(r'[^\uac00-\ud7a3\s]', '', text)
        okt = Okt()
        tokens = okt.morphs(text, stem=True)
        stop_words = set(['은', '는', '이', '가', '를', '에', '의', '도', '으로', '그리고', '하지만', '또는'])
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    else:
        raise ValueError("Unsupported language. Use 'en' for English or 'ko' for Korean.")

In [None]:
# 주요 컬럼 전처리
books_df['저자'] = books_df['저자'].fillna('').astype(str).apply(lambda x: preprocess_text_multilingual(x, language='ko'))
books_df['발행처'] = books_df['발행처'].fillna('').astype(str).apply(lambda x: preprocess_text_multilingual(x, language='ko'))
books_df['출판년도'] = books_df['출판년도'].fillna('').astype(str).apply(lambda x: preprocess_text_multilingual(x, language='ko'))

# 책 속성 결합
books_df['book_features'] = (
    books_df['서명'].fillna('') + ' ' +
    books_df['저자'] + ' ' +
    books_df['발행처'] + ' ' +
    books_df['서지유형'].fillna('') + ' ' +
    books_df['대분류'].fillna('') + ' ' +
    books_df['소분류'].fillna('')
)

# 결합한 데이터에서 결측값 처리
books_df['book_features'] = books_df['book_features'].fillna('')


# TF-IDF 벡터화
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(books_df['book_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Doc2Vec 벡터 변환
documents = [TaggedDocument(words=content.split(), tags=[str(i)]) for i, content in enumerate(books_df['book_features'])]
doc2vec_model = Doc2Vec(documents, vector_size=200, window=5, min_count=2, epochs=40)
content_vectors = np.array([doc2vec_model.dv[str(i)] for i in range(len(books_df['book_features']))])

# TF-IDF와 Doc2Vec 기반 유사도 계산
article_similarity_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)
article_similarity_doc2vec = cosine_similarity(content_vectors)
article_similarity_total = 0.5 * article_similarity_tfidf + 0.5 * article_similarity_doc2vec

# 학생-책 행렬 생성 (유저-아이템 행렬)
students_books_df['interaction'] = 1
user_article_matrix = students_books_df.pivot_table(index='student_id', columns='book_id', values='interaction', fill_value=0).values

In [14]:
# Step 1: book_id 매핑
book_id_mapping = {book_id: idx for idx, book_id in enumerate(students_books_df['book_id'].unique())}
students_books_df['mapped_book_id'] = students_books_df['book_id'].map(book_id_mapping)

# Step 2: article_similarity_total의 크기에 맞춰 book_id_mapping 확장
expected_book_ids = range(article_similarity_total.shape[0])
for book_id in expected_book_ids:
    if book_id not in book_id_mapping:
        book_id_mapping[book_id] = len(book_id_mapping)

# Step 3: 확장된 매트릭스 초기화
num_books_total = article_similarity_total.shape[0]
expanded_matrix = np.zeros((user_article_matrix.shape[0], num_books_total))

# Step 4: user_article_matrix 데이터를 확장된 매트릭스에 복사
for book_id, idx in book_id_mapping.items():
    # 유효한 book_id와 user_article_matrix의 열 범위를 체크
    if idx < user_article_matrix.shape[1] and book_id < expanded_matrix.shape[1]:
        expanded_matrix[:, book_id] = user_article_matrix[:, idx]

# Step 5: user_article_matrix 업데이트 및 크기 검증
user_article_matrix = expanded_matrix
assert user_article_matrix.shape[1] == article_similarity_total.shape[0], "행렬의 열 크기가 여전히 불일치합니다."

# Debugging: 최종 크기 출력
print("user_article_matrix.shape:", user_article_matrix.shape)
print("article_similarity_total.shape:", article_similarity_total.shape)

user_article_matrix.shape: (500, 9862)
article_similarity_total.shape: (9862, 9862)


In [15]:
# 누락된 book_id 확인
missing_books = set(range(article_similarity_total.shape[0])) - set(students_books_df['mapped_book_id'].unique())
print("누락된 book_id:", missing_books)

누락된 book_id: {8656, 8657, 8658, 8659, 8660, 8661, 8662, 8663, 8664, 8665, 8666, 8667, 8668, 8669, 8670, 8671, 8672, 8673, 8674, 8675, 8676, 8677, 8678, 8679, 8680, 8681, 8682, 8683, 8684, 8685, 8686, 8687, 8688, 8689, 8690, 8691, 8692, 8693, 8694, 8695, 8696, 8697, 8698, 8699, 8700, 8701, 8702, 8703, 8704, 8705, 8706, 8707, 8708, 8709, 8710, 8711, 8712, 8713, 8714, 8715, 8716, 8717, 8718, 8719, 8720, 8721, 8722, 8723, 8724, 8725, 8726, 8727, 8728, 8729, 8730, 8731, 8732, 8733, 8734, 8735, 8736, 8737, 8738, 8739, 8740, 8741, 8742, 8743, 8744, 8745, 8746, 8747, 8748, 8749, 8750, 8751, 8752, 8753, 8754, 8755, 8756, 8757, 8758, 8759, 8760, 8761, 8762, 8763, 8764, 8765, 8766, 8767, 8768, 8769, 8770, 8771, 8772, 8773, 8774, 8775, 8776, 8777, 8778, 8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788, 8789, 8790, 8791, 8792, 8793, 8794, 8795, 8796, 8797, 8798, 8799, 8800, 8801, 8802, 8803, 8804, 8805, 8806, 8807, 8808, 8809, 8810, 8811, 8812, 8813, 8814, 8815, 8816, 8817, 8818, 8819, 88

In [16]:
# Step 1: 모든 책 ID를 0부터 len(all_books)-1로 제한
all_books = range(article_similarity_total.shape[0])  # 0부터 9861까지
book_id_mapping = {book_id: idx for idx, book_id in enumerate(all_books)}

# Step 2: 확장된 매트릭스 초기화
expanded_matrix = np.zeros((user_article_matrix.shape[0], len(all_books)))

# Step 3: book_id 범위 내에서 매트릭스 데이터 복사
for i, book_id in enumerate(students_books_df['book_id'].unique()):
    if book_id < len(all_books):  # 범위 확인
        expanded_matrix[:, book_id] = user_article_matrix[:, i]

# Step 4: user_article_matrix 업데이트
user_article_matrix = expanded_matrix

# Debugging: 최종 크기 확인
print("user_article_matrix.shape:", user_article_matrix.shape)
print("article_similarity_total.shape:", article_similarity_total.shape)

user_article_matrix.shape: (500, 9862)
article_similarity_total.shape: (9862, 9862)


In [17]:
print(user_article_matrix.shape)  # (500, book_id 개수)
print(article_similarity_total.shape)  # (book_id 개수, book_id 개수)

(500, 9862)
(9862, 9862)


In [18]:
# 추천 점수 계산
article_predicted_scores = np.dot(user_article_matrix, article_similarity_total)

# 점수 정규화
scaler = StandardScaler()
article_predicted_scores_normalized = scaler.fit_transform(article_predicted_scores)

In [19]:
# 상위 5개의 책 추천
def recommend_books_for_student(student_id, top_n=5):
    # 학생 인덱스 찾기
    student_idx = students_books_df['student_id'].unique().tolist().index(student_id)
    
    # 추천 점수 가져오기
    scores = article_predicted_scores_normalized[student_idx]
    
    # 상위 점수 책 인덱스 추출
    recommended_indices = scores.argsort()[-top_n:][::-1]
    
    # 추천된 책 정보 반환
    recommended_books = books_df.iloc[recommended_indices]
    return recommended_books[['book_id', '저자', '발행처', '출판년도']]

# 예시: 학생 ID가 1인 경우 상위 5개 추천
student_id = 1
recommended_books = recommend_books_for_student(student_id)
print(recommended_books)


      book_id    저자        발행처 출판년도
1265   122126  박 신흥    경 인문 화사     
4298   395521  김덕 진   다 하다 미디어     
5541  1130379   박영준  랜덤하우스 코리아     
7499  4452089             학 지사     
271    331967   정길화    해내다 출판사     


In [20]:
# 개인화된 추천 점수 계산
def personalized_recommend_books(student_id, top_n=5):
    # 학생이 읽은 책 목록 가져오기
    student_books = students_books_df[students_books_df['student_id'] == student_id]['book_id'].tolist()
    
    # 학생이 읽은 책들의 속성 벡터 평균 계산
    read_books_indices = books_df[books_df['book_id'].isin(student_books)].index
    if len(read_books_indices) > 0:
        # 읽은 책들의 유사도 평균 (TF-IDF, Doc2Vec 기반)
        personalized_vector = np.mean(article_similarity_total[read_books_indices], axis=0)
    else:
        # 읽은 책이 없을 경우 전체 평균 사용
        personalized_vector = np.mean(article_similarity_total, axis=0)
    
    # 개인화 점수 기반 추천 계산
    scores = np.dot(personalized_vector, article_similarity_total.T)
    
    # 상위 점수 책 인덱스 추출
    recommended_indices = scores.argsort()[-top_n:][::-1]
    
    # 추천된 책 정보 반환
    recommended_books = books_df.iloc[recommended_indices]
    return recommended_books[['book_id', '저자', '발행처', '출판년도']]

# 예시: 학생 ID가 1인 경우 개인화된 상위 5개 추천
student_id = 1
personalized_books = personalized_recommend_books(student_id)
print(personalized_books)

      book_id   저자     발행처 출판년도
8212  1372728  김현옥      밀레     
1947  1803565  김정희    학 지사     
2566    44285                  
431   1898707  강세황  지식 산업사     
1048   156957         책 세상     


In [21]:
# 추천 함수
def recommend_books(title, books_df, cosine_sim):
    # 입력 서명의 인덱스
    if title not in books_df['서명'].values:
        print("입력한 책 제목이 데이터에 없습니다.")
        return pd.DataFrame()
    
    idx = books_df[books_df['서명'] == title].index[0]
    
    # 유사도 점수 정렬
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:6]
    book_indices = [i[0] for i in sim_scores]
    return df.iloc[book_indices]

In [22]:
# 평가를 위한 데이터 분리
train_df, test_df = train_test_split(students_books_df, test_size=0.2, random_state=42)

In [23]:
# Precision, Recall, F1-Score 계산
def evaluate_recommender_for_student():
    y_true = []
    y_pred = []
    
    for _, row in test_df.iterrows():
        student_id = row['student_id']
        true_book_id = row['book_id']
        
        # 학생에게 추천된 책
        recommended_books = recommend_books_for_student(student_id, top_n=5)['book_id'].values
        
        # 평가: 추천된 책에 실제 책이 포함되어 있는지 여부
        y_true.append(true_book_id in recommended_books)
        y_pred.append(1)  # 추천 시스템이 책을 추천했다고 가정
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# 평가 실행
evaluate_recommender_for_student()

Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Precision, Recall, F1-Score 계산
def personalized_evaluate_recommender():
    y_true = []
    y_pred = []
    
    for _, row in test_df.iterrows():
        student_id = row['student_id']
        true_book_id = row['book_id']
        
        # 학생에게 추천된 책
        recommended_books = personalized_recommend_books(student_id, top_n=5)['book_id'].values
        
        # 평가: 추천된 책에 실제 책이 포함되어 있는지 여부
        y_true.append(true_book_id in recommended_books)
        y_pred.append(1)  # 추천 시스템이 책을 추천했다고 가정
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# 평가 실행
personalized_evaluate_recommender()

Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# Precision, Recall, F1-Score 계산
def recommend_books():
    y_true = []
    y_pred = []
    
    for _, row in test_df.iterrows():
        student_id = row['student_id']
        true_book_id = row['book_id']
        
        # 학생에게 추천된 책
        recommended_books = personalized_recommend_books(student_id, top_n=5)['book_id'].values
        
        # 평가: 추천된 책에 실제 책이 포함되어 있는지 여부
        y_true.append(true_book_id in recommended_books)
        y_pred.append(1)  # 추천 시스템이 책을 추천했다고 가정
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# 평가 실행
personalized_evaluate_recommender()

Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
