## Step 1. Load the Data

In [1]:
import pandas as pd

file_path = r'C:\Users\user\Downloads\2024 솔챌\유투브영상수집.xlsx'
df = pd.read_excel(file_path)


In [2]:
#pip install openpyxl

## Step 2. Define Input Processing

In [3]:
# List of all emotion and event columns in the dataset
emotion_columns = ['angry', 'sad', 'anxious', 'hurt', 'embarrassed', 'happy']
event_columns = ['relationship', 'breakup', 'family', 'work', 'studying', 'insomnia', 'people', 'health', 'self-esteem', 'money', 'friends']

# Function to concatenate applicable tags for each video
def create_tags(row):
    tags = []
    for col in emotion_columns + event_columns:
        if row[col] == 1:  
            tags.append(col)
    return ' '.join(tags) 

df['tags'] = df.apply(create_tags, axis=1)

## Step 3. Implement the Recommendation Logic

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [19]:
# 영상에 클러스터 라벨링 
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['tags'])
kmeans = KMeans(n_clusters=6)
kmeans.fit(X)
df['cluster'] = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [8]:
df['cluster']

0     5
1     5
2     5
3     5
4     5
     ..
84    4
85    4
86    4
87    4
88    3
Name: cluster, Length: 89, dtype: int32

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

# 입력 태그를 벡터화
def vectorize_input_tags(input_tags, vectorizer): 

    return vectorizer.transform([" ".join(input_tags)])

# 입력 벡터와 각 클러스터 중심과의 유클리드 거리 계산
def calculate_cluster_distances(input_vec, cluster_centers):

    distances = euclidean_distances(input_vec, cluster_centers)
    return distances.flatten()

def recommend_clusters(input_tags, vectorizer, kmeans):

    # 입력 태그 벡터화
    input_vec = vectorize_input_tags(input_tags, vectorizer)
    
    # 클러스터 중심과의 거리 계산
    distances = calculate_cluster_distances(input_vec, kmeans.cluster_centers_)
    
    # 거리에 기반한 유사성 점수 계산 
    scores = 1 / (1 + distances)  
    
    # 클러스터 추클
    recommended_cluster_indices = np.argsort(scores)[::-1] 
    
    scores_dict = {cluster_index: scores[cluster_index] for cluster_index in recommended_cluster_indices if scores[cluster_index] > 0}

    return scores_dict

In [21]:
def recommend_videos(df, cluster_scores, kmeans):

    recommend = []
    for cluster_index in sorted(cluster_scores, key=cluster_scores.get, reverse=True):
        # 클러스터에 속한 영상 필터링
        cluster_videos = df[df['cluster'] == cluster_index]
        
        # 클러스터 중심과의 거리 계산
        distances = calculate_cluster_distances(vectorizer.transform(cluster_videos['tags']), [kmeans.cluster_centers_[cluster_index]])
        
        # 거리에 따라 영상 정렬
        cluster_videos['distance'] = distances
        sorted_videos = cluster_videos.sort_values(by='distance')
        
        # 정렬된 영상 링크 추가
        recommend.extend(sorted_videos['링크'].tolist())
    
    return recommend

## Step 4. Apply the Algorithm

In [22]:
input_tags = ['breakup']
cluster_scores = recommend_clusters(input_tags, vectorizer, kmeans)
recommended_videos = recommend_videos(df, cluster_scores, kmeans)

for url in recommended_videos[:5]:  
    print(url)


https://youtu.be/rX571wvjQcE?si=JboH280V13mFY32R
https://youtu.be/wc9_eeKCeQw?si=N5OlT_xDVPLCud0V
https://youtu.be/7XCx1XcVP5w?feature=shared
https://youtu.be/BNEOKti4BkE?feature=shared
https://youtu.be/0KfrNy4bUrU?feature=shared


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_videos['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_videos['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_videos['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in