## Step 1. Load the Data

In [25]:
import pandas as pd

file_path = r'C:\Users\user\Downloads\2024 솔챌\유투브영상수집.xlsx'
df = pd.read_excel(file_path)


In [26]:
#pip install openpyxl

## Step 2. Define Input Processing

In [27]:
# List of all emotion and event columns in the dataset
emotion_columns = ['angry', 'sad', 'anxious', 'hurt', 'embarrassed', 'happy']
event_columns = ['relationship', 'breakup', 'family', 'work', 'studying', 'insomnia', 'people', 'health', 'self-esteem', 'money', 'friends']

# Function to concatenate applicable tags for each video
def create_tags(row):
    tags = []
    for col in emotion_columns + event_columns:
        if row[col] == 1:  # Assuming 1 indicates an applicable tag
            tags.append(col)
    return ' '.join(tags)  # Concatenating all tags into a single string

# Applying the function to each row in the dataframe to create the 'tags' column
df['tags'] = df.apply(create_tags, axis=1)

## Step 3. Implement the Recommendation Logic

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [29]:
# 영상에 클러스터 라벨링 
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['tags'])
kmeans = KMeans(n_clusters=20)
kmeans.fit(X)
df['cluster'] = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
df['cluster']

0     17
1     17
2     17
3     17
4     17
      ..
84     2
85     2
86     2
87     2
88    18
Name: cluster, Length: 89, dtype: int32

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

def vectorize_input_tags(input_tags, vectorizer): 
    """
    입력 태그를 벡터화합니다.
    """
    return vectorizer.transform([" ".join(input_tags)])

def calculate_cluster_distances(input_vec, cluster_centers):
    """
    입력 벡터와 각 클러스터 중심과의 유클리드 거리를 계산합니다.
    """
    distances = euclidean_distances(input_vec, cluster_centers)
    return distances.flatten()

def recommend_clusters(input_tags, vectorizer, kmeans):
    """
    입력 태그에 기반하여 클러스터를 추천합니다.
    """
    # 입력 태그 벡터화
    input_vec = vectorize_input_tags(input_tags, vectorizer)
    
    # 클러스터 중심과의 거리 계산
    distances = calculate_cluster_distances(input_vec, kmeans.cluster_centers_)
    
    # 거리에 기반한 유사성 점수 계산 
    scores = 1 / (1 + distances)  
    
    # 유사성 점수에 기반하여 클러스터 추천
    recommended_cluster_indices = np.argsort(scores)[::-1]  # 점수가 높은 순으로 정렬
    
    scores_dict = {cluster_index: scores[cluster_index] for cluster_index in recommended_cluster_indices if scores[cluster_index] > 0}

    return scores_dict

In [32]:
def recommend_videos(df, cluster_scores, kmeans):
    """
    유사도 점수가 높은 클러스터에 해당하는 영상을 추천합니다.
    """
    recommend = []
    for cluster_index in sorted(cluster_scores, key=cluster_scores.get, reverse=True):
        # 클러스터에 속한 영상 필터링
        cluster_videos = df[df['cluster'] == cluster_index]
        
        # 클러스터 중심과의 거리 계산
        distances = calculate_cluster_distances(vectorizer.transform(cluster_videos['tags']), [kmeans.cluster_centers_[cluster_index]])
        
        # 거리에 따라 영상 정렬
        cluster_videos['distance'] = distances
        sorted_videos = cluster_videos.sort_values(by='distance')
        
        # 정렬된 영상 링크 추가
        recommend.extend(sorted_videos['링크'].tolist())
    
    return recommend

## Step 4. Apply the Algorithm

In [37]:
input_tags = ['angry','friends']
cluster_scores = recommend_clusters(input_tags, vectorizer, kmeans)
recommended_videos = recommend_videos(df, cluster_scores, kmeans)

# 추천된 영상 링크 출력
for url in recommended_videos[:5]:  # 상위 5개 추천 영상 출력
    print(url)


https://youtu.be/mLLtYA-VcAA?feature=shared
https://youtu.be/9_hFr1C-xhw?feature=shared
https://youtu.be/m6BHmR4UME0?feature=shared
https://youtu.be/x4kNrszQ9Kg?feature=shared
https://youtu.be/YtYIb8Qy8mw?feature=shared


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_videos['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_videos['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_videos['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in