# **1. Import Library**

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import joblib
import os
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

#**2. Data Understanding**

In [67]:
video = pd.read_csv("youtube_keywords_dataset.csv")
video

Unnamed: 0,Keyword,Title,Link
0,Anxious,This could be why you're depressed or anxious ...,https://www.youtube.com/watch?v=MB5IX-np5fE
1,Anxious,How to stop feeling anxious about anxiety | Ti...,https://www.youtube.com/watch?v=ZidGozDhOjg
2,Anxious,Decrease your anxiety by 20% with EFT Tapping ...,https://www.youtube.com/watch?v=mjDq53bzhHo
3,Anxious,Dr. Gabor Mat√© on how chronic anxiety begins. ...,https://www.youtube.com/watch?v=7DYIyJPNKoY
4,Anxious,What's normal anxiety -- and what's an anxiety...,https://www.youtube.com/watch?v=xsEJ6GeAGb0
...,...,...,...
149,Loneliness,The Cure for Loneliness,https://www.youtube.com/watch?v=kWcQyEHRudE
150,Loneliness,Loneliness vs being alone üôè #shorts,https://www.youtube.com/watch?v=c1QMe7JD9zo
151,Loneliness,How to Stop Feeling Lonely,https://www.youtube.com/watch?v=TUNuYhkALEY
152,Loneliness,4 Hidden Signs Of Loneliness,https://www.youtube.com/watch?v=jF_4YwpgqM4


In [68]:
video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Keyword  154 non-null    object
 1   Title    154 non-null    object
 2   Link     154 non-null    object
dtypes: object(3)
memory usage: 3.7+ KB


In [69]:
video.describe()

Unnamed: 0,Keyword,Title,Link
count,154,154,154
unique,5,152,152
top,Anxious,This could be why you're depressed or anxious ...,https://www.youtube.com/watch?v=MB5IX-np5fE
freq,50,2,2


#**3. Data Preparation**

In [70]:
#Change youtube link into embed link
video['Link'] = video['Link'].apply(lambda x: x.replace('watch?v=', 'embed/'))

In [71]:
# Combine title and keywords
video['content'] = video['Title'] + ' ' + video['Keyword']

In [72]:
recom_features = video[['content', 'Link']]

recom_features

Unnamed: 0,content,Link
0,This could be why you're depressed or anxious ...,https://www.youtube.com/embed/MB5IX-np5fE
1,How to stop feeling anxious about anxiety | Ti...,https://www.youtube.com/embed/ZidGozDhOjg
2,Decrease your anxiety by 20% with EFT Tapping ...,https://www.youtube.com/embed/mjDq53bzhHo
3,Dr. Gabor Mat√© on how chronic anxiety begins. ...,https://www.youtube.com/embed/7DYIyJPNKoY
4,What's normal anxiety -- and what's an anxiety...,https://www.youtube.com/embed/xsEJ6GeAGb0
...,...,...
149,The Cure for Loneliness Loneliness,https://www.youtube.com/embed/kWcQyEHRudE
150,Loneliness vs being alone üôè #shorts Loneliness,https://www.youtube.com/embed/c1QMe7JD9zo
151,How to Stop Feeling Lonely Loneliness,https://www.youtube.com/embed/TUNuYhkALEY
152,4 Hidden Signs Of Loneliness Loneliness,https://www.youtube.com/embed/jF_4YwpgqM4


In [73]:
# Vectorization text features 'content
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
content_features = vectorizer.fit_transform(recom_features['content'])

#**4. Model Implementation**

In [74]:
def recommend_video(input_text, top_n=5, similarity_threshold=0.2):
  input_vector = vectorizer.transform([input_text])
  similarity_scores = cosine_similarity(input_vector, content_features).flatten()
  filtered_indices = np.where(similarity_scores >= similarity_threshold)[0]

  if len(filtered_indices) == 0:
    return pd.DataFrame({'Message': ['No relevant videos found']})

  sorted_indices = filtered_indices[np.argsort(similarity_scores[filtered_indices])[::-1]]
  final_indices = sorted_indices[:top_n]

  results = video.iloc[final_indices][['content', 'Link']].copy()
  results['Similarity'] = similarity_scores[final_indices]

  return results.reset_index(drop=True)

In [75]:
recommend_video(
    input_text='anxiety',
    top_n=10
)

Unnamed: 0,content,Link,Similarity
0,What is Anxiety? Anxious,https://www.youtube.com/embed/BVJkf8IuRjE,0.489469
1,ADHD or Anxiety? Anxious,https://www.youtube.com/embed/yJM_GeKdZ6c,0.253808
2,7 Signs Of Anxiety Anxious,https://www.youtube.com/embed/dM--lHKJeA4,0.2501
3,this is what anxiety feels like #shorts #adhd ...,https://www.youtube.com/embed/tnFPhNNo4Fw,0.245444
4,4 embarrassing anxiety symptoms #mentalhealth ...,https://www.youtube.com/embed/orfJPm6WuRE,0.243313
5,The 5 signs of social anxiety #socialanxiety #...,https://www.youtube.com/embed/zJ6yegVE7WM,0.239743
6,4 Physical Symptoms Caused by Your Anxiety #an...,https://www.youtube.com/embed/CYh8Q5JTXo4,0.205515


# **5. Evaluasi**

In [76]:
def precision_at_k(input_text, top_k=5, similarity_threshold=0.2):
    recommendation = recommend_video(input_text, top_n=None)

    if 'Message' in recommendation.columns:
        print(recommendation['Message'][0])
        return 0.0

    filtered = recommendation[recommendation['Similarity'] >= similarity_threshold]

    if len(filtered) < top_k:
        print(f"Warning: Only {len(filtered)} relevant videos found above threshold.")
        sampled = filtered.sample(n=len(filtered))
    else:
        sampled = filtered.sample(n=top_k)

    relevan = (sampled['Similarity'] >= similarity_threshold).sum()
    precision = relevan / top_k

    print(f"Precision@{top_k}: {precision:.2f} (Input: '{input_text}' match with {relevan} from {top_k} random recommendations)\n")
    print("Recommendation Details:")
    print(sampled[['content', 'Link', 'Similarity']])

    return precision


In [77]:
precision_at_k(
    input_text='loneliness',
    top_k=2
)

Precision@2: 1.00 (Input: 'loneliness' match with 2 from 2 random recommendations)

Recommendation Details:
                                             content  \
2             How to Deal With Loneliness Loneliness   
5  Being Alone is not the same as Being Lonely Lo...   

                                        Link  Similarity  
2  https://www.youtube.com/embed/LDMY7qtOPiI    0.458912  
5  https://www.youtube.com/embed/lxD9Y5fYdFw    0.409931  


np.float64(1.0)

#**6. Save Model**

In [78]:
def save_video_recommendation_model(vectorizer, content_features, video_df, folder_path='model'):
  os.makedirs(folder_path, exist_ok=True)

  joblib.dump(vectorizer, f'{folder_path}/vectorizer.pkl')
  joblib.dump(content_features, f'{folder_path}/content_features.pkl')
  joblib.dump(video_df, f'{folder_path}/video_recommedation_dataset.pkl')

  print(f"Models and data successfully loaded")

In [79]:
save_video_recommendation_model(vectorizer, content_features, video)

Models and data successfully loaded


In [80]:
def load_video_recommendation_model(folder_path='model'):
    vectorizer = joblib.load(f'{folder_path}/vectorizer.pkl')
    content_features = joblib.load(f'{folder_path}/content_features.pkl')
    video_df = joblib.load(f'{folder_path}/video_recommedation_dataset.pkl')

    print(f"Models and data successfully loaded")
    return vectorizer, content_features, video_df

In [81]:
vectorizer, content_features, video = load_video_recommendation_model()

Models and data successfully loaded
