In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter

In [2]:
df = pd.read_csv('clean-dataset.csv')
df.head()

Unnamed: 0,mood,activities
0,good,unsure | bored | bad sleep | movies | gaming |...
1,good,happy | excited | bad sleep
2,cool,family | cleaning
3,cool,family | friends | movies | games | coding
4,good,family | movies | games | cleaning | coding


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   mood        666 non-null    object
 1   activities  666 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [4]:
def custom_tokenizer(text):
    tokens = re.split(r'[^a-zA-Z0-9]', text)
    tokens = [token for token in tokens if token.strip() != '']
    return tokens

In [5]:
df['processed_activities'] = df['activities'].str.replace('|', ' ', regex=False)

In [6]:
df

Unnamed: 0,mood,activities,processed_activities
0,good,unsure | bored | bad sleep | movies | gaming |...,unsure bored bad sleep movies gaming ...
1,good,happy | excited | bad sleep,happy excited bad sleep
2,cool,family | cleaning,family cleaning
3,cool,family | friends | movies | games | coding,family friends movies games coding
4,good,family | movies | games | cleaning | coding,family movies games cleaning coding
...,...,...,...
661,good,dota 2 | watching series | youtube | travel |...,dota 2 watching series youtube travel ...
662,amazing,reading | shower | good meal | youtube | new t...,reading shower good meal youtube new t...
663,good,fasting | walk | meditation | youtube,fasting walk meditation youtube
664,good,write dairy | walk | meditation | audio book...,write dairy walk meditation audio book...


In [7]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
tfidf_matrix = vectorizer.fit_transform(df['processed_activities'])



In [8]:
vectorizer

In [9]:
tfidf_matrix.shape

(666, 113)

In [10]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.34906131, 0.        , ..., 0.        , 0.        ,
        0.15209292],
       [0.34906131, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.29254802,
        0.10909764],
       [0.        , 0.        , 0.        , ..., 0.29254802, 1.        ,
        0.18163581],
       [0.15209292, 0.        , 0.        , ..., 0.10909764, 0.18163581,
        1.        ]])

In [11]:
score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
score


array([[1.        , 0.34906131, 0.        , 0.07783432, 0.0863158 ,
        0.11287285, 0.        , 0.40503707, 0.        , 0.08689276,
        0.18793958, 0.0421614 , 0.06116771, 0.04229868, 0.05145596,
        0.08696932, 0.        , 0.        , 0.12333974, 0.43963181,
        0.12566997, 0.        , 0.40716061, 0.13525777, 0.22487187,
        0.1183689 , 0.48990884, 0.45425755, 0.33483566, 0.15667549,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.06334251, 0.        , 0.        ,
        0.06272973, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

def recommend_activities(input_activities, df, vectorizer, tfidf_matrix, top_n=5):

    input_vector = vectorizer.transform([input_activities])
    input_cosine_sim = cosine_similarity(input_vector, tfidf_matrix)

    sim_scores = list(enumerate(input_cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


    sim_scores = sim_scores[1:top_n + 1]
    activity_indices = [i[0] for i in sim_scores]

    similar_activities_df = df.iloc[activity_indices][['activities']].copy()

    print(similar_activities_df)

    similar_activities_df['activities_list'] = similar_activities_df['activities'].apply(
        lambda x: [activity.strip() for activity in x.split('|')]
    )

    all_activities = [activity for sublist in similar_activities_df['activities_list'] for activity in sublist]
    
    input_activities_list = [activity.strip() for activity in input_activities.split('|')]
    filtered_activities = [activity for activity in all_activities if activity not in input_activities_list]

    activity_counts = Counter(filtered_activities)

    sorted_activities = sorted(activity_counts.keys(), key=lambda x: activity_counts[x], reverse=True)

    return sorted_activities


In [17]:
input_activities = "walking | fasting | youtube"
result = recommend_activities(input_activities, df, vectorizer, tfidf_matrix, top_n=5)
print(result)

                                            activities
663            fasting  | walk | meditation  | youtube
15   excited | fairly good sleep | drink water | wa...
574           prayer | fasting  | streaming  | youtube
10   grateful | relaxing | unsure | anxious | fairl...
14   excited | grateful | fairly good sleep | drink...
['fairly good sleep', 'drink water', 'reading', 'homemade food', 'family', 'kindness', 'doing the task', 'cleaning', 'studying', 'meditation', 'excited', 'movies', 'friends', 'focus', 'cloudy', 'grateful', 'listening', 'walk', 'rainy', 'prayer', 'streaming', 'relaxing', 'unsure', 'anxious', 'resting', 'shopping', 'clear', 'classes', 'homework', 'exams']


In [14]:
import pickle

# Simpan vectorizer
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

# Simpan tfidf_matrix
with open('tfidf_matrix.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf_matrix, tfidf_file)

# Simpan df
with open('dataset.pkl', 'wb') as dataset_file:
    pickle.dump(df, dataset_file)