In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter

In [2]:
df = pd.read_csv('clean-dataset.csv')
df.head()

Unnamed: 0,mood,activities
0,good,bermain game | makan sehat | menonton film
1,good,merasa bahagia | merasa bersemangat
2,keren,bersih-bersih
3,keren,bertemu teman | menonton film | bermain game |...
4,baik,menonton film | bermain game | bersih-bersih |...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   mood        666 non-null    object
 1   activities  666 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [4]:
def custom_tokenizer(text):
    tokens = re.split(r'[^a-zA-Z0-9]', text)
    tokens = [token for token in tokens if token.strip() != '']
    return tokens

In [5]:
df['processed_activities'] = df['activities'].str.replace('|', ' ', regex=False)

In [6]:
df

Unnamed: 0,mood,activities,processed_activities
0,good,bermain game | makan sehat | menonton film,bermain game makan sehat menonton film
1,good,merasa bahagia | merasa bersemangat,merasa bahagia merasa bersemangat
2,keren,bersih-bersih,bersih-bersih
3,keren,bertemu teman | menonton film | bermain game |...,bertemu teman menonton film bermain game ...
4,baik,menonton film | bermain game | bersih-bersih |...,menonton film bermain game bersih-bersih ...
...,...,...,...
661,Baik,Dota 2 | menonton seri | youtube | berpergian ...,Dota 2 menonton seri youtube berpergian ...
662,Luar biasa,membaca | mandi | makan enak | youtube | hal b...,membaca mandi makan enak youtube hal b...
663,Baik,puasa | berjalan | meditasi | youtube,puasa berjalan meditasi youtube
664,Baik,menulis jurnal | berjalan | meditasi | mendeng...,menulis jurnal berjalan meditasi mendeng...


In [7]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
tfidf_matrix = vectorizer.fit_transform(df['processed_activities'])



In [8]:
vectorizer

In [9]:
tfidf_matrix.shape

(666, 181)

In [10]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.0855867 ,
        0.39178289],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.3030346 ,
        0.07255254],
       [0.0855867 , 0.        , 0.        , ..., 0.3030346 , 1.        ,
        0.23898703],
       [0.39178289, 0.        , 0.        , ..., 0.07255254, 0.23898703,
        1.        ]])

In [11]:
score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
score


array([[1.        , 0.        , 0.        , 0.45264591, 0.38852734,
        0.16416999, 0.45281041, 0.        , 0.47251158, 0.06940008,
        0.0734379 , 0.        , 0.        , 0.        , 0.        ,
        0.07611084, 0.34537058, 0.        , 0.22324054, 0.60209368,
        0.24513037, 0.        , 0.40492071, 0.19149704, 0.44547583,
        0.25146378, 0.65075216, 0.31275909, 0.38411956, 0.        ,
        0.        , 0.        , 0.        , 0.2506942 , 0.25981891,
        0.25587784, 0.24411315, 0.23860496, 0.23403071, 0.2191366 ,
        0.24072242, 0.2310956 , 0.2467577 , 0.19680906, 0.01783136,
        0.24153232, 0.01932404, 0.18486191, 0.23877849, 0.01947177,
        0.01902178, 0.02135286, 0.14560311, 0.01859494, 0.02017201,
        0.13060567, 0.02033481, 0.02011142, 0.02162692, 0.01872558,
        0.02076044, 0.02032689, 0.0213498 , 0.01972303, 0.01835694,
        0.02078926, 0.02270751, 0.02184698, 0.02198738, 0.01991937,
        0.01758327, 0.01633967, 0.01725772, 0.01

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

def recommend_activities(input_activities, df, vectorizer, tfidf_matrix, top_n=5):

    input_vector = vectorizer.transform([input_activities])
    input_cosine_sim = cosine_similarity(input_vector, tfidf_matrix)

    sim_scores = list(enumerate(input_cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


    sim_scores = sim_scores[1:top_n + 1]
    activity_indices = [i[0] for i in sim_scores]

    similar_activities_df = df.iloc[activity_indices][['activities']].copy()

    print(similar_activities_df)

    similar_activities_df['activities_list'] = similar_activities_df['activities'].apply(
        lambda x: [activity.strip() for activity in x.split('|')]
    )

    all_activities = [activity for sublist in similar_activities_df['activities_list'] for activity in sublist]
    
    input_activities_list = [activity.strip() for activity in input_activities.split('|')]
    filtered_activities = [activity for activity in all_activities if activity not in input_activities_list]

    activity_counts = Counter(filtered_activities)

    sorted_activities = sorted(activity_counts.keys(), key=lambda x: activity_counts[x], reverse=True)

    return sorted_activities


In [19]:
input_activities = "main game | berjalan"
result = recommend_activities(input_activities, df, vectorizer, tfidf_matrix, top_n=5)
print(result)

                                            activities
6                      membaca | bermain game | coding
534  streaming | youtube | bermain game | hal baru ...
458  mendengarkan podcast | memperbarui berita | st...
531  mendengarkan podcast | streaming | youtube | b...
453  meditasi | mendengarkan podcast | memperbarui ...
['bermain game', 'youtube', 'streaming', 'mendengarkan podcast', 'memperbarui berita', 'membaca', 'coding', 'hal baru', 'tidur siang', 'berkirim surat', 'meditasi', 'makan enak']


In [20]:
import pickle

# Simpan vectorizer
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

# Simpan tfidf_matrix
with open('tfidf_matrix.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf_matrix, tfidf_file)

# Simpan df
with open('dataset.pkl', 'wb') as dataset_file:
    pickle.dump(df, dataset_file)