In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter

In [2]:
dataf = pd.read_csv('Daylio_Abid.csv')
dataf.head()

Unnamed: 0,full_date,date,weekday,time,sub_mood,activities,mood
0,16/04/2021,Apr-16,Friday,8:00 pm,yolo,reading | Art | prayer | fasting | walk | med...,Good
1,15/04/2021,Apr-15,Thursday,2:37 am,focused,reading | learning | Art | prayer | fasting ...,Good
2,14/04/2021,Apr-14,Wednesday,2:39 am,confused,reading | learning | prayer | fasting | Qura...,Normal
3,13/04/2021,Apr-13,Tuesday,2:38 am,wondering,reading | learning | Art | prayer | fasting ...,Normal
4,12/04/2021,Apr-12,Monday,9:52 pm,angry,reading | learning | fasting | walk | medita...,Awful


In [3]:
dataf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   full_date   940 non-null    object
 1   date        940 non-null    object
 2   weekday     940 non-null    object
 3   time        940 non-null    object
 4   sub_mood    940 non-null    object
 5   activities  893 non-null    object
 6   mood        940 non-null    object
dtypes: object(7)
memory usage: 51.5+ KB


In [4]:
dataf = dataf.drop(['full_date', 'date', 'weekday', 'time', 'sub_mood'], axis=1)

In [5]:
dataf.head()

Unnamed: 0,activities,mood
0,reading | Art | prayer | fasting | walk | med...,Good
1,reading | learning | Art | prayer | fasting ...,Good
2,reading | learning | prayer | fasting | Qura...,Normal
3,reading | learning | Art | prayer | fasting ...,Normal
4,reading | learning | fasting | walk | medita...,Awful


In [6]:
dataFrame = dataf[dataf['mood'].isin(['Good', 'Amazing', 'Normal'])]

In [7]:
dataFrame.head()

Unnamed: 0,activities,mood
0,reading | Art | prayer | fasting | walk | med...,Good
1,reading | learning | Art | prayer | fasting ...,Good
2,reading | learning | prayer | fasting | Qura...,Normal
3,reading | learning | Art | prayer | fasting ...,Normal
5,reading | learning | prayer | fasting | walk...,Normal


In [8]:
dataFrame.isnull().sum()

Unnamed: 0,0
activities,41
mood,0


In [9]:
dataFrame = dataFrame.dropna()
dataFrame.isnull().sum()


Unnamed: 0,0
activities,0
mood,0


In [10]:
dataFrame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 799 entries, 0 to 939
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   activities  799 non-null    object
 1   mood        799 non-null    object
dtypes: object(2)
memory usage: 18.7+ KB


In [11]:
def custom_tokenizer(text):
    tokens = re.split(r'[^a-zA-Z0-9]', text)
    tokens = [token for token in tokens if token.strip() != '']
    return tokens

In [12]:
data = dataFrame
df = pd.DataFrame(data)

In [13]:
df['processed_activities'] = df['activities'].str.replace('|', ' ', regex=False)

In [14]:
df.head()

Unnamed: 0,activities,mood,processed_activities
0,reading | Art | prayer | fasting | walk | med...,Good,reading Art prayer fasting walk med...
1,reading | learning | Art | prayer | fasting ...,Good,reading learning Art prayer fasting ...
2,reading | learning | prayer | fasting | Qura...,Normal,reading learning prayer fasting Qura...
3,reading | learning | Art | prayer | fasting ...,Normal,reading learning Art prayer fasting ...
5,reading | learning | prayer | fasting | walk...,Normal,reading learning prayer fasting walk...


In [15]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
tfidf_matrix = vectorizer.fit_transform(df['processed_activities'])



In [16]:
vectorizer

In [17]:
tfidf_matrix.shape

(799, 67)

In [18]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.74519702, 0.62974863, ..., 0.11476642, 0.184265  ,
        0.42048442],
       [0.74519702, 1.        , 0.79086985, ..., 0.2295942 , 0.15388007,
        0.27598877],
       [0.62974863, 0.79086985, 1.        , ..., 0.2437595 , 0.16337403,
        0.29301648],
       ...,
       [0.11476642, 0.2295942 , 0.2437595 , ..., 1.        , 0.10311881,
        0.34181165],
       [0.184265  , 0.15388007, 0.16337403, ..., 0.10311881, 1.        ,
        0.        ],
       [0.42048442, 0.27598877, 0.29301648, ..., 0.34181165, 0.        ,
        1.        ]])

In [20]:
score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
score


array([[1.        , 0.74519702, 0.62974863, 0.72457155, 0.79109155,
        0.83704463, 0.84679278, 0.82479792, 0.68282785, 0.66771408,
        0.68262609, 0.64129477, 0.86408004, 0.84102724, 0.84273277,
        0.67930875, 0.67046865, 0.82481574, 0.65329939, 0.82766742,
        0.79807168, 0.76128278, 0.86847393, 0.8368297 , 0.72204176,
        0.7694658 , 0.76453469, 0.72052064, 0.73339535, 0.67935311,
        0.73853048, 0.66537552, 0.74259372, 0.61636963, 0.76457174,
        0.72611841, 0.82302724, 0.693842  , 0.73634888, 0.74516012,
        0.80457702, 0.54676089, 0.76629436, 0.76092186, 0.74097777,
        0.7544539 , 0.5599584 , 0.61254794, 0.70357718, 0.63815535,
        0.74629703, 0.75307901, 0.62734706, 0.75397755, 0.8128032 ,
        0.79830729, 0.73552865, 0.67613679, 0.73817793, 0.67176595,
        0.70045421, 0.66570583, 0.71584806, 0.68387089, 0.68875637,
        0.68309862, 0.64855765, 0.68793374, 0.6033543 , 0.79259598,
        0.69996159, 0.53387373, 0.71491909, 0.64

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

def recommend_activities(input_activities, df, vectorizer, tfidf_matrix, top_n=5):

    input_vector = vectorizer.transform([input_activities])
    input_cosine_sim = cosine_similarity(input_vector, tfidf_matrix)

    sim_scores = list(enumerate(input_cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


    sim_scores = sim_scores[1:top_n + 1]
    activity_indices = [i[0] for i in sim_scores]

    similar_activities_df = df.iloc[activity_indices][['activities']].copy()

    similar_activities_df['activities_list'] = similar_activities_df['activities'].apply(
        lambda x: [activity.strip() for activity in x.split('|')]
    )

    all_activities = [activity for sublist in similar_activities_df['activities_list'] for activity in sublist]
    input_activities_list = [activity.strip() for activity in input_activities.split('|')]
    filtered_activities = [activity for activity in all_activities if activity not in input_activities_list]

    activity_counts = Counter(filtered_activities)

    sorted_activities = sorted(activity_counts.keys(), key=lambda x: activity_counts[x], reverse=True)

    return sorted_activities


In [33]:
input_activities = "meditation | walk"
result = recommend_activities(input_activities, df, vectorizer, tfidf_matrix, top_n=5)
print(result)

['fasting', 'youtube', 'Audio books', 'News Update', 'power nap']
