In [14]:
# TODO: learn more about cosine_similarity and the Pearson correlation coefficient

In [4]:
from typing import List, Union

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
# Read the codeex quiz performance dataset (one row per user/quiz record).
performance_df = pd.read_csv("codeex_quiz_performance.csv")

# Rename the columns positionally: the CSV must contain exactly these twelve
# columns, in this order, otherwise the assignment below raises or mislabels data.
performance_df.columns = [
    # user
    "user_id", "full_name",
    # quiz
    "quiz_id", "quiz_points", "quiz_difficulty", "time_limit", "total_sub_quizzes",
    # topic
    "topic_id", "topic_label",
    # recorded result
    "rewarded_points", "completion_time", "total_completed_sub_quizzes",
]

Unnamed: 0,user_id,full_name,quiz_id,quiz_points,quiz_difficulty,time_limit,total_sub_quizzes,topic_id,topic_label,rewarded_points,completion_time,total_completed_sub_quizzes
0,CQU0001,John Doe,CQQ0001,50,2,200,10,CQT0001,Software Engineering,20,60,5
1,CQU0002,Jane Doe,CQQ0002,90,3,300,8,CQT0002,Web Development,80,140,7
2,CQU0003,Michael Smith,CQQ0003,100,1,120,6,CQT0001,Software Engineering,50,24,2
3,CQU0004,Alice Johnson,CQQ0004,30,2,80,4,CQT0003,Programming,0,24,1
4,CQU0005,Robert Brown,CQQ0005,70,3,180,9,CQT0002,Web Development,35,54,6
...,...,...,...,...,...,...,...,...,...,...,...,...
106,CQU0002,Jane Doe,CQQ0108,70,2,200,10,CQT0001,Software Engineering,35,80,5
107,CQU0007,David Kim,CQQ0109,80,3,300,8,CQT0002,Web Development,40,140,7
108,CQU0013,Emma Davis,CQQ0110,70,1,80,4,CQT0003,Programming,35,16,2
109,CQU0001,John Doe,CQQ0111,50,2,200,10,CQT0001,Software Engineering,25,80,5


In [6]:
def calculate_user_performance_score(
    performance: Union[pd.Series, pd.DataFrame],
) -> Union[float, pd.Series]:
    """
    Compute the performance score of a single quiz record.

    The function is normally used row-wise, e.g.
    ``performance_df.apply(calculate_user_performance_score, axis=1)``.

    Parameters
    ----------
    performance:
        One record (a ``pd.Series``) providing the fields ``quiz_points``,
        ``quiz_difficulty``, ``time_limit``, ``total_sub_quizzes``,
        ``rewarded_points``, ``completion_time`` and
        ``total_completed_sub_quizzes``. For backward compatibility a whole
        ``pd.DataFrame`` with those columns is accepted as well and is scored
        row by row.

    Returns
    -------
    float
        The score of the record (a ``pd.Series`` of scores, aligned on the
        frame's index, when a ``DataFrame`` was passed). ``0.0`` is returned
        when any quantity used as a divisor (``completion_time``,
        ``total_completed_sub_quizzes``, ``time_limit``, ``total_sub_quizzes``
        or ``quiz_points``) is zero, instead of raising ``ZeroDivisionError``
        or producing ``inf``/``NaN`` that would poison the per-topic averages.

    Notes
    -----
    ``total_points_possible`` is the maximum number of points a quiz can be
    worth (100). It is meant to normalise scores across quizzes with different
    point values; in the current formula it cancels out against the final
    ``* 100``.

    Algebraically the expression below reduces to::

        difficulty_rate * rewarded_points * time_limit**2 * total_sub_quizzes
        / (completion_time**2 * total_completed_sub_quizzes**2)

    NOTE(review): as written, the score *increases* when fewer sub quizzes are
    completed, and every quiz with ``quiz_difficulty == 1`` scores exactly 0
    because ``difficulty_rate`` becomes 0. Confirm that both are intended.
    """
    if isinstance(performance, pd.DataFrame):
        # Backward-compatible path: score each row with the scalar logic below.
        row_scores = [
            calculate_user_performance_score(row)
            for _, row in performance.iterrows()
        ]
        return pd.Series(row_scores, index=performance.index, dtype=float)

    total_points_possible = 100  # i.e. at most 10 sub quizzes worth 10 points each

    quiz_points = performance["quiz_points"]
    time_limit = performance["time_limit"]
    completion_time = performance["completion_time"]
    total_sub_quizzes = performance["total_sub_quizzes"]
    total_completed_sub_quizzes = performance["total_completed_sub_quizzes"]

    # Every one of these values is used as a divisor below; a zero means the
    # score is undefined, so treat the record as "no performance".
    divisors = (
        quiz_points,
        time_limit,
        completion_time,
        total_sub_quizzes,
        total_completed_sub_quizzes,
    )
    if any(value == 0 for value in divisors):
        return 0.0

    time_factor = time_limit / completion_time
    completion_factor = total_sub_quizzes / total_completed_sub_quizzes
    completion_time_factor = completion_time / time_limit
    total_completed_sub_quizzes_factor = total_completed_sub_quizzes / total_sub_quizzes
    # Maps difficulty 1..3 onto 0.0..1.0.
    difficulty_rate = (performance["quiz_difficulty"] - 1) / 2
    rewarded_points_factor = performance["rewarded_points"] / quiz_points

    # Operand order is kept exactly as before so valid rows produce the same
    # floating-point result.
    numerator = (
        quiz_points * difficulty_rate * time_factor * completion_factor * rewarded_points_factor
    )
    denominator = (
        total_points_possible
        * total_sub_quizzes
        * completion_time_factor
        * total_completed_sub_quizzes_factor
    )
    return float(numerator / denominator * 100)

In [7]:
# Score every row of the performance table and keep the result in a new
# "scores" column of the same dataframe.
performance_df["scores"] = performance_df.apply(
    calculate_user_performance_score,
    axis="columns",
)

In [54]:
# Mean score of each user on each topic, flattened back into a regular
# dataframe with the columns user_id, topic_id and scores.
result = (
    performance_df
    .groupby(by=['user_id', 'topic_id'])['scores']
    .mean()
)
topic_scores = result.reset_index()

In [52]:
# Topic performance matrix: one row per user, one column per topic.
# (user, topic) pairs without any score are filled with 0.
topic_performance = (
    topic_scores
    .pivot(index='user_id', columns='topic_id', values='scores')
    .fillna(value=0)
)

In [50]:
# Pairwise cosine similarity between the users' topic-performance rows;
# entry [i, j] compares row i with row j of topic_performance.
users_similarity = cosine_similarity(topic_performance.to_numpy())

In [56]:
# get the ids of the most similar users ("important neighbors") for a given user
def get_important_neighbors_ids(
    user_similarity: np.ndarray,
    users_id: List[str],
    important_neighbors_count: int = 10,
) -> pd.Index:
    """
    Return the ids of the users with the highest similarity values.

    Parameters
    ----------
    user_similarity:
        1-D array of similarity values of the target user against every user,
        in the same order as ``users_id``.
    users_id:
        User ids labelling the entries of ``user_similarity`` (any sequence or
        ``pd.Index`` of the same length).
    important_neighbors_count:
        Maximum number of ids to return. Defaults to 10, the value that used
        to be hard-coded.

    Returns
    -------
    pd.Index
        Up to ``important_neighbors_count`` user ids, ordered from most to
        least similar. Ties keep the order in which they appear in
        ``users_id`` (a stable sort is used), so the result is reproducible.

    Raises
    ------
    ValueError
        If ``important_neighbors_count`` is negative.

    Notes
    -----
    Nothing is filtered out: when ``user_similarity`` is a row of a full
    user-by-user similarity matrix, the target user's own id is part of the
    candidates and will usually rank first. Callers that need strict
    neighbours must drop it themselves.
    """
    if important_neighbors_count < 0:
        raise ValueError("important_neighbors_count must be non-negative")

    target_user_neighbors = pd.Series(user_similarity, index=users_id)
    # mergesort is stable, which keeps tied users in a deterministic order at
    # the cut-off instead of depending on the sort implementation.
    ranked_neighbors = target_user_neighbors.sort_values(ascending=False, kind="mergesort")
    return ranked_neighbors.index[:important_neighbors_count]

In [57]:
# One row per user (indexed by user id); the columns hold the ids returned by
# get_important_neighbors_ids for that user's row of the similarity matrix.
users_id = topic_performance.index
users_important_neighbors_df = pd.DataFrame(
    data=[
        get_important_neighbors_ids(similarity_row, users_id)
        for similarity_row in users_similarity
    ],
    index=users_id,
)

In [92]:
# For every user (same order as users_id), collect the set of quiz ids that
# appear in performance_df for any of that user's important neighbors.
# NOTE(review): the neighbor list is not filtered, so when a user is listed
# among their own neighbors their own quiz ids end up in the suggestions too.
# Confirm whether those should be removed.
quizzes_to_suggest_for_users = np.array(
    [
        set(
            performance_df.loc[
                performance_df["user_id"].isin(neighbor_ids),
                "quiz_id",
            ].values
        )
        for neighbor_ids in users_important_neighbors_df.values
    ],
    dtype=object,  # one Python set per user
)

# Show the suggestions next to the user ids they belong to.
quizzes_to_suggest_for_users, users_id

(array([{'CQQ0006', 'CQQ0051', 'CQQ0082', 'CQQ0052', 'CQQ0061', 'CQQ0102', 'CQQ0071', 'CQQ0042', 'CQQ0091', 'CQQ0111', 'CQQ0110', 'CQQ0072', 'CQQ0101', 'CQQ0092', 'CQQ0032', 'CQQ0097', 'CQQ0016', 'CQQ0062', 'CQQ0001', 'CQQ0031', 'CQQ0081', 'CQQ0026', 'CQQ0041'},
        {'CQQ0006', 'CQQ0051', 'CQQ0082', 'CQQ0002', 'CQQ0018', 'CQQ0099', 'CQQ0061', 'CQQ0024', 'CQQ0089', 'CQQ0102', 'CQQ0059', 'CQQ0005', 'CQQ0076', 'CQQ0035', 'CQQ0071', 'CQQ0047', 'CQQ0087', 'CQQ0094', 'CQQ0011', 'CQQ0056', 'CQQ0091', 'CQQ0074', 'CQQ0108', 'CQQ0012', 'CQQ0072', 'CQQ0101', 'CQQ0054', 'CQQ0110', 'CQQ0092', 'CQQ0111', 'CQQ0032', 'CQQ0019', 'CQQ0016', 'CQQ0029', 'CQQ0049', 'CQQ0030', 'CQQ0001', 'CQQ0037', 'CQQ0062', 'CQQ0014', 'CQQ0031', 'CQQ0067', 'CQQ0069', 'CQQ0081', 'CQQ0039', 'CQQ0041', 'CQQ0079', 'CQQ0007', 'CQQ0096', 'CQQ0106'},
        {'CQQ0027', 'CQQ0002', 'CQQ0018', 'CQQ0046', 'CQQ0073', 'CQQ0025', 'CQQ0013', 'CQQ0099', 'CQQ0024', 'CQQ0040', 'CQQ0017', 'CQQ0066', 'CQQ0077', 'CQQ0080', 'CQQ0090', 'CQ