In [1]:
# TODO: fix the calculate_user_performance_score function
# TODO: learn more about cosine_similarity and the Pearson correlation coefficient
# TODO: learn more about k-means

In [2]:
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import k_means

In [3]:
# Load the Codeex quiz performance export and relabel its columns with the
# snake_case names the rest of this notebook relies on.
# NOTE(review): read_csv treats the first line of the file as a header by
# default; if the CSV ships without a header row, that first record is consumed
# as column labels and then overwritten here -- confirm against the file.
performance_df = pd.read_csv("codeex_quiz_performance.csv").set_axis(
    [
        "user_id",
        "full_name",
        "quiz_id",
        "quiz_points",
        "quiz_difficulty",
        "time_limit",
        "total_sub_quizzes",
        "topic_id",
        "topic_label",
        "rewarded_points",
        "completion_time",
        "total_completed_sub_quizzes",
    ],
    axis="columns",
)

In [4]:
def calculate_user_performance_score(
    performance: pd.Series, total_points_possible: float = 100.0
) -> float:
    """
    Score a single quiz attempt (one row of the performance table).

    The score is

        100 * quiz_points * difficulty_rate * time_factor
            * completion_factor * rewarded_points_factor
            / (total_points_possible * total_sub_quizzes)

    where

    * ``difficulty_rate = (quiz_difficulty - 1) / 2``
    * ``time_factor = time_limit / completion_time`` (faster attempts score higher)
    * ``completion_factor = total_completed_sub_quizzes / total_sub_quizzes``
      (more completed sub quizzes score higher)
    * ``rewarded_points_factor = rewarded_points / quiz_points``

    Each factor is applied exactly once. Earlier revisions multiplied by
    ``time_limit / completion_time`` and *also* divided by its reciprocal, which
    squared the speed bonus, and did the same with an inverted completion ratio,
    which made attempts that finished fewer sub quizzes score higher.

    "total_points_possible" is the maximum number of points any quiz can be
    worth, regardless of how many points the user actually earned. Dividing by
    it puts quizzes with different point values on one common scale. The
    default of 100 corresponds to at most 10 sub quizzes worth 10 points each.

    Parameters
    ----------
    performance:
        A row exposing ``quiz_points``, ``quiz_difficulty``, ``time_limit``,
        ``total_sub_quizzes``, ``rewarded_points``, ``completion_time`` and
        ``total_completed_sub_quizzes`` as attributes (e.g. what
        ``DataFrame.apply(..., axis=1)`` passes in).
    total_points_possible:
        Upper bound on a quiz's point value used for normalisation.

    Returns
    -------
    float
        The attempt's score. ``0.0`` is returned when any quantity used as a
        divisor (``quiz_points``, ``total_sub_quizzes``, ``completion_time``,
        ``total_points_possible``) is missing, zero or negative, so a malformed
        row cannot raise ``ZeroDivisionError`` or inject inf/NaN downstream.
    """
    divisors = (
        performance.quiz_points,
        performance.total_sub_quizzes,
        performance.completion_time,
        total_points_possible,
    )
    # `not (value > 0)` is also true for NaN, so missing values are rejected here.
    if any(not (value > 0) for value in divisors):
        return 0.0

    time_factor = performance.time_limit / performance.completion_time
    completion_factor = (
        performance.total_completed_sub_quizzes / performance.total_sub_quizzes
    )
    # NOTE(review): this maps difficulty 1 -> 0.0 and 3 -> 1.0, so every attempt
    # at difficulty 1 scores zero. Presumably difficulty is on a 1-3 scale;
    # confirm whether zeroing the easiest level is intended.
    difficulty_rate = (performance.quiz_difficulty - 1) / 2
    rewarded_points_factor = performance.rewarded_points / performance.quiz_points

    weighted_points = (
        performance.quiz_points
        * difficulty_rate
        * time_factor
        * completion_factor
        * rewarded_points_factor
    )
    # NOTE(review): the division by total_sub_quizzes is kept from the original
    # formula (it expresses the result per sub quiz) -- confirm it is wanted.
    normaliser = total_points_possible * performance.total_sub_quizzes

    return float(weighted_points / normaliser * 100)

In [5]:
# Score every quiz attempt: the scoring function is applied to each row and the
# result is stored in a new "scores" column on the same frame.
performance_df["scores"] = performance_df.apply(
    calculate_user_performance_score,
    axis="columns",
)

In [6]:
# Mean attempt score for every (user, topic) pair. `result` is a Series indexed
# by (user_id, topic_id); `topic_scores` is the same data as a flat frame with
# user_id / topic_id / scores columns.
result = performance_df.groupby(["user_id", "topic_id"])["scores"].mean()
topic_scores = result.reset_index()

In [7]:
# Topic performance matrix: one row per user, one column per topic, each cell
# holding that user's mean score for the topic. Missing cells (including
# user/topic pairs with no attempts) are filled with 0.
topic_performance = (
    topic_scores
    .pivot(index="user_id", columns="topic_id", values="scores")
    .fillna(0)
)

In [8]:
# User-to-user similarity matrix: cosine similarity between every pair of rows
# (users) of the topic performance matrix.
users_similarity = cosine_similarity(topic_performance.to_numpy())

In [9]:
# Group users into two clusters with k-means, using each user's row of the
# similarity matrix as that user's feature vector. The seed is fixed so the
# clustering is reproducible.
# NOTE: this is the functional `k_means` API, so `kmeans` holds the tuple it
# returns rather than a fitted estimator object.
kmeans = k_means(
    users_similarity,
    n_clusters=2,
    n_init="auto",
    random_state=0,
)