In [92]:
import pandas as pd

## Preparing Recommendation dataset using DBE-KT22


In [71]:
df = pd.read_csv("./data/Transaction.csv")
kc_df = pd.read_csv("./data/Question_KC_Relationships.csv")

# Bringing in Kc_id
df = df.merge(kc_df, on='question_id', how='left')
# Drop redundant columns
df = df.drop(columns=['id_x', 'id_y', 'selection_change', 'difficulty_feedback', 'answer_text', 'hint_used', 'answer_choice_id', 'is_hidden'])

df.head(5)



Unnamed: 0,start_time,end_time,trust_feedback,answer_state,student_id,question_id,knowledgecomponent_id
0,2019-08-07 17:12:08.722 -0700,2019-08-07 17:12:08.721 -0700,3,True,5,36,14
1,2019-08-10 08:28:12.116 -0700,2019-08-10 08:28:12.116 -0700,1,False,5,37,15
2,2019-08-10 08:33:03.479 -0700,2019-08-10 08:33:03.478 -0700,1,True,5,2,4
3,2019-08-10 08:40:25.411 -0700,2019-08-10 08:40:25.411 -0700,2,True,5,5,6
4,2019-08-10 08:51:39.062 -0700,2019-08-10 08:51:39.062 -0700,2,False,5,3,4


## Pre-processing


In [73]:
# Removing rows where confidence was not provided
df = df[df['trust_feedback'] != 0]

# Replacing True with 1 and False with 0 in answer_state
df['answer_state'] = df['answer_state'].replace({True: 1, False: 0})

# Since same question_id can be associated to more than one knowledgecomponent_id, we need to assign each question to a single knowledgecomponent_id
def replace_with_highest_knowledgecomponent_id(group):
    max_knowledgecomponent_id = group['knowledgecomponent_id'].max()
    group['knowledgecomponent_id'] = max_knowledgecomponent_id
    return group

df = df.groupby('question_id').apply(replace_with_highest_knowledgecomponent_id).reset_index(drop=True)

# Ordering the rows in proper sequence of how student solved the question
df['start_time'] = pd.to_datetime(df['start_time'], utc=True)
df = df.sort_values(by=['student_id', 'knowledgecomponent_id', 'start_time']).reset_index(drop=True)

df = df.drop_duplicates()
print(len(df))
df.head(10)
# temp_df = df[df['student_id'] == 12]
# print(len(temp_df))
# df.head(50)

24436


  df = df.groupby('question_id').apply(replace_with_highest_knowledgecomponent_id).reset_index(drop=True)


Unnamed: 0,start_time,end_time,trust_feedback,answer_state,student_id,question_id,knowledgecomponent_id
0,2019-08-07 15:06:07.153000+00:00,2019-08-07 08:06:07.153 -0700,2,1,1,3,4
1,2019-08-07 15:06:30.604000+00:00,2019-08-07 08:06:30.603 -0700,1,0,1,8,6
2,2019-08-11 12:54:31.136000+00:00,2019-08-11 05:54:39.512 -0700,3,0,1,6,7
3,2019-08-12 13:22:40.228000+00:00,2019-08-12 06:23:02.164 -0700,2,1,1,7,8
4,2019-08-12 13:24:31.121000+00:00,2019-08-12 06:24:58.485 -0700,1,0,1,9,11
5,2019-08-20 13:14:33.050000+00:00,2019-08-20 06:14:50.244 -0700,3,0,1,15,12
7,2021-11-27 07:27:46.544000+00:00,2021-11-26 23:28:09.871 -0800,2,0,1,199,94
10,2019-09-21 15:12:32.826000+00:00,2019-09-21 08:13:06.139 -0700,1,0,2,159,76
12,2019-08-10 15:33:03.479000+00:00,2019-08-10 08:33:03.478 -0700,1,1,5,2,4
13,2019-08-10 15:51:39.062000+00:00,2019-08-10 08:51:39.062 -0700,2,0,5,3,4


In [74]:
# Counting number of unique students, skill, q_id and confidence
unique_count_student = df['student_id'].nunique()
unique_count_kc = df['knowledgecomponent_id'].nunique()
unique_count_question = df['question_id'].nunique()
unique_count_confidence = df['trust_feedback'].nunique()

print(f"Number of unique students: {unique_count_student}")
print(f"Number of unique kcs: {unique_count_kc}")
print(f"Number of unique questions: {unique_count_question}")
print(f"Number of unique confidence: {unique_count_confidence}")


Number of unique students: 897
Number of unique kcs: 83
Number of unique questions: 212
Number of unique confidence: 3


#### Evaluating KMA and KMB for each row

In [75]:
def calculate_kma(confidence_correctness):

    kma_list = []
    
    for i in range(0, len(confidence_correctness)):
        FC = PC = FI = 0
        temp_list = confidence_correctness[:i+1]

        # Calculate counts for each question up to n
        for idx, c_c in enumerate(temp_list):
            if c_c[1] == 1:
                ans = 1
            elif c_c[1] == 0:
                ans = 0
            else:
                ans = 0.5 
            # ans = c_c[1]
            conf = c_c[0]
            
            if conf == 2:
                conf = 'P'
            elif conf == 3:
                conf = 'C'
            else:
                conf = 'I'
                
            if ans == 1:
                if conf == 'C':
                    FC+=1
                elif conf == 'P':
                    PC+=1
                else:
                    FI+=1
            elif ans == 0.5:
                if conf == 'C':
                    PC+=1
                elif conf == 'P':
                    FC+=1
                else:
                    PC+=1
            else:
                if conf == 'C':
                    FI+=1
                elif conf == 'P':
                    PC+=1
                else:
                    FC+=1
                    
        kma = round((FC - 0.5 * PC - FI) / (FC+PC+FI), 2)
        kma_list.append(kma)

    return kma_list

In [76]:
def calculate_kmb(confidence_correctness):
    kmb_list = []
    
    for i in range(0, len(confidence_correctness)):
        NB = PPB = FPB = POB = FOB = 0
        temp_list = confidence_correctness[:i+1]

        # Calculate counts for each question up to n
        for c_c in temp_list:
            if c_c[1] == 1:
                ans = 1
            elif c_c[1] == 0:
                ans = 0
            else:
                ans = 0.5 
            # ans = idx[1]
            conf = c_c[0]
            
            if conf == 2:
                conf = 'P'
            elif conf == 3:
                conf = 'C'
            else:
                conf = 'I'
                
                
            if ans == 1:
                if conf == 'C':
                    NB+=1
                elif conf == 'P':
                    PPB+=1
                else:
                    FPB+=1
            elif ans == 0.5:
                if conf == 'C':
                    POB+=1
                elif conf == 'P':
                    NB+=1
                else:
                    PPB+=1
            else:
                if conf == 'C':
                    FOB+=1
                elif conf == 'P':
                    POB+=1
                else:
                    NB+=1
                
        # Calculate KMB
        kmb = round((FOB + 0.5 * (POB - PPB) - FPB) / (FOB + POB + NB + PPB + FPB), 2)
        kmb_list.append(kmb)
    return kmb_list

In [77]:
grouped = df.groupby(['student_id', 'knowledgecomponent_id'])

# Create a list of DataFrames for each group
dfs = [group for _, group in grouped]

largest_df = None
largest_size = 0
largest_df_index = -1

for i, df in enumerate(dfs):
    current_size = len(df)
    if current_size > largest_size:
        largest_size = current_size
        largest_df = df
        largest_df_index = i

print(f"The largest DataFrame is at index {largest_df_index} and has {largest_size} rows.")

The largest DataFrame is at index 10290 and has 18 rows.


In [82]:
# Iterating over each df from dfs to calculate KMA and KMB

processed_dfs = []
step = 0
for df_i in dfs:
    step += 1
    if step % 50 == 0:
        print(f"Processed {step} dataframes")
    confidence_correctness = list(zip(df_i['trust_feedback'], df_i['answer_state']))
    kma = calculate_kma(confidence_correctness)
    kmb = calculate_kmb(confidence_correctness)
    df_i['kma'] = kma
    df_i['kmb'] = kmb
    # Extracting the last row because I only need the final KMA and KMB for questions answered by the same student for the same skill
    last_row = df_i.tail(1)
    processed_dfs.append(last_row)


Processed 50 dataframes
Processed 100 dataframes
Processed 150 dataframes
Processed 200 dataframes
Processed 250 dataframes
Processed 300 dataframes
Processed 350 dataframes
Processed 400 dataframes
Processed 450 dataframes
Processed 500 dataframes
Processed 550 dataframes
Processed 600 dataframes
Processed 650 dataframes
Processed 700 dataframes
Processed 750 dataframes
Processed 800 dataframes
Processed 850 dataframes
Processed 900 dataframes
Processed 950 dataframes
Processed 1000 dataframes
Processed 1050 dataframes
Processed 1100 dataframes
Processed 1150 dataframes
Processed 1200 dataframes
Processed 1250 dataframes
Processed 1300 dataframes
Processed 1350 dataframes
Processed 1400 dataframes
Processed 1450 dataframes
Processed 1500 dataframes
Processed 1550 dataframes
Processed 1600 dataframes
Processed 1650 dataframes
Processed 1700 dataframes
Processed 1750 dataframes
Processed 1800 dataframes
Processed 1850 dataframes
Processed 1900 dataframes
Processed 1950 dataframes
Proces

In [83]:
df_kma_kmb = pd.concat(processed_dfs, ignore_index=True)
df_kma_kmb.head(10)

Unnamed: 0,start_time,end_time,trust_feedback,answer_state,student_id,question_id,knowledgecomponent_id,kma,kmb
0,2019-08-07 15:06:07.153000+00:00,2019-08-07 08:06:07.153 -0700,2,1,1,3,4,-0.5,-0.5
1,2019-08-07 15:06:30.604000+00:00,2019-08-07 08:06:30.603 -0700,1,0,1,8,6,1.0,0.0
2,2019-08-11 12:54:31.136000+00:00,2019-08-11 05:54:39.512 -0700,3,0,1,6,7,-1.0,1.0
3,2019-08-12 13:22:40.228000+00:00,2019-08-12 06:23:02.164 -0700,2,1,1,7,8,-0.5,-0.5
4,2019-08-12 13:24:31.121000+00:00,2019-08-12 06:24:58.485 -0700,1,0,1,9,11,1.0,0.0
5,2019-08-20 13:14:33.050000+00:00,2019-08-20 06:14:50.244 -0700,3,0,1,15,12,-1.0,1.0
6,2021-11-27 07:27:46.544000+00:00,2021-11-26 23:28:09.871 -0800,2,0,1,199,94,-0.5,0.5
7,2019-09-21 15:12:32.826000+00:00,2019-09-21 08:13:06.139 -0700,1,0,2,159,76,1.0,0.0
8,2019-08-10 15:51:39.062000+00:00,2019-08-10 08:51:39.062 -0700,2,0,5,3,4,-0.75,-0.25
9,2019-08-10 15:40:25.411000+00:00,2019-08-10 08:40:25.411 -0700,2,1,5,5,6,-0.5,-0.5


### Calculate rating from KMA and KMB

In [86]:
def calculate_rating(row):
    """ 
    This function transoforms the KMA and KMB value into rating, in order to make the dataset suitable for collaborative filtering
    """
    awareness_norm = (row['kma'] + 1) / 2
    outlook_norm = (row['kmb'] + 1) / 2
    rating = 1 + 4 * ((awareness_norm + outlook_norm) / 2)
    return round(rating, 2)


In [87]:
df_kma_kmb['rating'] = df_kma_kmb.apply(calculate_rating, axis=1)
df_kma_kmb.head(10)

Unnamed: 0,start_time,end_time,trust_feedback,answer_state,student_id,question_id,knowledgecomponent_id,kma,kmb,rating
0,2019-08-07 15:06:07.153000+00:00,2019-08-07 08:06:07.153 -0700,2,1,1,3,4,-0.5,-0.5,2.0
1,2019-08-07 15:06:30.604000+00:00,2019-08-07 08:06:30.603 -0700,1,0,1,8,6,1.0,0.0,4.0
2,2019-08-11 12:54:31.136000+00:00,2019-08-11 05:54:39.512 -0700,3,0,1,6,7,-1.0,1.0,3.0
3,2019-08-12 13:22:40.228000+00:00,2019-08-12 06:23:02.164 -0700,2,1,1,7,8,-0.5,-0.5,2.0
4,2019-08-12 13:24:31.121000+00:00,2019-08-12 06:24:58.485 -0700,1,0,1,9,11,1.0,0.0,4.0
5,2019-08-20 13:14:33.050000+00:00,2019-08-20 06:14:50.244 -0700,3,0,1,15,12,-1.0,1.0,3.0
6,2021-11-27 07:27:46.544000+00:00,2021-11-26 23:28:09.871 -0800,2,0,1,199,94,-0.5,0.5,3.0
7,2019-09-21 15:12:32.826000+00:00,2019-09-21 08:13:06.139 -0700,1,0,2,159,76,1.0,0.0,4.0
8,2019-08-10 15:51:39.062000+00:00,2019-08-10 08:51:39.062 -0700,2,0,5,3,4,-0.75,-0.25,2.0
9,2019-08-10 15:40:25.411000+00:00,2019-08-10 08:40:25.411 -0700,2,1,5,5,6,-0.5,-0.5,2.0


In [88]:
final_df = df_kma_kmb.drop(columns=['start_time', 'end_time', 'trust_feedback', 'answer_state', 'question_id', 'kma', 'kmb'])
final_df.head(10)

Unnamed: 0,student_id,knowledgecomponent_id,rating
0,1,4,2.0
1,1,6,4.0
2,1,7,3.0
3,1,8,2.0
4,1,11,4.0
5,1,12,3.0
6,1,94,3.0
7,2,76,4.0
8,5,4,2.0
9,5,6,2.0


In [89]:
final_df.to_csv('./data/DBE-KT22-RecDataset.csv', index=False)

In [91]:
# Counting number of unique students, skill, q_id and confidence
unique_count_student = final_df['student_id'].nunique()
unique_count_kc = final_df['knowledgecomponent_id'].nunique()

print(f"Number of unique students: {unique_count_student}")
print(f"Number of unique kcs: {unique_count_kc}")



Number of unique students: 897
Number of unique kcs: 83
