In [17]:
library(tidyverse)
library(repr)
library(readxl)
library(dplyr)

In [13]:
data_tr <- read_excel(path="data/revised_data.xlsx", sheet="data.tr") #loading training data
data_te <- read_excel(path="data/revised_data.xlsx", sheet="data.te") #loading test data

# Attribute Information:
- `STG` (The degree of study time for goal object materails),
- `SCG` (The degree of repetition number of user for goal object materails)
- `STR` (The degree of study time of user for related objects with goal object)
- `LPR` (The exam performance of user for related objects with goal object)
- `PEG` (The exam performance of user for goal objects)
- `UNS` (The knowledge level of user)

In [14]:
data_tr <- mutate(data_tr, UNS = as_factor(UNS))
data_te <- mutate(data_te, UNS = as_factor(UNS))

In [15]:
head(data_tr)
head(data_te)

STG,SCG,STR,LPR,PEG,UNS
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0.0,0.0,0.0,0.0,0.0,very_low
0.08,0.08,0.1,0.24,0.9,High
0.06,0.06,0.05,0.25,0.33,Low
0.1,0.1,0.15,0.65,0.3,Middle
0.08,0.08,0.08,0.98,0.24,Low
0.09,0.15,0.4,0.1,0.66,Middle


STG,SCG,STR,LPR,PEG,UNS
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0.0,0.1,0.5,0.26,0.05,Very Low
0.05,0.05,0.55,0.6,0.14,Low
0.08,0.18,0.63,0.6,0.85,High
0.2,0.2,0.68,0.67,0.85,High
0.22,0.22,0.9,0.3,0.9,High
0.14,0.14,0.7,0.5,0.3,Low


In [20]:
nrow(data_tr)

In [28]:
# the number of observations in each class and the mean value for each column in each class
obs_count <- data_tr %>% 
             group_by(UNS) %>%
             summarise(n_rows = length(UNS), 
                       avg_STG = mean(STG), 
                       avg_STR = mean(STR), 
                       avg_LPR = mean(LPR), 
                       avg_PEG = mean(PEG))
obs_count

UNS,n_rows,avg_STG,avg_STR,avg_LPR,avg_PEG
<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
very_low,24,0.3057917,0.36625,0.35875,0.09083333
High,63,0.4216508,0.5016667,0.5012698,0.77253968
Low,83,0.3211446,0.4307229,0.4973494,0.23762651
Middle,88,0.3999773,0.5068182,0.3428409,0.54238636


In [32]:
# the means of the predictor variables you plan to use in your analysis
# STG, STR, LPR and PEG to predict UNS
mutated_data <- data_tr %>%
                mutate(exam_performance = (LPR + PEG)/2) %>%
                mutate(study_time = (STG + STR)/2)
head(mutated_data)
mean_tr <- mutated_data %>%
           select(-SCG, -UNS) %>%
           map_df(mean, na.rm  = TRUE)
mean_tr

STG,SCG,STR,LPR,PEG,UNS,exam_performance,study_time
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>
0.0,0.0,0.0,0.0,0.0,very_low,0.0,0.0
0.08,0.08,0.1,0.24,0.9,High,0.57,0.09
0.06,0.06,0.05,0.25,0.33,Low,0.29,0.055
0.1,0.1,0.15,0.65,0.3,Middle,0.475,0.125
0.08,0.08,0.08,0.98,0.24,Low,0.61,0.08
0.09,0.15,0.4,0.1,0.66,Middle,0.38,0.245


STG,STR,LPR,PEG,exam_performance,study_time
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.3711473,0.4680039,0.4327132,0.4585388,0.445626,0.4195756


In [33]:
# count the missing value
sum(is.na(data_tr))