# Project Final Report (Group)


## Introduction

- provide some relevant background information on the topic so that someone unfamiliar with it will be prepared to understand the rest of your report
- clearly state the question you tried to answer with your project
- identify and fully describe the dataset that was used to answer the question

## Methods & Results

- describe the methods you used to perform your analysis from beginning to end, narrating the analysis code as you go.
- your report should include code which:
    - loads data 
    - wrangles and cleans the data to the format necessary for the planned analysis
    - performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 
    - creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis
    - performs the data analysis
    - creates a visualization of the analysis 
note: all figures should have a figure number and a legend


Can the player type, age, and played hours of players predict whether they will subscribe to a game-related newsletter, and which player type is the most predictive?

In [None]:
#Loading required libraries
library(repr)
library(tidyverse)
library(tidymodels)

In [None]:
# Load the raw player records from disk and preview the first five rows
player <- read_csv("data/players.csv")
player |> slice_head(n = 5)

In [None]:
# Drop the columns that are not used in the analysis and re-type the
# categorical columns: experience and subscribe become factors, and the
# subscribe levels are relabelled from "TRUE"/"FALSE" to "Yes"/"No".
tidy_player <- player |>
    select(-c(hashedEmail, gender, name)) |>
    mutate(
        experience = as_factor(experience),
        subscribe = recode(as_factor(subscribe), "TRUE" = "Yes", "FALSE" = "No")
    )
tidy_player |> slice_head(n = 5)

In [None]:
# General visualization
# Plot size used for the two exploratory figures below.
options(repr.plot.width = 10, repr.plot.height = 8)

# Scatter plot of hours played against age, coloured by subscription status.
# Both axes use a log10 scale.
# NOTE(review): a played_hours value of 0 maps to -Inf on a log10 axis --
# confirm how such rows should be displayed.
tidy_player_age_plot <- tidy_player |>
    ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
    geom_point(alpha = 0.9) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x = "Age (years)", y = "Hours Played (hours)", color = "Subscribed?") +
    ggtitle("Hours Played vs. Age Relationship") +
    theme(text = element_text(size = 18))

# Horizontal bar chart: one bar per experience level, filled by subscription
# status. experience is mapped to y, so the player count is drawn on the
# x-axis; the axis labels are assigned accordingly.
tidy_players_experience_plot <- tidy_player |>
    ggplot(aes(y = experience, fill = subscribe)) +
    geom_bar(stat = "count") +
    labs(x = "Number of Players", y = "Player Type", fill = "Subscribed?") +
    ggtitle("Distribution of Players Across Experience and Subscription") +
    theme(text = element_text(size = 18))

tidy_player_age_plot
tidy_players_experience_plot

In [None]:
# K-nearest-neighbours classifier whose neighbour count is left as a
# tuning placeholder; every neighbour gets equal weight ("rectangular").
set.seed(4923)
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
    set_mode("classification") |>
    set_engine("kknn")

In [None]:
# For each experience level: keep only that level's players, then make a
# 75/25 train/test split stratified on subscribe. The levels are processed
# in a fixed order after a single seed so the splits are reproducible.
set.seed(4923)

# Beginner
beginner_players <- filter(tidy_player, experience == "Beginner")
beginner_split <- initial_split(beginner_players, strata = subscribe, prop = 0.75)
beginner_train <- training(beginner_split)
beginner_test <- testing(beginner_split)

# Regular
regular_players <- filter(tidy_player, experience == "Regular")
regular_split <- initial_split(regular_players, strata = subscribe, prop = 0.75)
regular_train <- training(regular_split)
regular_test <- testing(regular_split)

# Amateur
amateur_players <- filter(tidy_player, experience == "Amateur")
amateur_split <- initial_split(amateur_players, strata = subscribe, prop = 0.75)
amateur_train <- training(amateur_split)
amateur_test <- testing(amateur_split)

# Veteran
veteran_players <- filter(tidy_player, experience == "Veteran")
veteran_split <- initial_split(veteran_players, strata = subscribe, prop = 0.75)
veteran_train <- training(veteran_split)
veteran_test <- testing(veteran_split)

# Pro
pro_players <- filter(tidy_player, experience == "Pro")
pro_split <- initial_split(pro_players, strata = subscribe, prop = 0.75)
pro_train <- training(pro_split)
pro_test <- testing(pro_split)

In [None]:
# Build 5-fold cross-validation resamples from each training set,
# stratified on subscribe. One seed, fixed order, for reproducibility.
set.seed(4923)
beginner_vfold <- vfold_cv(beginner_train, strata = subscribe, v = 5)
regular_vfold <- vfold_cv(regular_train, strata = subscribe, v = 5)
amateur_vfold <- vfold_cv(amateur_train, strata = subscribe, v = 5)
veteran_vfold <- vfold_cv(veteran_train, strata = subscribe, v = 5)
pro_vfold <- vfold_cv(pro_train, strata = subscribe, v = 5)

In [None]:
# Tune the neighbour count separately for each experience level.
# Each level gets: a grid of candidate k values, a recipe that scales and
# then centres both predictors, and a cross-validated grid search using the
# shared knn_spec. The grid's upper bound differs per level.
set.seed(4923)

# Beginner: k = 1..10
beginner_k_vals <- tibble(neighbors = seq(1, 10, by = 1))
beginner_recipe <- recipe(subscribe ~ played_hours + Age, data = beginner_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
beginner_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(beginner_recipe) |>
  tune_grid(grid = beginner_k_vals, resamples = beginner_vfold)

# Regular: k = 1..10
regular_k_vals <- tibble(neighbors = seq(1, 10, by = 1))
regular_recipe <- recipe(subscribe ~ played_hours + Age, data = regular_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
regular_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(regular_recipe) |>
  tune_grid(grid = regular_k_vals, resamples = regular_vfold)

# Amateur: k = 1..20
amateur_k_vals <- tibble(neighbors = seq(1, 20, by = 1))
amateur_recipe <- recipe(subscribe ~ played_hours + Age, data = amateur_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
amateur_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(amateur_recipe) |>
  tune_grid(grid = amateur_k_vals, resamples = amateur_vfold)

# Veteran: k = 1..22
veteran_k_vals <- tibble(neighbors = seq(1, 22, by = 1))
veteran_recipe <- recipe(subscribe ~ played_hours + Age, data = veteran_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
veteran_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(veteran_recipe) |>
  tune_grid(grid = veteran_k_vals, resamples = veteran_vfold)

# Pro: k = 1..6
pro_k_vals <- tibble(neighbors = seq(1, 6, by = 1))
pro_recipe <- recipe(subscribe ~ played_hours + Age, data = pro_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
pro_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(pro_recipe) |>
  tune_grid(grid = pro_k_vals, resamples = pro_vfold)

In [None]:
# For each experience level, pull the cross-validated accuracy rows out of
# the tuning results and build a neighbours-vs-accuracy line plot that is
# used to pick k. The plots are stored here, not printed.
set.seed(4923)

# Beginner
beginner_accuracies <- collect_metrics(beginner_fit) |>
    filter(.metric == "accuracy")
beginner_cross_val_plot <- beginner_accuracies |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(title = "Beginner Accuracies", x = "Neighbors", y = "Accuracy Estimate") +
    theme(text = element_text(size = 12))

# Regular
regular_accuracies <- collect_metrics(regular_fit) |>
    filter(.metric == "accuracy")
regular_cross_val_plot <- regular_accuracies |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(title = "Regular Accuracies", x = "Neighbors", y = "Accuracy Estimate") +
    theme(text = element_text(size = 12))

# Amateur
amateur_accuracies <- collect_metrics(amateur_fit) |>
    filter(.metric == "accuracy")
amateur_cross_val_plot <- amateur_accuracies |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(title = "Amateur Accuracies", x = "Neighbors", y = "Accuracy Estimate") +
    theme(text = element_text(size = 12))

# Veteran
veteran_accuracies <- collect_metrics(veteran_fit) |>
    filter(.metric == "accuracy")
veteran_cross_val_plot <- veteran_accuracies |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(title = "Veteran Accuracies", x = "Neighbors", y = "Accuracy Estimate") +
    theme(text = element_text(size = 12))

# Pro
pro_accuracies <- collect_metrics(pro_fit) |>
    filter(.metric == "accuracy")
pro_cross_val_plot <- pro_accuracies |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(title = "Pro Accuracies", x = "Neighbors", y = "Accuracy Estimate") +
    theme(text = element_text(size = 12))


In [None]:
# Retrain one final model per experience level using the chosen k.
#
# k was selected by cross-validation on predictors that the recipe had
# scaled and centred, so the final model must see the same preprocessing.
# Each final fit therefore goes through a workflow that combines the
# level's recipe with the fixed-k model spec, instead of fitting the bare
# formula on the raw (unstandardised) predictors. The resulting fitted
# workflows are used with predict() in the same way as before.
set.seed(4923)

# Beginner: k = 3
beginner_mnist_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) |>
    set_engine("kknn") |>
    set_mode("classification")
beginner_mnist_fit <- workflow() |>
    add_recipe(beginner_recipe) |>
    add_model(beginner_mnist_spec) |>
    fit(data = beginner_train)

# Regular: k = 6
regular_mnist_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 6) |>
    set_engine("kknn") |>
    set_mode("classification")
regular_mnist_fit <- workflow() |>
    add_recipe(regular_recipe) |>
    add_model(regular_mnist_spec) |>
    fit(data = regular_train)

# Amateur: k = 18
amateur_mnist_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 18) |>
    set_engine("kknn") |>
    set_mode("classification")
amateur_mnist_fit <- workflow() |>
    add_recipe(amateur_recipe) |>
    add_model(amateur_mnist_spec) |>
    fit(data = amateur_train)

# Veteran: k = 20
veteran_mnist_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 20) |>
    set_engine("kknn") |>
    set_mode("classification")
veteran_mnist_fit <- workflow() |>
    add_recipe(veteran_recipe) |>
    add_model(veteran_mnist_spec) |>
    fit(data = veteran_train)

# Pro: k = 3
pro_mnist_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) |>
    set_engine("kknn") |>
    set_mode("classification")
pro_mnist_fit <- workflow() |>
    add_recipe(pro_recipe) |>
    add_model(pro_mnist_spec) |>
    fit(data = pro_train)

In [None]:
# Score the beginner, regular and amateur models on their held-out test
# sets: attach the predicted class to the test rows, then keep only the
# accuracy row of the resulting metrics.
set.seed(4923)

# Beginner
beginner_mnist_predictions <- bind_cols(
    predict(beginner_mnist_fit, beginner_test),
    beginner_test
)
beginner_mnist_metrics <- filter(
    metrics(beginner_mnist_predictions, truth = subscribe, estimate = .pred_class),
    .metric == "accuracy"
)

# Regular
regular_mnist_predictions <- bind_cols(
    predict(regular_mnist_fit, regular_test),
    regular_test
)
regular_mnist_metrics <- filter(
    metrics(regular_mnist_predictions, truth = subscribe, estimate = .pred_class),
    .metric == "accuracy"
)

# Amateur
amateur_mnist_predictions <- bind_cols(
    predict(amateur_mnist_fit, amateur_test),
    amateur_test
)
amateur_mnist_metrics <- filter(
    metrics(amateur_mnist_predictions, truth = subscribe, estimate = .pred_class),
    .metric == "accuracy"
)

In [None]:
# Beginner: show the test-set accuracy of the beginner model
set.seed(4923)
options(repr.plot.height = 5, repr.plot.width = 5)

beginner_mnist_metrics

In [None]:
# Regular: show the test-set accuracy of the regular model
set.seed(4923)
options(repr.plot.height = 5, repr.plot.width = 5)

regular_mnist_metrics

In [None]:
# Amateur: show the test-set accuracy of the amateur model
set.seed(4923)
options(repr.plot.height = 5, repr.plot.width = 5)

amateur_mnist_metrics

In [None]:
#Veteran
options(repr.plot.width = 5, repr.plot.height = 5)
set.seed(4923)

# Score the veteran model on its held-out test set. These objects are built
# here so that the metrics table printed below exists: attach the predicted
# class to the test rows, then keep only the accuracy row.
veteran_mnist_predictions <- predict(veteran_mnist_fit, veteran_test) |> bind_cols(veteran_test)

veteran_mnist_metrics <- veteran_mnist_predictions |>
    metrics(truth = subscribe, estimate = .pred_class) |>
    filter(.metric == "accuracy")

veteran_mnist_metrics

In [None]:
# Pro: score the pro model on its held-out test set and show the accuracy
set.seed(4923)
options(repr.plot.height = 5, repr.plot.width = 5)

# Attach the predicted class to the test rows
pro_mnist_predictions <- bind_cols(
    predict(pro_mnist_fit, pro_test),
    pro_test
)

# Keep only the accuracy row of the metrics table
pro_mnist_metrics <- filter(
    metrics(pro_mnist_predictions, truth = subscribe, estimate = .pred_class),
    .metric == "accuracy"
)

pro_mnist_metrics

## Discussion

- summarize what you found
- discuss whether this is what you expected to find?
- discuss what impact could such findings have?
- discuss what future questions could this lead to?

## References
- You may include references if necessary, as long as they all have a consistent citation style.