In [1]:
# load libraries
import numpy as np
import pandas as pd
import altair as alt
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

set_config(transform_output="pandas")

In [2]:
# load dfs
players_url = 'https://drive.google.com/uc?id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz&export=download'
players = pd.read_csv(players_url)
players.drop(columns=['individualId','organizationName'],inplace = True)

sessions_url = 'https://drive.google.com/uc?id=14O91N5OlVkvdGxXNJUj5jIsV5RexhzbB&export=download'
sessions = pd.read_csv(sessions_url)

players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21
...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17


In [3]:
players_train, players_test = train_test_split(
    players, train_size=0.80, random_state=100, stratify=players['subscribe']
)

players_preprocessor = make_column_transformer(
    (StandardScaler(), ['played_hours', 'age']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

players_pipe = make_pipeline(players_preprocessor, KNeighborsClassifier())

param_grid = { 'kneighborsclassifier__n_neighbors': range(1,31,1) }

players_search = GridSearchCV(
    estimator=players_pipe,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1
)

players_search.fit(
    players_train[['played_hours','age']],
    players_train['subscribe']
)

cv_results = pd.DataFrame(players_search.cv_results_)
cv_results.sort_values(by='rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
10,0.004325,7.9e-05,0.004954,0.000363,11,{'kneighborsclassifier__n_neighbors': 11},0.75,0.774194,0.741935,0.806452,...,0.762903,0.024778,1,0.774194,0.744,0.8,0.76,0.76,0.767639,0.018792
11,0.004297,8.7e-05,0.004703,5.7e-05,12,{'kneighborsclassifier__n_neighbors': 12},0.75,0.774194,0.709677,0.806452,...,0.756452,0.032419,2,0.774194,0.728,0.776,0.752,0.76,0.758039,0.017471
13,0.004267,6.2e-05,0.004715,5.9e-05,14,{'kneighborsclassifier__n_neighbors': 14},0.75,0.741935,0.709677,0.806452,...,0.756452,0.032419,2,0.774194,0.76,0.736,0.736,0.752,0.751639,0.014614
23,0.004279,7.4e-05,0.004707,3e-05,24,{'kneighborsclassifier__n_neighbors': 24},0.75,0.774194,0.741935,0.774194,...,0.756452,0.014783,2,0.758065,0.768,0.736,0.736,0.744,0.748413,0.012684
7,0.004292,7.6e-05,0.004686,5.7e-05,8,{'kneighborsclassifier__n_neighbors': 8},0.75,0.806452,0.741935,0.806452,...,0.756452,0.047955,5,0.782258,0.784,0.8,0.752,0.808,0.785252,0.019242


In [4]:
cross_val_plot= alt.Chart(cv_results).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors")
        .title('Number of Neighbors (k)'),
    y=alt.Y('mean_test_score')
        .title('Accuracy')
        .scale(zero=False)
)

cross_val_plot

In [5]:
players_spec = KNeighborsClassifier(n_neighbors = 14)
players_fit = players_spec.fit(players_train[['age','played_hours']],players_train['subscribe'])
players_pred = players_fit.predict(players_test[['age','played_hours']])
players_eval = players_test.assign(actual=players_test['subscribe'],predicted=players_pred)
players_conf_mat = pd.crosstab(players_eval['actual'], players_eval['predicted'])
players_conf_mat

predicted,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0,11
True,2,27


In [6]:
players_acc = players_fit.score(
    players_test[['age','played_hours']],
    players_test['subscribe']
)
players_acc

0.675