In [None]:
import pickle
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Root folder for datasets and artifacts: the `data` directory that sits next
# to the current working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

In [None]:
# Location of the pickled dataset under DATA_DIR.
clean_data_path = DATA_DIR.joinpath('processed', 'csgo_clean.pkl')

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project's own pipeline.
with clean_data_path.open('rb') as pickle_file:
    data = pickle.load(pickle_file)

Checking if the data was properly read.

In [None]:
# Print the `info()` summary to confirm the load worked, then take a working
# copy so the loaded object itself is left untouched by later cells.
data.info()
model_data = data.copy()

-------------------------------

# Splitting the data

The data will be split into 3 sets: training, validation and testing. The training set will be used to train the models, the validation set will be used to compare them, and the testing set will be used to evaluate the final model.

In [None]:
# Target: the `round_winner` column. Features: every other column.
y = model_data.loc[:, 'round_winner'].copy()
X = model_data.drop('round_winner', axis=1).copy()

Creating the test set with 20% of the data.

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split
# reproducible.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Splitting the remaining 80% of the data into the train and validation sets (80% / 20%).

In [None]:
# Split the non-test rows again: 20% of them become the validation set and the
# rest the training set (same fixed seed for reproducibility).
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the train, validation and test feature sets, in that order.
tuple(split.shape for split in (x_train, x_val, x_test))

--------------------------------

# Choosing the model

In [None]:
def calculate_model_quality(model, x_val, y_val):
    """Print the accuracy and F1 score of a fitted model on labelled data.

    Args:
        model: Object exposing ``predict(x_val)``.
        x_val: Features passed to ``model.predict``.
        y_val: True labels the predictions are compared against.

    Returns:
        None. Both metrics are printed with two decimal places.
    """
    predictions = model.predict(x_val)

    # Insertion order fixes the print order: accuracy first, then F1.
    scores = {
        'Accuracy': accuracy_score(y_val, predictions),
        'F1': f1_score(y_val, predictions),
    }
    for metric_name, value in scores.items():
        print(f'{metric_name}: {value:.2f}')
    

### LogisticRegression

In [None]:
# Recorded result from an earlier run: 0.75, but the solver does not converge.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1750).fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

### RandomForestClassifier

In [26]:
# Recorded result from an earlier run: 0.87
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so repeated runs build the same forest.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=30,
    min_samples_split=2,
).fit(x_train, y_train)

calculate_model_quality(model, x_val, y_val)

Accuracy: 0.87
F1: 0.87


### DecisionTreeClassifier

In [None]:
# Recorded result from an earlier run: 0.81
from sklearn import tree

model = tree.DecisionTreeClassifier(max_depth=60, random_state=42).fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

### GradientBoostingClassifier

In [None]:
# Recorded result from an earlier run: 0.85
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=75,
    learning_rate=0.8,
    max_depth=30,
).fit(x_train, y_train)

calculate_model_quality(model, x_val, y_val)

### KNeighborsClassifier

In [None]:
# Recorded result from an earlier run: 0.75
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

### Mini Models!

This model was suggested by the professor, and its idea is to split the data into several parts. Then, going through chunks of three parts, three models are trained and the majority vote is taken as the final prediction.

The cell below is a showcase of the model. Using just 1 split, the data will not be split into chunks, and the result should be the same as the 
[RandomForestClassifier](#randomforestclassifier)

In [25]:
import MiniModels
from sklearn.ensemble import RandomForestClassifier

# Base estimator handed to MiniModels: same settings as the standalone
# RandomForestClassifier cell, so the two scores can be compared directly.
model_used = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=30,
    min_samples_split=2,
)

# A single split is the showcase configuration for the wrapper.
Splitmodel = MiniModels.MiniModels()
Splitmodel.fit(x_train, y_train, model_used, n_splits=1)

calculate_model_quality(Splitmodel, x_val, y_val)

Self.parts:
Part 0 size: (78342, 76), time_left: 175.0 - 0.03
Accuracy: 0.87
F1: 0.87


In [None]:
import MiniModels
from sklearn.ensemble import RandomForestClassifier

# Random forest base estimator for the MiniModels wrapper.
model_used = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=30,
)

Splitmodel = MiniModels.MiniModels()
# The 4th positional argument is presumably `n_splits` (passed by keyword in
# the single-split cell) -- confirm against MiniModels.fit.
Splitmodel.fit(x_train, y_train, model_used, 5)

calculate_model_quality(Splitmodel, x_val, y_val)

In [None]:
import MiniModels
from sklearn import tree

# Decision tree base estimator for the MiniModels wrapper.
model_used = tree.DecisionTreeClassifier(max_depth=20, random_state=42)

Splitmodel = MiniModels.MiniModels()
# The 4th positional argument is presumably `n_splits` -- confirm against
# MiniModels.fit.
Splitmodel.fit(x_train, y_train, model_used, 1)

calculate_model_quality(Splitmodel, x_val, y_val)

In [None]:
import MiniModels
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting base estimator: same settings as the standalone
# GradientBoostingClassifier cell.
model_used = GradientBoostingClassifier(
    random_state=42,
    n_estimators=75,
    learning_rate=0.8,
    max_depth=30,
)

Splitmodel = MiniModels.MiniModels()
# The 4th positional argument is presumably `n_splits` -- confirm against
# MiniModels.fit.
Splitmodel.fit(x_train, y_train, model_used, 5)

calculate_model_quality(Splitmodel, x_val, y_val)

--------------------------------

In [None]:
# Recorded result from an earlier run: 0.87
# Repeats the standalone random forest fit so that `model` is the random
# forest again after the MiniModels experiments.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=30,
    min_samples_split=2,
).fit(x_train, y_train)

calculate_model_quality(model, x_val, y_val)

-----

# Fine tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 4 x 4 x 3 = 48 combinations.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
}

model = RandomForestClassifier(random_state=42)

# 5-fold cross-validation over the train + validation rows; the test set is
# not touched here.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train_val, y_train_val)

# Best parameter combination, then its mean cross-validated score.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

# Certification

Training and evaluating the best model with the test set.

In [None]:
# Fit the tuned random forest on train + validation and score it on the
# held-out test set.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=50,
    min_samples_split=2,
).fit(x_train_val, y_train_val)

calculate_model_quality(model, x_test, y_test)

Evaluating model quality against a DummyClassifier-style baseline that always predicts the most frequent class.

In [None]:
# Distribution of the classes in the train + validation labels
class_counts = y_train_val.value_counts()
print(class_counts)

# The baseline always predicts the label with the highest count
most_frequent_class = class_counts.idxmax()

# One constant prediction per test row
y_predicted = np.full_like(y_test, fill_value=most_frequent_class)

# Accuracy of the constant baseline, printed after a blank separator line
accuracy = accuracy_score(y_test, y_predicted)
print(f'\nAccuracy: {accuracy:.2f}')

Model was certified with 0.88 accuracy and beat the DummyClassifier.

```markdown
👍
```

----

# Preparing the Deployment

Using the elected model, the data will be trained again, but now with the full dataset.

In [None]:
# Final model for deployment. It uses the same hyperparameters as the model
# certified on the test set (n_estimators=300, max_depth=50), so the saved
# model is the one whose quality was measured. Keep these values in sync with
# the certification cell.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=50,
    random_state=42,
    min_samples_split=2,
)

model.fit(X, y) # Train the model with the full dataset

In [None]:
# Save the model
model_path = DATA_DIR / 'models' / 'csgo_model.pkl'

# Create the target directory if needed; otherwise open() raises
# FileNotFoundError when `models` does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, 'wb') as file:
    pickle.dump(model, file)