In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

In [2]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

c:\Users\luis\Documents\6\CSRoundPrediction\data


In [3]:
clean_data_path = DATA_DIR / 'processed' / 'csgo_clean.pkl'

In [4]:
with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

Checking if the data was properly read.

In [5]:
model_data = data.copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122410 entries, 0 to 122409
Data columns (total 76 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   time_left                     122410 non-null  float64
 1   ct_score                      122410 non-null  float64
 2   t_score                       122410 non-null  float64
 3   bomb_planted                  122410 non-null  bool   
 4   ct_health                     122410 non-null  float64
 5   t_health                      122410 non-null  float64
 6   ct_armor                      122410 non-null  float64
 7   t_armor                       122410 non-null  float64
 8   ct_money                      122410 non-null  float64
 9   t_money                       122410 non-null  float64
 10  ct_helmets                    122410 non-null  float64
 11  t_helmets                     122410 non-null  float64
 12  ct_defuse_kits                122410 non-nul

-------------------------------

# Splitting the data

The data will be split into be split into 2 sets: training and testing. The training set will be used to train the model and the testing set will be used to evaluate the model.

In [7]:
X = model_data.drop(columns=['round_winner']).copy()
y = model_data['round_winner'].copy()

Creating the test set with 20% of the data.

In [8]:
x_train_val, x_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Creating the train and validation sets with 80% of the data.

In [9]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.2, random_state=42)

In [10]:
x_train.shape, x_val.shape, x_test.shape

((78342, 75), (19586, 75), (24482, 75))

--------------------------------

# Choosing the model

In [10]:
def calculate_model_quality(model,x_val, y_val):
    y_predicted = model.predict(x_val)
    accuracy = accuracy_score(y_val, y_predicted)
    f1 = f1_score(y_val, y_predicted)
    
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1: {f1:.2f}')
    

### LogisticRegression

In [None]:
# 0.75 Mas não converge
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=750
                           )
model.fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

Accuracy: 0.75
F1: 0.75


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### RandomForestClassifier

In [27]:
# 0.87 
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    random_state=42,
    min_samples_split=2,
)

model.fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

Accuracy: 0.87
F1: 0.87


### RandomForestClassifier

In [74]:

from sklearn import tree

model = tree.DecisionTreeClassifier(
    random_state=42,
    max_depth=60,
)

model.fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

Accuracy: 0.81
F1: 0.81


### GradientBoostingClassifier

In [99]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(
    n_estimators=75, 
    learning_rate=0.8, 
    max_depth=30,
    random_state=42
)
model.fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

Accuracy: 0.85
F1: 0.85


### KNeighborsClassifier

In [100]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)
calculate_model_quality(model, x_val, y_val)

Accuracy: 0.75
F1: 0.75


### Mini Models!

This model was sugested by the professor, and it's idea is to split the data into several parts and train a model for each part. The model will be trained with a different part of the data, and the prediction will be a combination of the predictions of each model.

In [13]:
train_complete = pd.concat([x_train, y_train], axis=1)
val_complete = pd.concat([x_val, y_val], axis=1)

In [25]:
# Define the number of parts to split the dataset into
num_parts = 5

# Calculate the size of each part
part_size = len(train_complete) // num_parts

# Sort the dataset by the 'time_left' column
train_complete_sorted = train_complete.sort_values(by='time_left', ascending=False)

# Split the dataset into equal parts
parts = [train_complete_sorted.iloc[i*part_size:(i+1)*part_size] for i in range(num_parts)]

# If there are any remaining rows, add them to the last part
if len(train_complete) % num_parts != 0:
    parts[-1] = pd.concat([parts[-1], train_complete_sorted.iloc[num_parts*part_size:]])

# Display the size of each part
for i, part in enumerate(parts):
    print(f"Part {i} size: {len(part)}, most common class: {part['round_winner'].value_counts().idxmax()}, time range: {part['time_left'].min()} - {part['time_left'].max()}")


Part 0 size: 15668, most common class: 1, time range: 174.91 - 175.0
Part 1 size: 15668, most common class: 1, time range: 107.19 - 174.91
Part 2 size: 15668, most common class: 1, time range: 74.95 - 107.18
Part 3 size: 15668, most common class: 1, time range: 39.45 - 74.95
Part 4 size: 15670, most common class: 0, time range: 0.03 - 39.45


In [12]:
import MiniModels
from sklearn.ensemble import RandomForestClassifier

model = MiniModels.MiniModels()

model.fit(x_train, y_train, RandomForestClassifier, 5)


Part 0 size: 15668, time_left: 175.0 - 174.91
Part 1 size: 15668, time_left: 174.91 - 107.19
Part 2 size: 15668, time_left: 107.18 - 74.95
Part 3 size: 15668, time_left: 74.95 - 39.45
Part 4 size: 15670, time_left: 39.45 - 0.03
