In [1]:
import pathlib
import pickle
import requests

import pandas as pd


# Analysis and Visualization of the Data

## Setting up the directory and loading the data

In [24]:
# Project data folder: the 'data' directory that sits beside the current
# working directory (i.e. <cwd>/../data).
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

/home/fernando/6/machinelearning/machine_learning/CSRoundPrediction/data


In [25]:
# Make sure the data folder exists (creating missing parents); no error is
# raised if it is already there.
DATA_DIR.mkdir(parents=True, exist_ok=True)

In [26]:
# Folder holding the raw input files: the 'raw' subfolder of DATA_DIR.
# Created (together with any missing parents) if it does not exist yet.
raw_data_dir = DATA_DIR.joinpath('raw')
raw_data_dir.mkdir(exist_ok=True, parents=True)
print(raw_data_dir)

/home/fernando/6/machinelearning/machine_learning/CSRoundPrediction/data/raw


In [27]:
# Location of the raw CSV of round snapshots. Built from raw_data_dir (defined
# in the cell above) so the 'raw' folder name is spelled out in one place only
# and the two paths cannot drift apart.
raw_data_file_path = raw_data_dir / 'csgo_round_snapshots.csv'
print(raw_data_file_path)

/home/fernando/6/machinelearning/machine_learning/CSRoundPrediction/data/raw/csgo_round_snapshots.csv


In [28]:
# Load the raw CSV into a DataFrame; all parsing options are left at the
# pandas defaults.
raw_data = pd.read_csv(raw_data_file_path)

## Visualizing the data

In [8]:
# (rows, columns) of the raw table.
raw_data.shape

(122410, 97)

In [9]:
# Preview the first rows to eyeball the column names and typical values.
raw_data.head()

Unnamed: 0,time_left,ct_score,t_score,map,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,...,t_grenade_flashbang,ct_grenade_smokegrenade,t_grenade_smokegrenade,ct_grenade_incendiarygrenade,t_grenade_incendiarygrenade,ct_grenade_molotovgrenade,t_grenade_molotovgrenade,ct_grenade_decoygrenade,t_grenade_decoygrenade,round_winner
0,175.0,0.0,0.0,de_dust2,False,500.0,500.0,0.0,0.0,4000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
1,156.03,0.0,0.0,de_dust2,False,500.0,500.0,400.0,300.0,600.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
2,96.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
3,76.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CT
4,174.97,1.0,0.0,de_dust2,False,500.0,500.0,192.0,0.0,18350.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CT


In [10]:
# Work on an independent deep copy so that later changes to `data` leave
# `raw_data` as loaded.
data = raw_data.copy(deep=True)

In [11]:
# Per-column non-null counts and dtypes of the working copy.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122410 entries, 0 to 122409
Data columns (total 97 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   time_left                     122410 non-null  float64
 1   ct_score                      122410 non-null  float64
 2   t_score                       122410 non-null  float64
 3   map                           122410 non-null  object 
 4   bomb_planted                  122410 non-null  bool   
 5   ct_health                     122410 non-null  float64
 6   t_health                      122410 non-null  float64
 7   ct_armor                      122410 non-null  float64
 8   t_armor                       122410 non-null  float64
 9   ct_money                      122410 non-null  float64
 10  t_money                       122410 non-null  float64
 11  ct_helmets                    122410 non-null  float64
 12  t_helmets                     122410 non-nul

In [13]:
# Number of columns per dtype.
data.dtypes.value_counts()

float64    94
object      2
bool        1
Name: count, dtype: int64

## Selecting the features

In [33]:
# Column groupings that are saved next to the DataFrame at the end of the
# notebook.
#
# continuous_variables: real-valued columns.
continuous_variables = [
    'time_left'
]

# discrete_variables: every remaining column of the dataset.
# 'time_left' is deliberately NOT repeated here: it is already classified as
# continuous above, and listing it in both groups made the two lists overlap.
# The categorical columns ('map', 'round_winner') stay in this list because
# they are discrete-valued; they are additionally singled out below.
discrete_variables = [
    "ct_score",
    "t_score",
    "map",
    "bomb_planted",
    "ct_health",
    "t_health",
    "ct_armor",
    "t_armor",
    "ct_money",
    "t_money",
    "ct_helmets",
    "t_helmets",
    "ct_defuse_kits",
    "ct_players_alive",
    "t_players_alive",
    "ct_weapon_ak47",
    "t_weapon_ak47",
    "ct_weapon_aug",
    "t_weapon_aug",
    "ct_weapon_awp",
    "t_weapon_awp",
    "ct_weapon_bizon",
    "t_weapon_bizon",
    "ct_weapon_cz75auto",
    "t_weapon_cz75auto",
    "ct_weapon_elite",
    "t_weapon_elite",
    "ct_weapon_famas",
    "t_weapon_famas",
    "ct_weapon_g3sg1",
    "t_weapon_g3sg1",
    "ct_weapon_galilar",
    "t_weapon_galilar",
    "ct_weapon_glock",
    "t_weapon_glock",
    "ct_weapon_m249",
    "t_weapon_m249",
    "ct_weapon_m4a1s",
    "t_weapon_m4a1s",
    "ct_weapon_m4a4",
    "t_weapon_m4a4",
    "ct_weapon_mac10",
    "t_weapon_mac10",
    "ct_weapon_mag7",
    "t_weapon_mag7",
    "ct_weapon_mp5sd",
    "t_weapon_mp5sd",
    "ct_weapon_mp7",
    "t_weapon_mp7",
    "ct_weapon_mp9",
    "t_weapon_mp9",
    "ct_weapon_negev",
    "t_weapon_negev",
    "ct_weapon_nova",
    "t_weapon_nova",
    "ct_weapon_p90",
    "t_weapon_p90",
    "ct_weapon_r8revolver",
    "t_weapon_r8revolver",
    "ct_weapon_sawedoff",
    "t_weapon_sawedoff",
    "ct_weapon_scar20",
    "t_weapon_scar20",
    "ct_weapon_sg553",
    "t_weapon_sg553",
    "ct_weapon_ssg08",
    "t_weapon_ssg08",
    "ct_weapon_ump45",
    "t_weapon_ump45",
    "ct_weapon_xm1014",
    "t_weapon_xm1014",
    "ct_weapon_deagle",
    "t_weapon_deagle",
    "ct_weapon_fiveseven",
    "t_weapon_fiveseven",
    "ct_weapon_usps",
    "t_weapon_usps",
    "ct_weapon_p250",
    "t_weapon_p250",
    "ct_weapon_p2000",
    "t_weapon_p2000",
    "ct_weapon_tec9",
    "t_weapon_tec9",
    "ct_grenade_hegrenade",
    "t_grenade_hegrenade",
    "ct_grenade_flashbang",
    "t_grenade_flashbang",
    "ct_grenade_smokegrenade",
    "t_grenade_smokegrenade",
    "ct_grenade_incendiarygrenade",
    "t_grenade_incendiarygrenade",
    "ct_grenade_molotovgrenade",
    "t_grenade_molotovgrenade",
    "ct_grenade_decoygrenade",
    "t_grenade_decoygrenade",
    "round_winner"
]

# categorical_variables: columns converted to the pandas 'category' dtype in
# the next cell.
categorical_variables = [
    'map',
    'round_winner',
]

# Guard: a column must not be classified as both continuous and discrete.
assert not set(continuous_variables) & set(discrete_variables)

In [34]:
# Convert every column named in categorical_variables to the pandas
# 'category' dtype, using one astype call with a column -> dtype mapping.
data = data.astype({name: 'category' for name in categorical_variables})

In [35]:
# Summary of the 'category' columns, one row per column, ordered by the
# 'count' statistic (ascending).
(
    data
    .select_dtypes(include='category')
    .describe()
    .T
    .sort_values('count', ascending=True)
)

Unnamed: 0,count,unique,top,freq
map,122410,8,de_inferno,23811
round_winner,122410,2,T,62406


In [36]:
# Descriptive statistics of the numeric columns, one row per column, ordered
# by the 'count' statistic (ascending).
(
    data
    .select_dtypes(include='number')
    .describe()
    .T
    .sort_values('count', ascending=True)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
time_left,122410.0,97.886922,54.465238,0.01,54.92,94.91,166.9175,175.0
ct_score,122410.0,6.709239,4.790362,0.00,3.00,6.00,10.0000,32.0
t_score,122410.0,6.780435,4.823543,0.00,3.00,6.00,10.0000,33.0
ct_health,122410.0,412.106568,132.293290,0.00,350.00,500.00,500.0000,500.0
t_health,122410.0,402.714500,139.919033,0.00,322.00,500.00,500.0000,600.0
...,...,...,...,...,...,...,...,...
t_grenade_incendiarygrenade,122410.0,0.019819,0.143933,0.00,0.00,0.00,0.0000,3.0
ct_grenade_molotovgrenade,122410.0,0.048011,0.227669,0.00,0.00,0.00,0.0000,3.0
t_grenade_molotovgrenade,122410.0,1.352095,1.663246,0.00,0.00,1.00,2.0000,5.0
ct_grenade_decoygrenade,122410.0,0.027694,0.169531,0.00,0.00,0.00,0.0000,3.0


## Saving the data

In [37]:
# Folder for the processed output: the 'processed' subfolder of DATA_DIR.
# Created (together with any missing parents) if it does not exist yet.
processed_dir = DATA_DIR.joinpath('processed')
processed_dir.mkdir(exist_ok=True, parents=True)

In [38]:
# Destination file for the pickled DataFrame and variable lists.
processed_file_path = processed_dir.joinpath('csgo.pkl')

In [39]:
# Persist the DataFrame together with the three variable-name lists as one
# pickled list. The order is part of the file format: a reader gets back
# [data, continuous_variables, discrete_variables, categorical_variables].
with processed_file_path.open('wb') as out_file:
    pickle.dump(
        [data, continuous_variables, discrete_variables, categorical_variables],
        out_file,
    )