To begin with, the necessary packages are imported

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn


%matplotlib inline

ModuleNotFoundError: No module named 'sklearn'

Next, the data is read into a pandas dataframe

In [None]:
# Load the player statistics export, report its dimensions and preview ten random rows
csv_path = "../data/player_stats-27-01-2023.csv"
df = pd.read_csv(csv_path)
rows_and_columns = df.shape
print(f"Data Shape is {rows_and_columns}")
df.sample(10)

: 

Next, I will check the data for any nulls

In [None]:
# Count missing values per column, then add the per-column counts together
null_total = df.isna().sum().sum()
print(f"There are {null_total} nulls in the dataset")

: 

Next I will look into the columns to understand the datatypes

In [None]:
# Print each column's dtype and non-null count (info() prints directly and returns None)
df.info()

: 

After establishing that `'snd_first_blood_rate'` is of the 'object' datatype (which pandas typically uses to hold strings),
I have learnt that I need to remove the % symbol from the end of each value before it can be converted to a floating point number


In [None]:
# Convert the percentage text (e.g. "12.5%") into a float column.
# regex=False makes '%' a plain literal, so no pattern is compiled for a single character.
rate_as_text = df['snd_first_blood_rate'].astype('string')
df['snd_first_blood_rate'] = rate_as_text.str.replace('%', '', regex=False).astype("float")

# .iloc picks the second row by position, so the example does not depend on the index labels
print(f"An example of a value in the columns snd_first_blood_rate is {df['snd_first_blood_rate'].iloc[1]}")
print(f"The datatype of the column is now '{df['snd_first_blood_rate'].dtype}'")

: 

In [None]:
# List every column name available in the dataframe
column_names = df.columns.values
print(f"The datsets columns are {column_names}")

: 

It is useful to remove players who have a small number of maps played as it isn't a strong representation
of long term performance

In [None]:
# Number of players at each hardpoint maps-played value, smallest value first
hp_map_counts = df['hp_maps_played'].value_counts()
hp_map_counts.sort_index()

: 

After looking at the data for maps played in hardpoint, I have decided to remove anyone with fewer than 10 hardpoint maps played.
This is being done under the assumption that the remaining players will still have a decent number of maps played in the other gamemodes

In [None]:
# Keep players with at least 10 hardpoint maps and renumber the rows from 0
enough_hp_maps = df['hp_maps_played'] >= 10
df = df[enough_hp_maps].reset_index(drop=True)

# Re-check the distribution after filtering
df['hp_maps_played'].value_counts().sort_index()

: 

In [None]:
# Number of players at each SnD maps-played value, smallest value first
snd_map_counts = df['snd_maps_played'].value_counts()
snd_map_counts.sort_index()

: 

In [None]:
# Number of players at each control maps-played value, smallest value first
control_map_counts = df['control_maps_played'].value_counts()
control_map_counts.sort_index()

: 

After removing players with <10 hardpoint maps played, the minimum number of maps played in one gamemode is 6 (control)

In [None]:
# Display the rows for players with fewer than 7 control maps played
few_control_maps = df['control_maps_played'] < 7
df[few_control_maps]

: 

These appear to be players who have recently been added to the LAG roster after being promoted from the academy team (recent as of 27/01/2023).
I suspect they have not actually played 6 control maps, as they have not yet played 6 series in stage 2.
This inaccuracy may be a result of Challengers stats being counted. To increase the depth of the data, however, they will still be included.

To determine whether my scaled data is accurate in showing that a player is high in both categories, I need to look at the raw data

In [None]:
# Express each stat as a fraction of its column mean, then average the two fractions
kills_vs_mean = df['hp_kills_per_10'] / df['hp_kills_per_10'].mean()
hill_time_vs_mean = df['avg_hp_hill_time'] / df['avg_hp_hill_time'].mean()
df['scaled'] = (kills_vs_mean + hill_time_vs_mean) / 2

# Show the raw stats next to the scaled score, sorted by score and colour graded
impact_columns = ['player', 'hp_kills_per_10', 'avg_hp_hill_time', 'scaled']
impact_table = df[impact_columns].sort_values(by='scaled')
impact_table.style.background_gradient(cmap='coolwarm')

: 

Looking at this, it is clear that the scaled data is successfully determining which players have a high average between both statlines.
Some outcomes include:
- Accuracy has low kills per ten minutes but makes up for it in hill time
- JoeDeceives has the highest 'impact' in terms of having high hill time and kills per ten minutes
- MajorManiak has the lowest, with low kills per ten minutes and lower hill time


In [None]:
#Create function to plot scatter graphs comparing two numerical columns
#Each point is coloured by the average of its two values relative to their column means
#This leads to higher values being red and lower values being blue
#Colours don't necessarily correlate to good or bad
#This is dependent on the data in the inputted columns

def scatter_comparison(axis: "plt.Axes", title: str, xcol: "pd.Series", ycol: "pd.Series",
    xlabel: str, ylabel: str, show_min_max: bool = True,
    player_col: "pd.Series | None" = None, show_all_players: bool = False) -> None:
    """
    Desc: This function plots two columns from the data into
          a scatter plot on the axis provided
    Params:
        axis: the axis to draw the plot on
        title: The title of the plot
        xcol: the values to use as the x values in the plot
        ycol: the values to use for y values (same length and order as xcol)
        xlabel: the label for the x axis
        ylabel: the label for the y axis
        show_min_max: whether or not to plot the player names of the min and max players (from scaled data)
        player_col: column containing player names, in the same order as xcol/ycol.
                    When None, no names are drawn and the title is shown without MIN/MAX
        show_all_players: whether or not to plot every player's name (faded) next to their point
    Return: None
    """
    #Work on positionally indexed copies so that argmin/argmax positions can be used
    #directly, whatever index labels the caller's data carries (e.g. a filtered dataframe)
    x_vals = pd.Series(xcol).reset_index(drop=True)
    y_vals = pd.Series(ycol).reset_index(drop=True)

    #Scale the data
    #The scale is based on what percentage of the mean value a value makes up
    #This is then averaged between the two datapoints
    scaled_weight = ((x_vals / x_vals.mean()) + (y_vals / y_vals.mean())) / 2

    axis.scatter(x=x_vals, y=y_vals, c=scaled_weight, cmap='coolwarm')
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)

    #Without player names there is nothing to annotate
    if player_col is None:
        axis.set_title(title)
        return

    names = pd.Series(player_col).reset_index(drop=True)
    min_pos = int(scaled_weight.argmin())
    max_pos = int(scaled_weight.argmax())
    axis.set_title(f"{title}\nMIN: {names.iloc[min_pos]} | MAX: {names.iloc[max_pos]}")

    if show_all_players:
        #Faded labels so the highlighted min/max names stay readable
        for name, x_val, y_val in zip(names, x_vals, y_vals):
            axis.text(x=x_val, y=y_val, s=name, alpha=0.4)

    if show_min_max:
        #Full-opacity labels for the lowest and highest scaled players
        for pos in (min_pos, max_pos):
            axis.text(s=names.iloc[pos], x=x_vals.iloc[pos], y=y_vals.iloc[pos])

# 2x2 grid of hardpoint comparisons
fig = plt.figure(figsize=(14, 14))
plt.suptitle("CDL Hardpoint (Players) 2023")
((ax1, ax2), (ax3, ax4)) = fig.subplots(ncols=2, nrows=2, sharey=False)

#Some example plots to test the func
#Each entry: (axis, title, x column, y column, x label, y label)
hardpoint_plots = [
    (ax1, "HP Kills per 10 Minutes vs Average HP Hill Time (Impact)",
     'hp_kills_per_10', 'avg_hp_hill_time', "HP Kills per 10 Minutes", "Average Hill Time"),
    (ax2, "HP Damage per 10 Minutes vs Average HP Hill Time (Impact)",
     'hp_dmg_per_10', 'avg_hp_hill_time', "HP Damage per 10 Minutes", "Average Hill Time"),
    (ax3, "HP Damage per 10 Minutes vs HP KD (Consistency)",
     'hp_dmg_per_10', 'hp_kd', "HP Damage per 10 Minutes", "HP KD"),
    (ax4, "HP Kills per 10 Minutes vs HP KD",
     'hp_kills_per_10', 'hp_kd', "HP Kills per 10 Minutes", "HP KD"),
]

for target_ax, plot_title, x_name, y_name, x_text, y_text in hardpoint_plots:
    scatter_comparison(target_ax, plot_title, df[x_name], df[y_name],
                    x_text, y_text, False, df['player'], True)

: 

After looking at some examples of hardpoint, I have decided to look at some SND examples.
Data I've decided to plot includes:
- SND first blood rate vs SND KD
- SND kill per round vs SND KD (Should positively correlate)
- SND KD vs Average Respawn KD (Should highlight players who excel in certain gamemodes)

In [None]:
#Three side-by-side axes for the SnD comparisons
fig = plt.figure(figsize=(18, 7))
(ax1, ax2, ax3) = fig.subplots(ncols=3)
fig.suptitle("CDL SnD (Players) 2023")

player_names = df['player']
snd_kd = df['snd_kd_rate']  #the column is named snd_kd_rate but holds the SnD KD ratio

scatter_comparison(ax1, title="First blood % vs KD Ratio",
                   xcol=df['snd_first_blood_rate'], ycol=snd_kd,
                   xlabel="First Blood %", ylabel="KD Ratio",
                   show_min_max=True, player_col=player_names, show_all_players=True)

scatter_comparison(ax2, title="Average Kills per Round vs KD Ratio",
                   xcol=df['snd_kill_per_round'], ycol=snd_kd,
                   xlabel="Average Kills per Round", ylabel="KD Ratio",
                   show_min_max=True, player_col=player_names, show_all_players=True)

#Respawn KD is the mean of the control and hardpoint KDs
respawn_average_kd = (df['control_kd'] + df['hp_kd']) / 2
scatter_comparison(ax3, title="SnD KD vs Average Respawn KD",
                   xcol=snd_kd, ycol=respawn_average_kd,
                   xlabel="SnD KD", ylabel="Average Respawn KD",
                   show_min_max=True, player_col=player_names, show_all_players=True)

: 

Before adding a line for the league average onto the plots, I must first decide whether to use the mean or the median.
To do this, I will plot the distributions for some columns in the df

In [None]:
from sklearn.preprocessing import normalize

#Normalize each stat (passed as a single-row 2D array) so that all three
#can be drawn on one graph and their distributions compared
av_kd_norm, snd_kd_norm, snd_kill_per_round_norm = (
    normalize(np.array([stat_values]))
    for stat_values in (respawn_average_kd, df['snd_kd_rate'], df['snd_kill_per_round'])
)

#Plot the distributions (row 0 holds the normalized values of each stat)
for normalized_row in (av_kd_norm, snd_kd_norm, snd_kill_per_round_norm):
    pd.Series(normalized_row[0]).plot.density()

: 

From this plot, we can see that the distribution is mostly symmetrical and not too skewed.
This indicates that the mean can be used as an 'average' rather than a median

In [None]:
#This logic could live inside scatter_comparison, and duplicating it is bad practice.
#It is kept as a separate function so the notebook can introduce
#the league average as its own step.
def show_average(axis: plt.Axes, y_data, x_data):
    """
    Desc: This function draws dashed green lines marking the league average (mean)
          of both plotted stats, running from the axis' lower edges to the mean point.
          This allows for players who are below the league average to be identified.
    Params:
        axis: The ax for the average to be drawn onto
        y_data: the values plotted on the y axis (note: comes before x_data)
        x_data: the values plotted on the x axis
    Return:
        None
    """
    #Current lower bounds of the plot, read before anything new is drawn
    bottom_edge = axis.get_ylim()[0]
    left_edge = axis.get_xlim()[0]
    mean_x, mean_y = x_data.mean(), y_data.mean()

    #Re-apply the lower bounds so the lines start exactly at the axis edges
    axis.set_ylim(bottom_edge)
    axis.set_xlim(left_edge)

    dashed_green = dict(color='green', linestyle='dashed')
    #Vertical line up to the mean point (carries the legend entry)
    axis.plot([mean_x, mean_x], [bottom_edge, mean_y], label='League Average', **dashed_green)
    #Horizontal line across to the mean point
    axis.plot([left_edge, mean_x], [mean_y, mean_y], **dashed_green)
    axis.legend()



: 

Now that our function has been created, we can re-plot the original SnD data but this time with the league average drawn on

In [None]:
#Once again, this is repeated code.
#This has however been left in for the continuity of the notebook
fig = plt.figure(figsize=(18, 7))
(ax1, ax2, ax3) = fig.subplots(ncols=3)
fig.suptitle("CDL SnD (Players) 2023")

scatter_comparison(ax1, "First blood % vs KD Ratio", df['snd_first_blood_rate'], df['snd_kd_rate'],
                    "First Blood %", "KD Ratio", True, df['player'], True)

scatter_comparison(ax2, "Average Kills per Round vs KD Ratio", df['snd_kill_per_round'], df['snd_kd_rate'],
                     "Average Kills per Round", "KD Ratio", True, df['player'], True)

#Respawn KD is the mean of the control and hardpoint KDs
respawn_average_kd = (df['control_kd']+df['hp_kd'])/2
scatter_comparison(ax3, "SnD KD vs Average Respawn KD", df['snd_kd_rate'], respawn_average_kd,
                    "SnD KD", "Average Respawn KD", True, df['player'], True)

#show_average takes y_data BEFORE x_data, so keywords are used to keep each
#average line matched to the axis its data was plotted on
show_average(ax1, y_data=df['snd_kd_rate'], x_data=df['snd_first_blood_rate'])
show_average(ax2, y_data=df['snd_kd_rate'], x_data=df['snd_kill_per_round'])
#ax3 plots SnD KD on x and respawn KD on y (these two were previously swapped)
show_average(ax3, y_data=respawn_average_kd, x_data=df['snd_kd_rate'])


: 