In [1]:
import os
import struct

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from pandas.plotting import scatter_matrix

from sklearn import linear_model
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Get the data

In [2]:
def load_csv(path):
    """Read the CSV file at ``path`` and return its contents as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame

In [None]:
# Load the training CSV (path is relative to the notebook's working directory).
training_raw = load_csv("data/train_V2_copia.csv")

In [None]:
# Preview the first rows of the raw data.
training_raw.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
training_raw.info()

We have to reformat the first three attributes because their string values can't be used to fit the model.

In [None]:
# Work on a copy so the raw frame stays untouched by the reformatting below.
training_format = training_raw.copy()

In [None]:
# Replace the identifier columns with integer codes.
# pd.factorize assigns one code per distinct value (in order of first
# appearance), so rows that share a groupId / matchId keep sharing a code.
# Assigning np.arange(len(...)) to each column instead would turn all three
# columns into identical copies of the row number and discard that grouping.
for id_column in ("Id", "groupId", "matchId"):
    training_format[id_column] = pd.factorize(training_format[id_column])[0]

In [None]:
# Summary statistics of the raw data.
training_raw.describe()

In [None]:
# Histograms of the raw data's columns, 50 bins each, on a 20x15 figure.
%matplotlib inline
training_raw.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
#training_format["killPlaceAVG"] = training_format["killPlace"].mean()
#training_format["overallKills"] = 
#training_format["overallDBNOS"] = 
#training_format["overallMatches"] = 

### Create test set

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
train_set, test_set = train_test_split(training_format, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each split.
print(f'Training data: {train_set.shape}\nTest data: {test_set.shape}')

# Discover and visualize the data to gain insights

### Looking for Correlations

In [None]:
# Explore a copy of the training split so the exploration cannot alter train_set.
training_copy = train_set.copy()

In [None]:
# Correlation is only defined for numeric columns. The frame still carries
# non-numeric data ("matchType" is treated as a categorical attribute later in
# this notebook), and a bare DataFrame.corr() raises on such columns with
# pandas >= 2.0, so keep only the numeric dtypes before correlating.
corr_matrix = training_copy.select_dtypes(include="number").corr()

In [None]:
# Correlation of every column with the target "winPlacePerc", largest value first.
corr_matrix["winPlacePerc"].sort_values(ascending=False)

##### The attributes that seem most correlated with "winPlacePerc" are:
walkDistance   
killPlace
boosts             
weaponsAcquired    
damageDealt        
heals              
kills 
longestKill       
killStreaks       
rideDistance     
assists           
DBNOs           
headshotKills   
revives

In [None]:
#attributes = ["winPlacePerc", "walkDistance", "killPlace",
#"boosts"]

#scatter_matrix(training_copy[attributes], figsize=(12, 8))

In [None]:
#training_copy.plot(kind="scatter", x="walkDistance", y="winPlacePerc",alpha=0.1)

In [None]:
#training_copy.plot(kind="scatter", x="matchDuration", y="winPlacePerc",alpha=0.1)

# Prepare the data for Machine Learning algorithms

### Pipelines

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame and returns their values.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the incoming frame in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of ``X`` indexed by ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
class MyLabelBinarizer(TransformerMixin):
    """Wrapper around ``LabelBinarizer`` whose ``fit``/``transform`` accept an optional ``y``.

    Every constructor argument is forwarded unchanged to ``LabelBinarizer``.
    """

    # NOTE(review): this class does not derive from BaseEstimator, so it has no
    # get_params/set_params of its own — confirm nothing needs to clone this step.

    def __init__(self, *args, **kwargs):
        wrapped = LabelBinarizer(*args, **kwargs)
        self.encoder = wrapped

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` (``y`` is not used) and return ``self``."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` transformed by the wrapped encoder (``y`` is not used)."""
        binarized = self.encoder.transform(x)
        return binarized

In [None]:
# Separate the target column from the model inputs of the training split.
train_labels = train_set["winPlacePerc"].copy()
train = train_set.drop(columns="winPlacePerc")

In [None]:
# Separate the target column from the model inputs of the test split.
test_labels = test_set["winPlacePerc"].copy()
test = test_set.drop(columns="winPlacePerc")

In [None]:
# Rebind both splits to versions without the target column.
# NOTE(review): because the names are rebound, running this cell a second time
# raises KeyError (the column is already gone) — re-run from the split cell instead.
test_set = test_set.drop(columns="winPlacePerc")
train_set = train_set.drop(columns="winPlacePerc")

In [None]:
# "matchType" is the only attribute routed through the categorical branch;
# every other remaining column goes through the numeric branch.
cat_attribs = ["matchType"]
num_attribs = train_set.columns.drop("matchType").tolist()

# Numeric branch: select the numeric columns, then standardise them.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select "matchType", then binarize its labels.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', MyLabelBinarizer()),
])

In [None]:
# Run both branches on the same input and concatenate their outputs column-wise.
feature_branches = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=feature_branches)

In [None]:
# Fit the preprocessing (scaler statistics, label-binarizer classes) on the
# training inputs only ...
PUBG_TRAIN = full_pipeline.fit_transform(train)
# ... and apply those already-fitted parameters to the test inputs.
# Calling fit_transform here would re-fit on the test set, so the test features
# would be scaled with different statistics (and possibly a different column
# layout) than the features the models were trained on.
PUBG_TEST = full_pipeline.transform(test)

# Select a model and train it

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)

LinearRegression

In [None]:
# Linear regression with default settings, fitted on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(PUBG_TRAIN, train_labels)

In [None]:
# Score the fitted linear model on the prepared test features.
PUBG_predictions = lin_reg.predict(X=PUBG_TEST)
lin_mse = mean_squared_error(y_true=test_labels, y_pred=PUBG_predictions)
# The root of the MSE is expressed in the same units as the target.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# 10-fold cross-validation of the linear model.
# NOTE(review): the folds are drawn from the test split (PUBG_TEST / test_labels),
# not from the training split — confirm this is intended.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=PUBG_TEST,
    y=test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer yields negated MSE values; flip the sign before taking the root.
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

DecisionTreeRegressor

In [None]:
# Decision tree fitted on the prepared training features.
# A fixed seed (the same value used for the train/test split) makes the fitted
# tree, and every score derived from it, reproducible across kernel restarts.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(PUBG_TRAIN, train_labels)

In [None]:
# Score the fitted decision tree on the prepared test features.
PUBG_predictions = tree_reg.predict(X=PUBG_TEST)
tree_mse = mean_squared_error(y_true=test_labels, y_pred=PUBG_predictions)
# The root of the MSE is expressed in the same units as the target.
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# 10-fold cross-validation of the decision tree.
# NOTE(review): the folds are drawn from the test split (PUBG_TEST / test_labels),
# not from the training split — confirm this is intended.
tree_scores = cross_val_score(
    estimator=tree_reg,
    X=PUBG_TEST,
    y=test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer yields negated MSE values; flip the sign before taking the root.
tree_rmse_scores = np.sqrt(np.negative(tree_scores))
display_scores(tree_rmse_scores)

RandomForestRegressor

The following is the best RandomForestRegressor estimator found with GridSearchCV:

In [None]:
# Random forest with the chosen hyperparameters, fitted on the prepared
# training features. The fixed seed (same value as the train/test split) pins
# the forest's random sampling so the reported scores are reproducible.
forest_reg = RandomForestRegressor(
    bootstrap=True,
    max_features=8,
    n_estimators=30,
    random_state=42,
)
forest_reg.fit(PUBG_TRAIN, train_labels)

In [None]:
# Score the fitted random forest on the prepared test features.
PUBG_predictions = forest_reg.predict(X=PUBG_TEST)
forest_mse = mean_squared_error(y_true=test_labels, y_pred=PUBG_predictions)
# The root of the MSE is expressed in the same units as the target.
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validation of the random forest.
# NOTE(review): the folds are drawn from the test split (PUBG_TEST / test_labels),
# not from the training split — confirm this is intended.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=PUBG_TEST,
    y=test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer yields negated MSE values; flip the sign before taking the root.
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

### Trying extra models

HuberRegressor

In [None]:
# Huber regressor with default settings.
reg = linear_model.HuberRegressor()

In [None]:
# Fit the Huber regressor on the prepared training features.
reg.fit(PUBG_TRAIN, train_labels)

In [None]:
# Score the fitted Huber regressor on the prepared test features.
PUBG_predictions = reg.predict(X=PUBG_TEST)
reg_mse = mean_squared_error(y_true=test_labels, y_pred=PUBG_predictions)
# The root of the MSE is expressed in the same units as the target.
reg_rmse = np.sqrt(reg_mse)
reg_rmse

In [None]:
# 10-fold cross-validation of the Huber regressor.
# NOTE(review): the folds are drawn from the test split (PUBG_TEST / test_labels),
# not from the training split — confirm this is intended.
reg_scores = cross_val_score(
    estimator=reg,
    X=PUBG_TEST,
    y=test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# From this line on reg_scores holds per-fold RMSE values rather than the
# negated MSE values returned by the scorer.
reg_scores = np.sqrt(np.negative(reg_scores))
display_scores(reg_scores)

SGDRegressor

In [None]:
# SGD regressor with default settings. The fixed seed (same value as the
# train/test split) pins its random data shuffling so results are reproducible.
reg2 = linear_model.SGDRegressor(random_state=42)

In [None]:
# Fit the SGD regressor on the prepared training features.
reg2.fit(PUBG_TRAIN, train_labels)

In [None]:
# Score the fitted SGD regressor on the prepared test features.
PUBG_predictions = reg2.predict(X=PUBG_TEST)
reg2_mse = mean_squared_error(y_true=test_labels, y_pred=PUBG_predictions)
# The root of the MSE is expressed in the same units as the target.
reg2_rmse = np.sqrt(reg2_mse)
reg2_rmse

In [None]:
# 10-fold cross-validation of the SGD regressor.
# NOTE(review): the folds are drawn from the test split (PUBG_TEST / test_labels),
# not from the training split — confirm this is intended.
reg2_scores = cross_val_score(
    estimator=reg2,
    X=PUBG_TEST,
    y=test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# From this line on reg2_scores holds per-fold RMSE values rather than the
# negated MSE values returned by the scorer.
reg2_scores = np.sqrt(np.negative(reg2_scores))
display_scores(reg2_scores)

MLPRegressor

In [None]:
# Multi-layer perceptron regressor with default settings. The fixed seed (same
# value as the train/test split) pins its random initialisation and batch
# sampling so results are reproducible.
reg3 = MLPRegressor(random_state=42)

In [None]:
# Fit the MLP regressor on the prepared training features.
reg3.fit(PUBG_TRAIN, train_labels)

In [None]:
# Score the fitted MLP regressor on the prepared test features.
PUBG_predictions = reg3.predict(X=PUBG_TEST)
reg3_mse = mean_squared_error(y_true=test_labels, y_pred=PUBG_predictions)
# The root of the MSE is expressed in the same units as the target.
reg3_rmse = np.sqrt(reg3_mse)
reg3_rmse

In [None]:
# 10-fold cross-validation of the MLP regressor.
# NOTE(review): the folds are drawn from the test split (PUBG_TEST / test_labels),
# not from the training split — confirm this is intended.
reg3_scores = cross_val_score(
    estimator=reg3,
    X=PUBG_TEST,
    y=test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# From this line on reg3_scores holds per-fold RMSE values rather than the
# negated MSE values returned by the scorer.
reg3_scores = np.sqrt(np.negative(reg3_scores))
display_scores(reg3_scores)

### MSE and RMSE of all models

In [None]:
# One row per model, in the order the models were trained above.
model_names = [
    'Linear Regression',
    'Decision Tree Regressor',
    'Random Forest Regressor',
    'Huber Regressor',
    'SGD Regressor',
    'MLP Regressor',
]

# Test-set MSE and RMSE of each model, in the same order as model_names.
benchmark_dict = {
    'MSE': [lin_mse, tree_mse, forest_mse, reg_mse, reg2_mse, reg3_mse],
    'RMSE': [lin_rmse, tree_rmse, forest_rmse, reg_rmse, reg2_rmse, reg3_rmse],
}
benchmark_data_frame = pd.DataFrame(data=benchmark_dict, index=model_names)
benchmark_data_frame

### Cross validation scores of all models

In [None]:
# One row per model, in the order the models were trained above.
model_names = [
    'Linear Regression',
    'Decision Tree Regressor',
    'Random Forest Regressor',
    'Huber Regressor',
    'SGD Regressor',
    'MLP Regressor',
]

# Per-fold cross-validation RMSE of each model, in the same order as model_names.
# lin_scores / tree_scores / forest_scores still hold the negated-MSE values
# returned by cross_val_score, whereas reg_scores / reg2_scores / reg3_scores
# were overwritten with RMSE values. Using the *_rmse_scores arrays for the
# first three puts every row of the table on the same metric.
cv_rmse_by_model = [
    lin_rmse_scores,
    tree_rmse_scores,
    forest_rmse_scores,
    reg_scores,
    reg2_scores,
    reg3_scores,
]

benchmark_dict = {
    'Mean': [fold_rmse.mean() for fold_rmse in cv_rmse_by_model],
    'Standard deviation': [fold_rmse.std() for fold_rmse in cv_rmse_by_model],
}
benchmark_data_frame = pd.DataFrame(data=benchmark_dict, index=model_names)
benchmark_data_frame