In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from Preparation import prepare_to_file, prepare_and_return
from Scoring import score, initialize_result_file
from Scenarios import drop_columns,remove_missing,fill_missing_mode,fill_missing_max,fill_missing_min,fill_missing_mean,fill_missing_regression,fill_missing_zero,standardize,normalize,remove_outliers_lof,encode_categorical

In [2]:
lol_stats_1 = pd.read_csv('../Datasets/Prepared/lol_stats_1.csv')
lol_stats_2 = pd.read_csv('../Datasets/Prepared/lol_stats_2.csv')
lol_stats_3 = pd.read_csv('../Datasets/Prepared/lol_stats_3.csv')

In [3]:
lol_stats_1['Tier'].unique()

array([1, 4, 2, 3, 5, 0], dtype=int64)

In [4]:
lol_stats_1.loc[2]

Name         Akali
Class     Assassin
Role           MID
Tier             1
Score        65.49
Trend         4.33
Win %        48.41
Role %       75.74
Pick %        8.11
Ban %        13.02
KDA           2.37
Name: 2, dtype: object

In [5]:
categorical = ['Name', 'Class', 'Role','Tier']
numeric = ['Score','Trend','Win %','Role %','Pick %', 'Ban %','KDA']
to_be_encoded = ['Class', 'Role']

In [6]:
def no_preprocessing(df,num):
    df_1 = df.copy()
    df_1 = remove_missing(df_1)
    y = df_1['Tier']
    df_1= drop_columns(df_1,categorical)
    df_1 = df_1.apply(pd.to_numeric)
    X = df_1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"No Preprocessing",X_train,y_train,X_test,y_test)

In [7]:
def fill_mean(df, num):
    df_2 = df.copy()
    df_2 = fill_missing_mean(df_2,numeric)
    y = df_2['Tier']
    df_2 = drop_columns(df_2,categorical)
    df_2 = df_2.apply(pd.to_numeric)
    X = df_2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Fill Missing with mean",X_train,y_train,X_test,y_test)

In [8]:
def fill_min(df,num):
    df_3 = df.copy()
    df_3 = fill_missing_min(df_3,numeric)
    y = df_3['Tier']
    df_3 = df_3.drop(categorical,axis=1)
    df_3 = df_3.apply(pd.to_numeric)
    X = df_3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Fill_missing_with_min",X_train,y_train,X_test,y_test)

In [9]:
def fill_max(df,num):
    df_4 = df.copy()
    df_4 = fill_missing_max(df_4,numeric)
    y = df_4['Tier']
    df_4 = df_4.drop(categorical,axis=1)
    df_4 = df_4.apply(pd.to_numeric)
    X = df_4
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Fill_missing_with_max",X_train,y_train,X_test,y_test)

In [10]:
def fill_regression(df,num):
    df_new = df.copy()
    df_new = fill_missing_regression(df_new, numeric)
    y = df_new['Tier']
    df_new = df_new.drop(categorical,axis=1)
    df_new = df_new.apply(pd.to_numeric)
    X = df_new
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Regression",X_train,y_train,X_test,y_test)

In [11]:
def standardize_scenario(df,num):
    df_5 = df.copy()
    df_5 = fill_missing_mean(df_5,numeric)
    df_5 = standardize(df_5,numeric)
    y = df_5['Tier']
    df_5 = df_5.drop(categorical,axis=1)
    df_5 = df_5.apply(pd.to_numeric)
    X = df_5
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Standardize",X_train,y_train,X_test,y_test)

In [12]:
def normalize_scenario(df,num):
    df_6 = df.copy()
    df_6 = fill_missing_mean(df_6,numeric)
    df_6 = normalize(df_6,numeric)
    y = df_6['Tier']
    df_6 = df_6.drop(categorical,axis=1)
    df_6 = df_6.apply(pd.to_numeric)
    X = df_6
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Normalizacja",X_train,y_train,X_test,y_test)

In [13]:
def normalize_and_remove_outliers(df,num):
    df_7 = df.copy()
    df_7 = fill_missing_mean(df_7,numeric)
    df_7 = normalize(df_7,numeric)
    df_7 = remove_outliers_lof(df_7,numeric)
    y = df_7['Tier']
    df_7 = df_7.drop(categorical,axis=1)
    df_7 = df_7.apply(pd.to_numeric)
    X = df_7
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Normalization_with_LOF",X_train,y_train,X_test,y_test)

In [14]:
def encode_categorical_scenario(df,num):
    df_8 = df.copy()
    df_8 = remove_missing(df_8)
    df_8 = encode_categorical(df_8,to_be_encoded)
    y = df_8['Tier']
    df_8= drop_columns(df_8,['Name'])
    df_8 = df_8.apply(pd.to_numeric)
    X = df_8
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Label_Encoding",X_train,y_train,X_test,y_test)

In [15]:
def encode_categorical_and_fill_missing(df,num):
    df_9 = df.copy()
    df_9 = fill_missing_mean(df_9,numeric)
    df_9 = encode_categorical(df_9,to_be_encoded)
    y = df_9['Tier']
    df_9= drop_columns(df_9,['Name'])
    df_9 = df_9.apply(pd.to_numeric)
    X = df_9
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Label_encoding_+_fill_missing_mean",X_train,y_train,X_test,y_test)

In [22]:
def custom_scenario(df,num):
    df_10 = df.copy()
    df_10 = fill_missing_mean(df_10,numeric)
    df_10 = remove_outliers_lof(df_10,numeric)
    df_10 = normalize(df_10,numeric)
    df_10= drop_columns(df_10,['Name'])
    df_10 = encode_categorical(df_10,to_be_encoded)
    y = df_10['Tier']
    df_10 = df_10.apply(pd.to_numeric)
    X = df_10
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    score("LoL_Stats",num,"Custom_preprocessing",X_train,y_train,X_test,y_test)

In [16]:
num = 1
for df_i in [lol_stats_1,lol_stats_2,lol_stats_3]:
    no_preprocessing(df_i,num)
    fill_mean(df_i, num)
    fill_min(df_i,num)
    fill_max(df_i,num)
    num = num + 1

Scenario: No Preprocessing
Xgboost: 0.875
Random Forest Classifier: 0.7
KNeighbors Classifier: 0.8
Scenario: Fill Missing with mean
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.7346938775510204
Scenario: Fill_missing_with_min
Xgboost: 0.9591836734693877
Random Forest Classifier: 0.7755102040816326
KNeighbors Classifier: 0.7142857142857143
Scenario: Fill_missing_with_max
Xgboost: 0.9591836734693877
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.7142857142857143
Scenario: No Preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.675
KNeighbors Classifier: 0.75
Scenario: Fill Missing with mean
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.7755102040816326
Scenario: Fill_missing_with_min
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.6938775510204082
KNeighbors Classifier: 0.7142857142857143
Scenario: Fill_missing_with_max
Xgboost: 0.9591836734693877
Rand

In [18]:
num = 1
for df_i in [lol_stats_1,lol_stats_2,lol_stats_3]:
    fill_regression(df_i,num)
    num = num + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Regression
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.8163265306122449
KNeighbors Classifier: 0.6938775510204082


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Regression
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7755102040816326
KNeighbors Classifier: 0.7551020408163265


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Scenario: Regression
Xgboost: 0.9183673469387755
Random Forest Classifier: 0.7346938775510204
KNeighbors Classifier: 0.6938775510204082


In [19]:
num = 1
for df_i in [lol_stats_1,lol_stats_2,lol_stats_3]:
    standardize_scenario(df_i,num)
    normalize_scenario(df_i,num)
    normalize_and_remove_outliers(df_i,num)
    num = num + 1

Scenario: Standardize
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.6938775510204082
Scenario: Normalizacja
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.673469387755102
Scenario: Normalization_with_LOF
Xgboost: 0.9090909090909091
Random Forest Classifier: 0.5681818181818182
KNeighbors Classifier: 0.75
Scenario: Standardize
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.6938775510204082
Scenario: Normalizacja
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7959183673469388
KNeighbors Classifier: 0.7142857142857143
Scenario: Normalization_with_LOF
Xgboost: 0.9318181818181818
Random Forest Classifier: 0.6818181818181818
KNeighbors Classifier: 0.7045454545454546
Scenario: Standardize
Xgboost: 0.9387755102040817
Random Forest Classifier: 0.7755102040816326
KNeighbors Classifier: 0.7551020408163265
Scenario: Normalizacja
Xgboost: 

In [23]:
num = 1
for df_i in [lol_stats_1,lol_stats_2,lol_stats_3]:
    encode_categorical_scenario(df_i,num)
    encode_categorical_and_fill_missing(df_i,num)
    custom_scenario(df_i,num)
    num = num + 1

Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 0.775
KNeighbors Classifier: 0.775
Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Classifier: 0.9183673469387755
KNeighbors Classifier: 0.7346938775510204
Scenario: Custom_preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.8636363636363636
KNeighbors Classifier: 0.9090909090909091
Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 0.7
KNeighbors Classifier: 0.725
Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Classifier: 0.9183673469387755
KNeighbors Classifier: 0.7959183673469388
Scenario: Custom_preprocessing
Xgboost: 1.0
Random Forest Classifier: 0.8837209302325582
KNeighbors Classifier: 1.0
Scenario: Label_Encoding
Xgboost: 1.0
Random Forest Classifier: 0.875
KNeighbors Classifier: 0.75
Scenario: Label_encoding_+_fill_missing_mean
Xgboost: 1.0
Random Forest Classifier: 0.8979591836734694
KNeighbors Classifier: 0.7142857142857143
Scenario: Custom_preproc