# Import Packages

In [14]:
# Packages
import pandas as pd
import numpy as np

# Pipelines
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle

# Import Data

In [15]:
# Load the raw fight dataset from CSV into a DataFrame
raw_csv_path = '../ufc_data_raw_main/data.csv'
data = pd.read_csv(raw_csv_path)

In [16]:
# Preview every column: print its name next to the value stored in the row
# labelled 1, as a quick look at what each field contains.
data_cols = data.columns.tolist()
for column_name, column_values in data.items():
    print(f"{column_name}: {column_values[1]}")

R_fighter: Trevin Giles
B_fighter: Roman Dolidze
Referee: Herb Dean
date: 2021-03-20
location: Las Vegas, Nevada, USA
Winner: Red
title_bout: False
weight_class: Middleweight
B_avg_KD: 0.5
B_avg_opp_KD: 0.0
B_avg_SIG_STR_pct: 0.66
B_avg_opp_SIG_STR_pct: 0.305
B_avg_TD_pct: 0.3
B_avg_opp_TD_pct: 0.5
B_avg_SUB_ATT: 1.5
B_avg_opp_SUB_ATT: 0.0
B_avg_REV: 0.0
B_avg_opp_REV: 0.0
B_avg_SIG_STR_att: 65.5
B_avg_SIG_STR_landed: 35.0
B_avg_opp_SIG_STR_att: 50.0
B_avg_opp_SIG_STR_landed: 16.5
B_avg_TOTAL_STR_att: 113.5
B_avg_TOTAL_STR_landed: 68.5
B_avg_opp_TOTAL_STR_att: 68.5
B_avg_opp_TOTAL_STR_landed: 29.0
B_avg_TD_att: 2.5
B_avg_TD_landed: 1.5
B_avg_opp_TD_att: 0.5
B_avg_opp_TD_landed: 0.5
B_avg_HEAD_att: 46.0
B_avg_HEAD_landed: 20.0
B_avg_opp_HEAD_att: 36.0
B_avg_opp_HEAD_landed: 7.5
B_avg_BODY_att: 12.0
B_avg_BODY_landed: 8.0
B_avg_opp_BODY_att: 8.0
B_avg_opp_BODY_landed: 3.0
B_avg_LEG_att: 7.5
B_avg_LEG_landed: 7.0
B_avg_opp_LEG_att: 6.0
B_avg_opp_LEG_landed: 6.0
B_avg_DISTANCE_att: 58.0
B_

# Drop Cols and Save to CSV

In [17]:
# Drop unnecessary columns: the red/blue fighter names are not needed for
# training, and referee, date and location are assumed to carry no information.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises a KeyError for columns that
# were already removed by a previous run.
data = data.drop(columns=['R_fighter', 'B_fighter', 'Referee', 'date', 'location'],
                 errors='ignore')

# Check columns and example values.
data_cols = list(data.columns)
for i in data_cols:
    print(f"{i}: {data[i][1]}")

Winner: Red
title_bout: False
weight_class: Middleweight
B_avg_KD: 0.5
B_avg_opp_KD: 0.0
B_avg_SIG_STR_pct: 0.66
B_avg_opp_SIG_STR_pct: 0.305
B_avg_TD_pct: 0.3
B_avg_opp_TD_pct: 0.5
B_avg_SUB_ATT: 1.5
B_avg_opp_SUB_ATT: 0.0
B_avg_REV: 0.0
B_avg_opp_REV: 0.0
B_avg_SIG_STR_att: 65.5
B_avg_SIG_STR_landed: 35.0
B_avg_opp_SIG_STR_att: 50.0
B_avg_opp_SIG_STR_landed: 16.5
B_avg_TOTAL_STR_att: 113.5
B_avg_TOTAL_STR_landed: 68.5
B_avg_opp_TOTAL_STR_att: 68.5
B_avg_opp_TOTAL_STR_landed: 29.0
B_avg_TD_att: 2.5
B_avg_TD_landed: 1.5
B_avg_opp_TD_att: 0.5
B_avg_opp_TD_landed: 0.5
B_avg_HEAD_att: 46.0
B_avg_HEAD_landed: 20.0
B_avg_opp_HEAD_att: 36.0
B_avg_opp_HEAD_landed: 7.5
B_avg_BODY_att: 12.0
B_avg_BODY_landed: 8.0
B_avg_opp_BODY_att: 8.0
B_avg_opp_BODY_landed: 3.0
B_avg_LEG_att: 7.5
B_avg_LEG_landed: 7.0
B_avg_opp_LEG_att: 6.0
B_avg_opp_LEG_landed: 6.0
B_avg_DISTANCE_att: 58.0
B_avg_DISTANCE_landed: 30.0
B_avg_opp_DISTANCE_att: 48.0
B_avg_opp_DISTANCE_landed: 15.5
B_avg_CLINCH_att: 0.5
B_avg_CLI

In [18]:
# (rows, columns) of the dataset after the identifier columns were dropped
data.shape

(6012, 139)

In [19]:
# Persist the trimmed dataset for use by the model;
# index=False keeps the DataFrame's row index out of the file.
cleaned_csv_path = "../data/data.csv"
data.to_csv(cleaned_csv_path, index=False)

# Imputing, Encoding, Scaling Pipeline

In [20]:
# Numeric features: fill missing values with the column median, then scale
# with RobustScaler.
num_transformer = make_pipeline(
    SimpleImputer(strategy="median", missing_values=np.nan),
    RobustScaler(),
)

# Categorical / boolean features: fill missing values with the literal
# "unknown", then one-hot encode (binary features collapse to a single column).
# handle_unknown='ignore' makes a category that was absent from the fitted data
# encode as all zeros at transform time instead of raising a ValueError, so a
# rare category that only shows up in the test split (or at prediction time)
# does not crash the pipeline.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown", missing_values=np.nan),
    OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore'),
)

# Select num_col and cat_col by dtype
num_col = make_column_selector(dtype_include=['number'])
cat_col = make_column_selector(dtype_include=['object', 'bool'])

# Route each column group through its transformer; columns matched by neither
# selector are passed through unchanged.
transformer = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

In [21]:
# Create preprocessing pipeline
# Wraps the column transformer in a single-step pipeline so the preprocessing
# can be fitted, reused, and chained with a model as one object.
preprocessor = make_pipeline(transformer)
# Bare expression: shows the pipeline as the cell output.
preprocessor

# Base Model

In [22]:
# Separate the target ('Winner') from the feature columns
y = data['Winner']
X = data.drop(columns=['Winner'])

# Stratified train/test split: 33% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Fit the preprocessor on the training rows only, then apply the fitted
# preprocessor to the held-out rows
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Wrap both matrices in DataFrames labelled with the generated feature names
feature_names = preprocessor.get_feature_names_out()
X_train_pre = pd.DataFrame(train_matrix, columns=feature_names)
X_test_pre = pd.DataFrame(test_matrix, columns=feature_names)

In [23]:
# Baseline classifier: logistic regression with the solver capped at 1000 iterations
model = LogisticRegression(max_iter=1000)

# Chain preprocessing and the classifier into one estimator
pipeline = make_pipeline(preprocessor, model)
pipeline

In [24]:
# Fit Model
# `pipeline` already runs `preprocessor` as its first step, so it must be given
# the raw splits. Passing the already-transformed X_train_pre / X_test_pre would
# impute and scale every feature a second time and tie the fitted pipeline to
# the generated feature names instead of the raw dataset columns.
pipeline.fit(X_train, y_train)

# Score Model (mean accuracy on the held-out rows)
pipeline.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6678427419354839