In [None]:
from datetime import datetime

import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

from prepared_data.get_prepared_data import get_prepared_data
from prepared_data.prepared_test_results import result_predict_prob_to_dataFrame, not_duplicate_elements_in_dataframes
from prepared_data.prepared_test_results import append_test_results, completed_test_result_and_save
from prepared_data.train_model import split_features_target_and_map_target
from prepared_data.train_model import callbacks

Data loading and preparation

In [None]:
# Training set: prepared with number_of_history_matches=4. The two rating
# objects returned alongside the frame are kept so the test set can reuse them.
train_org = pd.read_csv('data/train.csv')
train, league_mean_ratting_all, teams_ratting_all = get_prepared_data(
    train_org,
    number_of_history_matches=4,
    map_target=False,
)

# Test set: same preparation, passing in the rating objects obtained from the
# training set; the rating objects returned by this call are discarded.
test_org = pd.read_csv('data/test.csv')
test, _, _ = get_prepared_data(
    test_org,
    4,
    league_mean_ratting_all,
    teams_ratting_all,
    map_target=False,
)


Model

In [4]:
def build_and_compile_model(shape, n_neurons, dropout, learning_rate):
    """Build and compile a one-hidden-layer classifier with three outputs.

    The network is Dense(n_neurons, relu) -> Dropout(dropout) -> Dense(3).
    The last layer has no activation, so the model outputs raw logits; the
    loss is configured with ``from_logits=True`` accordingly.

    Args:
        shape: Input shape passed to the first Dense layer as ``input_shape``.
        n_neurons: Number of units in the hidden layer.
        dropout: Rate given to the Dropout layer.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    hidden = tf.keras.layers.Dense(n_neurons, activation='relu', input_shape=shape)
    regulariser = tf.keras.layers.Dropout(dropout)
    logits = tf.keras.layers.Dense(3)
    network = tf.keras.Sequential([hidden, regulariser, logits])

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    network.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    return network

Train model

In [5]:
# Hyper-parameters of the final network.
dropout = 0.3
n_neurons = 16
learning_rate = 0.001
batch_size = None  # None lets Keras fall back to its default batch size (32)

# NOTE(review): no random seed is set in this notebook, so the trained weights
# are not reproducible from run to run.

train_features, target = split_features_target_and_map_target(train)

# The scaler is fitted on the training features only; the prediction cell
# reuses this fitted instance for the test set.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_features)

dnn_model = build_and_compile_model(
    (train_features.shape[-1],), n_neurons, dropout, learning_rate
)

# Timestamped run name handed to the callbacks helper.
model_name = 'final/final' + datetime.now().strftime("%Y:%m:%d-%H:%M:%S")

# `early_stop` is returned by the helper but is not passed to fit() below.
# NOTE(review): fit() receives no validation data, which is presumably why it
# is left out — confirm.
early_stop, reduce_lr, tensorboard_callback, checkpoint_callback = callbacks(model_name)

history = dnn_model.fit(
    X_train,
    target,
    verbose=0,
    epochs=20,
    callbacks=[reduce_lr, checkpoint_callback, tensorboard_callback],
    batch_size=batch_size,
)

2022-06-04 12:49:05.682567: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-04 12:49:05.682595: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-04 12:49:05.682616: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (Zenon): /proc/driver/nvidia/version does not exist
2022-06-04 12:49:05.682976: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Predict on the test set

In [6]:
# Checkpoint whose weights are used for the test predictions.
# NOTE(review): this is a fixed path whose timestamp predates the training log
# of this notebook, so it appears to come from an earlier run rather than from
# the training cell above — confirm this is intended.
filepath = 'saved_model/checkpoint/final_result_train/2022:05:07-00:56:29_epoch18-train1.011.hdf5'

# Rebuild the same architecture so the stored weights can be loaded into it.
# This rebinds `dnn_model`, replacing the model fitted in the training cell.
dnn_model = build_and_compile_model(
    (train_features.shape[-1],),
    n_neurons,
    dropout,
    learning_rate,
)
dnn_model.load_weights(filepath)

# Stack a Softmax layer on top of the network to obtain probabilities.
probability_model = tf.keras.Sequential([dnn_model, tf.keras.layers.Softmax()])

# Scale the test features with the scaler fitted on the training features.
X_test = scaler.transform(test)

test_result = result_predict_prob_to_dataFrame(
    probability_model.predict(X_test),
    test,
)
test_result

Unnamed: 0,id,away,draw,home
0,18300194,0.297770,0.238329,0.463901
1,18244932,0.278344,0.214762,0.506894
2,18300196,0.292891,0.225543,0.481566
3,18139735,0.394708,0.238409,0.366883
4,18137425,0.394579,0.282672,0.322749
...,...,...,...,...
58999,18125501,0.166454,0.324038,0.509507
59000,18070396,0.274875,0.368509,0.356615
59001,18094517,0.301714,0.328158,0.370128
59002,18163535,0.256769,0.309018,0.434213


## Load the missing test rows with number_of_history_matches=1

In [7]:
# Prepare both sets again, this time with number_of_history_matches=1.
# NOTE: `train` is rebound here. The network training cell above reads `train`,
# so it must not be re-run after this cell without re-running the loading cell.
train_org = pd.read_csv('data/train.csv')
train, _, _ = get_prepared_data(train_org, number_of_history_matches=1)

# The test set reuses the rating objects obtained in the first loading cell.
test_org = pd.read_csv('data/test.csv')
test_history_1, _, _ = get_prepared_data(
    test_org,
    number_of_history_matches=1,
    league_mean_ratting_all=league_mean_ratting_all,
    teams_mean_ratting_all=teams_ratting_all,
)

# Compare against the 4-match test frame. Judging by the helper's name and its
# printed count, this keeps the rows that are absent from `test` — confirm.
test_history_1 = not_duplicate_elements_in_dataframes(
    test_history_1.reset_index(),
    test,
)
test_history_1

  exec(code_obj, self.user_global_ns, self.user_ns)


percent of object with nan value and orginals:  0.00, 98.210712
percent of object with nan value and orginals:  0.00, 98.786979
Number of not duplicate elements  9359


Unnamed: 0_level_0,is_cup,home_team_history_match_date_1,home_team_history_is_play_home_1,home_team_history_is_cup_1,home_team_history_goal_1,home_team_history_opponent_goal_1,home_team_history_rating_1,home_team_history_opponent_rating_1,away_team_history_match_date_1,away_team_history_is_play_home_1,...,away_team_history_gol_difference_1,away_team_mean_regeneration_time,league_id_ratting,sum_history_targets,home_team_history_target_1_-1.0,home_team_history_target_1_0.0,home_team_history_target_1_1.0,away_team_history_target_1_-1.0,away_team_history_target_1_0.0,away_team_history_target_1_1.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18395531,1,388,1.0,0.0,2.0,1.0,11.078400,3.724950,74,0.0,...,3.0,74.0,6.801392,2.0,0,0,1,0,0,1
18395534,1,96,0.0,0.0,1.0,2.0,6.489692,4.418742,77,0.0,...,2.0,77.0,6.801392,0.0,1,0,0,0,0,1
18395559,1,101,0.0,0.0,1.0,3.0,6.168855,5.675820,98,1.0,...,2.0,98.0,6.801392,0.0,1,0,0,0,0,1
18395595,1,72,1.0,0.0,1.0,0.0,10.392432,3.526532,95,1.0,...,3.0,95.0,6.801392,2.0,0,0,1,0,0,1
18395597,1,76,1.0,0.0,2.0,2.0,9.860605,3.190473,77,0.0,...,0.0,77.0,6.801392,0.0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18433161,1,74,0.0,1.0,0.0,3.0,5.328017,12.617483,1317,0.0,...,0.0,1317.0,8.857095,-1.0,1,0,0,0,1,0
18430451,1,72,1.0,1.0,1.0,0.0,6.779631,10.878485,69,1.0,...,-4.0,69.0,8.857095,0.0,0,0,1,1,0,0
18433160,1,78,0.0,0.0,1.0,4.0,5.731083,13.314733,5662,1.0,...,-1.0,5662.0,8.857095,-2.0,1,0,0,1,0,0
18430453,1,69,0.0,1.0,0.0,0.0,7.629958,5.679331,72,1.0,...,2.0,72.0,8.857095,1.0,0,1,0,0,0,1


Train LogisticRegression model

In [None]:
# Separate the label column from the features of the current `train` frame.
target = train['target']
X_train_org = train.drop(columns='target')

# Min-max scaling followed by a one-vs-rest logistic regression.
pipe = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(
        random_state=0,
        solver='lbfgs',
        multi_class='ovr',
        max_iter=1000,
        n_jobs=-1,
    ),
)
pipe.fit(X_train_org, target)

Predict

In [8]:
# Class probabilities from the logistic-regression pipeline, converted to a
# DataFrame by the project helper.
test_result_history_1 = result_predict_prob_to_dataFrame(
    pipe.predict_proba(test_history_1),
    test_history_1,
)
test_result_history_1

Unnamed: 0,id,away,draw,home
0,18395531,0.326631,0.217357,0.456012
1,18395534,0.401950,0.226799,0.371251
2,18395559,0.419427,0.221510,0.359063
3,18395595,0.445531,0.223663,0.330806
4,18395597,0.327524,0.222984,0.449493
...,...,...,...,...
9354,18433161,0.466112,0.162647,0.371241
9355,18430451,0.236442,0.135125,0.628432
9356,18433160,0.412746,0.161971,0.425283
9357,18430453,0.476034,0.162803,0.361163


Append the predictions for the missing test rows

In [9]:
# Combine the logistic-regression predictions with the network predictions
# into a single frame.
test_result_learn = append_test_results(test_result_history_1, test_result)
test_result_learn

Unnamed: 0,id,away,draw,home
0,18395531,0.326631,0.217357,0.456012
1,18395534,0.401950,0.226799,0.371251
2,18395559,0.419427,0.221510,0.359063
3,18395595,0.445531,0.223663,0.330806
4,18395597,0.327524,0.222984,0.449493
...,...,...,...,...
68358,18125501,0.166454,0.324038,0.509507
68359,18070396,0.274875,0.368509,0.356615
68360,18094517,0.301714,0.328158,0.370128
68361,18163535,0.256769,0.309018,0.434213


Save the test predictions

In [10]:
result_all = completed_test_result_and_save(
    test_org,
    test_result_learn,
    'final_dnn_04_+_1_fit_on_the_full_train',
)

# The completed result must have exactly as many rows as the original test
# file. Enforce this with an assertion instead of only displaying the boolean,
# so a mismatch stops a Run All rather than passing unnoticed.
rows_match = result_all.shape[0] == test_org.shape[0]
assert rows_match, (
    f"completed result has {result_all.shape[0]} rows, "
    f"expected {test_org.shape[0]}"
)
rows_match


Number of not duplicate elements  4348


True