In [167]:
import pandas as pd
import numpy as np
import pickle as pkl
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input,GRU, SimpleRNN
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ReduceLROnPlateau


In [168]:
# Load the raw games table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='games.csv')

In [169]:
# Count the rows that repeat an earlier row in every column.
data.duplicated().sum()

429

In [170]:
# Remove the fully-duplicated rows counted above; the original index labels are
# kept (no reset), so the index is no longer contiguous afterwards.
data = data.drop_duplicates()

In [171]:
# Class balance of the prediction target.
data['winner'].value_counts()

winner
white    9792
black    8919
draw      918
Name: count, dtype: int64

In [140]:
# Discard identifier, rating and 'rated' columns; they are not used as features.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError.
unused_columns = ['white_id', 'black_id', 'white_rating', 'black_rating', 'id', 'rated']
data = data.drop(columns=unused_columns)

In [172]:
# Distribution of how games ended, inspected before the column is one-hot encoded.
data['victory_status'].value_counts()

victory_status
resign       10926
mate          6188
outoftime     1641
draw           874
Name: count, dtype: int64

In [141]:
# One-hot encode 'victory_status' into integer 0/1 indicator columns named
# 'victory_status_<value>'; the source column is replaced by the indicators.
data = pd.get_dummies(
    data,
    columns=['victory_status'],
    prefix='victory_status',
    dtype=int,
)

In [142]:
# Drop timestamp, time-control and opening-name columns that are not modelled.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError.
data = data.drop(columns=['created_at', 'last_move_at', 'increment_code', 'opening_name'])

In [143]:
# Sanity-check the remaining columns, dtypes and null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19629 entries, 0 to 20057
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   turns                     19629 non-null  int64 
 1   winner                    19629 non-null  object
 2   moves                     19629 non-null  object
 3   opening_eco               19629 non-null  object
 4   opening_ply               19629 non-null  int64 
 5   victory_status_draw       19629 non-null  int64 
 6   victory_status_mate       19629 non-null  int64 
 7   victory_status_outoftime  19629 non-null  int64 
 8   victory_status_resign     19629 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 1.5+ MB


In [144]:
# Keep the `top_n` most frequent opening codes as their own categories and
# collapse every other code into a single 'Other' bucket.
top_n = 10  # Adjust as needed
top_categories = data['opening_eco'].value_counts().nlargest(top_n).index
is_top_opening = data['opening_eco'].isin(top_categories)
data['opening_eco_processed'] = data['opening_eco'].where(is_top_opening, 'Other')

In [145]:
# One-hot encode the bucketed opening code into integer 0/1 columns named
# 'opening_eco_<code>'.
data = pd.get_dummies(
    data,
    columns=['opening_eco_processed'],
    prefix='opening_eco',
    dtype=int,
)

In [146]:
# The raw opening code is now redundant with its one-hot indicator columns.
data = data.drop(columns='opening_eco')

In [147]:
# Confirm the final column set after both one-hot encodings.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19629 entries, 0 to 20057
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   turns                     19629 non-null  int64 
 1   winner                    19629 non-null  object
 2   moves                     19629 non-null  object
 3   opening_ply               19629 non-null  int64 
 4   victory_status_draw       19629 non-null  int64 
 5   victory_status_mate       19629 non-null  int64 
 6   victory_status_outoftime  19629 non-null  int64 
 7   victory_status_resign     19629 non-null  int64 
 8   opening_eco_A00           19629 non-null  int64 
 9   opening_eco_A40           19629 non-null  int64 
 10  opening_eco_B00           19629 non-null  int64 
 11  opening_eco_B01           19629 non-null  int64 
 12  opening_eco_B20           19629 non-null  int64 
 13  opening_eco_C00           19629 non-null  int64 
 14  opening_eco_C20           1

In [148]:
# Move strings of decisive games only (draws excluded) for the binary model.
moves = np.array(data.loc[data['winner'] != 'draw', 'moves'])

In [155]:
# Move strings of every game, draws included, for the three-class model.
moves_with_draw = np.array(data.loc[:, 'moves'])

In [149]:
# Binary target for decisive games: 1 when white won, 0 otherwise (black won).
# Uses the same "winner != 'draw'" filter as `moves`, so rows stay aligned.
decisive_winner = data.loc[data['winner'] != 'draw', 'winner']
labels = np.array((decisive_winner == 'white').astype('int64'))
labels

array([1, 0, 1, ..., 1, 1, 0])

In [156]:
# Three-class target over all games: 2 = draw, 1 = white won, 0 = anything else.
winner_values = data['winner'].to_numpy()
labels_with_draw = np.where(
    winner_values == 'draw',
    2,
    np.where(winner_values == 'white', 1, 0),
).astype('int64')
labels_with_draw

array([1, 0, 1, ..., 1, 1, 0])

In [16]:
# Gather every distinct space-separated move token seen in decisive games.
all_moves = set()
for game_moves in moves:
    all_moves.update(game_moves.split(' '))

# Number of distinct tokens; used later as the tokenizer/embedding vocabulary size.
vocab_length = len(all_moves)
vocab_length

4373

In [None]:
# Gather every distinct space-separated move token across ALL games (draws
# included). The tokens must be collected into `all_moves_with_draw` itself:
# filling `all_moves` instead leaves this set empty, which makes
# `vocab_length_with_draw` 0 and breaks the tokenizer/embedding sized from it.
all_moves_with_draw = set()
for game_moves in moves_with_draw:
    all_moves_with_draw.update(game_moves.split(' '))

# Number of distinct tokens for the three-class (with draw) pipeline.
vocab_length_with_draw = len(all_moves_with_draw)
vocab_length_with_draw

In [17]:
# Largest 'turns' value over all games; used as the padded sequence length.
max_moves = data['turns'].max()
max_moves

349

In [18]:
# Show the vocabulary size and the padded sequence length, one per line.
print(vocab_length, max_moves, sep='\n')

4373
349


In [133]:
# Map each move token to an integer index, fitted on the decisive-game moves.
#
# filters='' and lower=False keep every space-separated token exactly as it is
# written. The Tokenizer defaults strip punctuation such as '+', '#', '=' and '-'
# and lowercase the text, which would merge distinct tokens and split tokens
# containing '-' into several pieces, so the fitted vocabulary would no longer
# match `vocab_length` (computed with a plain split on ' ').
# NOTE(review): assumes the move strings are algebraic notation in which case
# and those symbols are significant (e.g. 'Bxc4' vs 'bxc4', 'O-O') - confirm.
#
# num_words caps the emitted indices below `vocab_length` (index 0 is reserved
# for padding), so they stay inside an Embedding with input_dim=vocab_length.
tokenizer = Tokenizer(num_words=vocab_length, filters='', lower=False)
tokenizer.fit_on_texts(moves)

sequences = tokenizer.texts_to_sequences(moves)
word_index = tokenizer.word_index

# Pad (or truncate) every game to `max_moves` tokens.
model_inputs = pad_sequences(sequences, maxlen=max_moves)

# Persist the fitted tokenizer; the context manager guarantees the file is
# flushed and closed. The 'trained_models' directory must already exist.
with open('trained_models/tokenizer.pkl', 'wb') as tokenizer_file:
    pkl.dump(tokenizer, tokenizer_file)

In [160]:
# The padded inputs and the labels must have the same number of rows.
model_inputs.shape, labels.shape

((18711, 349), (18711,))

In [157]:
# Token -> integer index mapping for the three-class pipeline, fitted on the
# moves of all games (draws included).
#
# filters='' and lower=False keep every space-separated token exactly as it is
# written. The Tokenizer defaults strip punctuation such as '+', '#', '=' and '-'
# and lowercase the text, which would merge distinct tokens and split tokens
# containing '-' into several pieces.
# NOTE(review): assumes the move strings are algebraic notation in which case
# and those symbols are significant (e.g. 'Bxc4' vs 'bxc4', 'O-O') - confirm.
tokenizer_with_draw = Tokenizer(num_words=vocab_length_with_draw, filters='', lower=False)
tokenizer_with_draw.fit_on_texts(moves_with_draw)

sequences_with_draw = tokenizer_with_draw.texts_to_sequences(moves_with_draw)
word_index_with_draw = tokenizer_with_draw.word_index

# Pad (or truncate) every game to `max_moves` tokens.
model_inputs_with_draw = pad_sequences(sequences_with_draw, maxlen=max_moves)

# This tokenizer is not persisted. If it needs to be saved, dump
# `tokenizer_with_draw` to its own file rather than 'trained_models/tokenizer.pkl',
# which holds the tokenizer of the binary model.

In [161]:
# The padded inputs and the labels must have the same number of rows.
model_inputs_with_draw.shape, labels_with_draw.shape

((19629, 349), (19629,))

In [21]:
# Hold out 20% of the decisive games; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    labels,
    test_size=0.2,
    random_state=42,
)

In [162]:
# Hold out 20% of all games (draws included); the fixed seed makes the split repeatable.
(
    X_train_with_draw,
    X_test_with_draw,
    y_train_with_draw,
    y_test_with_draw,
) = train_test_split(
    model_inputs_with_draw,
    labels_with_draw,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Re-display the padded sequence length that the models below take as input_length.
max_moves

349

In [164]:
# Binary winner classifier: token embedding -> single GRU layer -> sigmoid unit
# (output near 1 means white, matching the encoding of `labels`).
model = Sequential([
    Embedding(input_dim=vocab_length, output_dim=256, input_length=max_moves),
    GRU(units=256),
    Dense(1, activation='sigmoid'),
])

In [165]:
# Train the binary classifier with Adam and binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# `callbacks` is documented as a list of Callback instances, so the callback is
# wrapped in a list instead of being passed bare.
# NOTE(review): the test split doubles as validation data here, so the val_*
# metrics are not an independent estimate once they influence training choices.
# NOTE(review): with its default arguments ReduceLROnPlateau waits several
# stagnant epochs before acting, so it is presumably a no-op in a 3-epoch run
# (the logged lr stays at 0.0010) - confirm the intended patience.
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=2,
    callbacks=[ReduceLROnPlateau()],
)

Epoch 1/3
468/468 - 286s - loss: 0.6977 - accuracy: 0.6941 - val_loss: 0.5533 - val_accuracy: 0.7168 - lr: 0.0010 - 286s/epoch - 611ms/step
Epoch 2/3
468/468 - 246s - loss: 0.4885 - accuracy: 0.7656 - val_loss: 0.4474 - val_accuracy: 0.7943 - lr: 0.0010 - 246s/epoch - 527ms/step
Epoch 3/3
468/468 - 274s - loss: 0.3454 - accuracy: 0.8489 - val_loss: 0.3268 - val_accuracy: 0.8587 - lr: 0.0010 - 274s/epoch - 586ms/step


In [30]:
# Loss and accuracy of the trained binary model on the held-out split.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)



In [131]:
# Persist the trained binary model. The context manager guarantees the file is
# flushed and closed. The 'trained_models' directory must already exist.
# NOTE(review): pickling a Keras model depends on the installed Keras version
# supporting it; the native `model.save(...)` format is the more portable
# choice if the consumer of 'model.pkl' can be changed. Only unpickle this file
# from trusted sources - pickle.load can execute arbitrary code.
with open('trained_models/model.pkl', 'wb') as model_file:
    pkl.dump(model, model_file)

In [163]:
# Three-class classifier (0, 1, 2 as encoded in `labels_with_draw`):
# token embedding -> single GRU layer -> 3-way softmax.
model_with_draw = Sequential([
    Embedding(input_dim=vocab_length_with_draw, output_dim=256, input_length=max_moves),
    GRU(units=256),
    Dense(3, activation='softmax'),
])

In [None]:
# Train the three-class model; the sparse loss takes the integer class labels
# directly (no one-hot encoding of the target needed).
model_with_draw.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# `callbacks` is documented as a list of Callback instances, so the callback is
# wrapped in a list instead of being passed bare.
# NOTE(review): the test split doubles as validation data here, so the val_*
# metrics are not an independent estimate once they influence training choices.
history_with_draw = model_with_draw.fit(
    X_train_with_draw,
    y_train_with_draw,
    epochs=3,
    batch_size=32,
    validation_data=(X_test_with_draw, y_test_with_draw),
    verbose=2,
    callbacks=[ReduceLROnPlateau()],
)