In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the data
df = pd.read_csv("game_details_opening_1990_ratings_included.csv")
display(df.head(20))







Unnamed: 0,id,White,Black,Result,Event,Date,Moves,white_rating,black_rating
0,1,?,?,0-1,?,1990-01-01,1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-...,,
1,2,"Jorgensen, Jorgen","Christensen, Niels",1-0,DEN-chT 8990,1990-01-01,1. Nf3 Nf6 2. c4 c6 3. g3 d5 4. Bg2 e6 5. O-O ...,,
2,3,"Albrecht, Holger","Schmid, H.",1-0,OL Wuert 8990,1990-01-01,1. d4 d5 2. c4 c6 3. Nc3 Nf6 4. Nf3 e6 5. Bg5 ...,,
3,4,"Arnold, Thomas","Rabl, Joerg Stefan",1/2-1/2,OL Wuert 8990,1990-01-01,1. Nf3 Nf6 2. d4 d5 3. e3 c6 4. Bd3 Bg4 5. c4 ...,,
4,5,"Arnold, Thomas","Rohel, Markus",0-1,OL Wuert 8990,1990-01-01,1. Nf3 d6 2. d4 Bg4 3. e3 Nd7 4. Be2 c6 5. b3 ...,,
5,6,"Bantel, Thomas","Braeunlin, Klaus",1-0,OL Wuert 8990,1990-01-01,1. d4 { (kampflos) },,
6,7,"Bantel, Thomas","Carstens, Andreas",1/2-1/2,OL Wuert 8990,1990-01-01,1. d4 Nf6 2. c4 g6 3. Nc3 Bg7 4. e4 d6 5. Be2 ...,,
7,8,"Bantel, Thomas","Lach, Bernhard",0-1,OL Wuert 8990,1990-01-01,1. d4 d5 2. Nf3 Nc6 3. c4 e5 4. dxe5 d4 5. g3 ...,,
8,9,"Bantel, Thomas","Laengl, Juergen",1-0,OL Wuert 8990,1990-01-01,1. d4 d5 2. Nf3 Nf6 3. c4 c6 4. Nc3 dxc4 5. a4...,,
9,10,"Bantel, Thomas","Lenz, Juergen",1/2-1/2,OL Wuert 8990,1990-01-01,1. d4 Nf6 2. Nf3 d6 3. g3 Bf5 4. Bg2 c6 5. O-O...,,


In [None]:
# We have a DataFrame with NaN for several ratings , as it is not systematically populated...
# So we should remove correspondings rows for our ML model...
# And do the same for the 'Moves' column as well...
df_clean = df.dropna(subset=['white_rating', 'black_rating', 'Moves'])

# I thought that i have removed {} text already , so i do it again... 
df_clean['Moves'] = df_clean['Moves'].str.replace(r'\{.*?\}', '', regex=True).str.strip()

# We will remove Result later as we need it for computing target column ,
# (We want to predict the Result) And we also remove string columns.
df_clean = df_clean.drop(columns=['id', 'Event', 'Date', 'White', 'Black'])

display(df_clean.head(20))
display(len(df_clean)) # 16535

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Moves'] = df_clean['Moves'].str.replace(r'\{.*?\}', '', regex=True).str.strip()


Unnamed: 0,Result,Moves,white_rating,black_rating
12,1-0,1. e4 c6 2. d4 d5 3. Nd2 dxe4 4. Nxe4 Nf6 5. N...,2285.0,2405.0
13,0-1,1. e4 e5 2. Nf3 Nc6 3. Bb5 f5 4. Nc3 Nd4 5. Nx...,2285.0,2405.0
14,0-1,1. a4,2285.0,2405.0
15,0-1,1. e4 e5 2. Nf3 Nc6 3. Bb5 Nf6 4. d3 Nd4 5. Nx...,2285.0,2405.0
22,1/2-1/2,1. c4 f5 2. g3 Nf6 3. Bg2 g6 4. Nc3 Bg7 5. e3 ...,2275.0,2255.0
23,1-0,1. c4 e5 2. Nc3 d6 3. g3 f5 4. Bg2 Nf6 5. e3 B...,2275.0,2255.0
24,1/2-1/2,1. c4 Nf6 2. Nc3 e6 3. Nf3 c5 4. g3 Nc6 5. Bg2...,2275.0,2255.0
31,1-0,1. Nf3 Nc6 2. d4 d5 3. c4 Bg4 4. Nc3 dxc4 5. d...,2335.0,2365.0
32,1-0,1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Qb6 5. ...,2335.0,2365.0
33,1/2-1/2,1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-...,2335.0,2365.0


16535

In [3]:
# We add the 'target' column , and remove the 'Result' column :
df_clean['target'] = df_clean['Result'].apply(lambda x: 1 if x == '1-0' else (0 if x == '1/2-1/2' else -1))
df_clean = df_clean.drop(columns=['Result'])

In [4]:
df_clean.dtypes

Moves            object
white_rating    float64
black_rating    float64
target            int64
dtype: object

In [5]:
df_clean.head()

Unnamed: 0,Moves,white_rating,black_rating,target
12,1. e4 c6 2. d4 d5 3. Nd2 dxe4 4. Nxe4 Nf6 5. N...,2285.0,2405.0,1
13,1. e4 e5 2. Nf3 Nc6 3. Bb5 f5 4. Nc3 Nd4 5. Nx...,2285.0,2405.0,-1
14,1. a4,2285.0,2405.0,-1
15,1. e4 e5 2. Nf3 Nc6 3. Bb5 Nf6 4. d3 Nd4 5. Nx...,2285.0,2405.0,-1
22,1. c4 f5 2. g3 Nf6 3. Bg2 g6 4. Nc3 Bg7 5. e3 ...,2275.0,2255.0,0


In [6]:
len(df_clean)

16535

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Extract the first moves (e.g., first 8 words to capture the initial 4 moves) from the Moves column
df_clean['opening'] = df_clean['Moves'].apply(lambda x: ' '.join(x.split()[:8]) if isinstance(x, str) else '')

# Vectorize the initial moves using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_opening = vectorizer.fit_transform(df_clean['opening'])

# Calculate the ELO difference as a new feature
df_clean['elo_difference'] = df_clean['white_rating'] - df_clean['black_rating']

# Construct the training matrix by combining the opening vectors with other numeric features
X = pd.concat([
    pd.DataFrame(X_opening.toarray()),  # vectorized opening moves
    df_clean[['white_rating', 'black_rating', 'elo_difference']].reset_index(drop=True)  # other numeric features
], axis=1)

# Convert all column names to strings to avoid errors in the model
X.columns = X.columns.astype(str)
y = df_clean['target']  # Target variable (1 for White win, -1 for Black win, 0 for draw)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's accuracy
#accuracy = accuracy_score(y_test, y_pred)
#print(f"Model accuracy: {accuracy * 100:.2f}%")


Model accuracy: 37.62%


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Evaluate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")



Accuracy: 37.62%
Precision: 0.37
Recall: 0.38
F1 Score: 0.37
Confusion Matrix:
[[199 278 308]
 [218 486 527]
 [262 470 559]]


In [8]:
df.columns

Index(['id', 'White', 'Black', 'Result', 'Event', 'Date', 'Moves',
       'white_rating', 'black_rating'],
      dtype='object')

In [9]:
df.head(20)

Unnamed: 0,id,White,Black,Result,Event,Date,Moves,white_rating,black_rating
0,1,?,?,0-1,?,1990-01-01,1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-...,,
1,2,"Jorgensen, Jorgen","Christensen, Niels",1-0,DEN-chT 8990,1990-01-01,1. Nf3 Nf6 2. c4 c6 3. g3 d5 4. Bg2 e6 5. O-O ...,,
2,3,"Albrecht, Holger","Schmid, H.",1-0,OL Wuert 8990,1990-01-01,1. d4 d5 2. c4 c6 3. Nc3 Nf6 4. Nf3 e6 5. Bg5 ...,,
3,4,"Arnold, Thomas","Rabl, Joerg Stefan",1/2-1/2,OL Wuert 8990,1990-01-01,1. Nf3 Nf6 2. d4 d5 3. e3 c6 4. Bd3 Bg4 5. c4 ...,,
4,5,"Arnold, Thomas","Rohel, Markus",0-1,OL Wuert 8990,1990-01-01,1. Nf3 d6 2. d4 Bg4 3. e3 Nd7 4. Be2 c6 5. b3 ...,,
5,6,"Bantel, Thomas","Braeunlin, Klaus",1-0,OL Wuert 8990,1990-01-01,1. d4 { (kampflos) },,
6,7,"Bantel, Thomas","Carstens, Andreas",1/2-1/2,OL Wuert 8990,1990-01-01,1. d4 Nf6 2. c4 g6 3. Nc3 Bg7 4. e4 d6 5. Be2 ...,,
7,8,"Bantel, Thomas","Lach, Bernhard",0-1,OL Wuert 8990,1990-01-01,1. d4 d5 2. Nf3 Nc6 3. c4 e5 4. dxe5 d4 5. g3 ...,,
8,9,"Bantel, Thomas","Laengl, Juergen",1-0,OL Wuert 8990,1990-01-01,1. d4 d5 2. Nf3 Nf6 3. c4 c6 4. Nc3 dxc4 5. a4...,,
9,10,"Bantel, Thomas","Lenz, Juergen",1/2-1/2,OL Wuert 8990,1990-01-01,1. d4 Nf6 2. Nf3 d6 3. g3 Bf5 4. Bg2 c6 5. O-O...,,
