In [23]:
import pandas as pd
import skrub
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#from xgboost import XGBRegressor
from math import sqrt

from sklearn.preprocessing import OneHotEncoder

# Read the three CSV inputs, in the same order as the names they are bound to.
games_df, turns_df, data_df = (
    pd.read_csv(csv_name) for csv_name in ("games.csv", "turns.csv", "train.csv")
)

# Wrap each frame in a named skrub variable so later cells can build on it.
games = skrub.var("games", games_df)
turns = skrub.var("turns", turns_df)

# The training table is additionally marked as X and given a 100-row subsample.
data = (
    skrub.var("data", data_df)
    .skb.mark_as_X()
    .skb.subsample(n=100)
)


In [5]:
def sum_first_five(series):
    """Sum at most five values of ``series``, read from its end backwards.

    The values are reversed before the first five are taken, so in the
    series' own ordering these are its *last* five entries (presumably the
    turns are stored latest-first, which would make them the first five
    turns played — TODO confirm against the data).

    A series with fewer than five entries is summed in full; an empty one
    yields 0.
    """
    total = 0
    for value in series.values[::-1][:5]:
        total = total + value
    return total


def replace_winner(row, data):
    """Return a copy of ``row`` whose ``winner`` entry encodes the game outcome.

    ``winner`` is set to 1 if the player won, -1 if they lost, and 0 if the
    game was a draw or the outcome cannot be determined.

    Parameters
    ----------
    row : pandas.Series
        One player's record; must provide ``game_id``, ``nickname`` and
        ``score``.
    data : pandas.DataFrame
        Table searched for the opponent; must provide the same three columns.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``winner`` overwritten. The input is left
        untouched because ``DataFrame.apply`` does not support functions that
        mutate the object they are given.
    """
    # The opponent is every row of the same game played under another nickname.
    same_game = data.game_id == row.loc["game_id"]
    other_player = data.nickname != row.loc["nickname"]
    opponent_scores = data.loc[same_game & other_player, "score"].values

    own_score = row.loc["score"]
    if len(opponent_scores) == 0:
        # Without an opponent row (e.g. when `data` is a subsample) there is
        # nothing to compare against. `.all()` over an empty comparison is
        # vacuously True and would wrongly report a win, so fall back to 0.
        outcome = 0
    elif (own_score > opponent_scores).all():
        outcome = 1
    elif (own_score < opponent_scores).all():
        outcome = -1
    else:
        outcome = 0

    row = row.copy()
    row.loc["winner"] = outcome
    return row


def replace_first(row):
    """Return a copy of ``row`` with ``first`` turned into a 0/1 flag.

    ``first`` becomes 1 if it equals the row's ``nickname`` (the player went
    first in their game) and 0 otherwise (they went second).

    Parameters
    ----------
    row : pandas.Series
        One player's record; must provide ``first`` and ``nickname``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``first`` overwritten. The input is left
        untouched because ``DataFrame.apply`` does not support functions that
        mutate the object they are given.
    """
    went_first = row.loc["first"] == row.loc["nickname"]

    row = row.copy()
    row.loc["first"] = 1 if went_first else 0
    return row


def relabel_values(data):
    """Build a row-wise relabelling function bound to ``data``.

    The returned callable takes one row, rewrites its ``winner`` entry via
    ``replace_winner`` (which looks up the opponent in ``data``) and then its
    ``first`` entry via ``replace_first``, and returns the resulting row.
    """
    def relabel(row):
        return replace_first(replace_winner(row, data))

    return relabel

In [6]:
nominal_features = [
    "time_control_name",
    "game_end_reason",
    "lexicon",
    "rating_mode",
]
non_informative_features = [
    "created_at",
    "nickname",
    "game_id",
]

# Every aggregate below is computed per (game, player) pair.
player_keys = ["game_id", "nickname"]

total_turns = turns.groupby(player_keys)["turn_number"].count()
max_points = turns.groupby(player_keys)["points"].max()
min_points = turns.groupby(player_keys)["points"].min()
first_five_turn_point_sum = turns.groupby(player_keys)["points"].agg(sum_first_five)

In [15]:
# Start from the turn counts and attach the other per-player aggregates.
# All four aggregates come from a groupby on the same keys, so after
# reset_index() their rows are matched by position.
game_player_data = (
    total_turns.reset_index()
    .rename(columns={"turn_number": "total_turns"})
    .assign(
        first_five_turns_points=first_five_turn_point_sum.reset_index()["points"],
        max_points_turn=max_points.reset_index()["points"],
        min_points_turn=min_points.reset_index()["points"],
    )
)

# Spread between the best and the worst scoring turn.
game_player_data = game_player_data.assign(
    max_min_difference=(
        game_player_data["max_points_turn"] - game_player_data["min_points_turn"]
    )
)

# Bring in the game-level columns, looked up by game_id.
game_player_data = game_player_data.join(
    games.set_index("game_id"), on="game_id", how="left"
)

# Game duration expressed as a ratio of the initial time.
game_player_data = game_player_data.assign(
    time_used=(
        game_player_data["game_duration_seconds"]
        / game_player_data["initial_time_seconds"]
    )
)

In [18]:
# Attach the per-player game features to the training rows.
merge_keys = ["game_id", "nickname"]
data = data.merge(
    game_player_data,
    left_on=merge_keys,
    right_on=merge_keys,
    how="left",
)

# Ratio features derived from the merged columns.
data = data.assign(
    points_per_turn=data["score"] / data["total_turns"],
    points_per_second=data["score"] / data["game_duration_seconds"],
    # game_player_data already carries a time_used column built from the same
    # two columns; it is recomputed here from the merged frame.
    time_used=data["game_duration_seconds"] / data["initial_time_seconds"],
)

In [20]:
# Relabel `winner` and `first` row by row. The whole frame is handed to
# relabel_values so that each row can look up its opponent in it.
X = data.skb.apply_func(lambda df: df.apply(relabel_values(df), axis=1))

# Split off the target without mutating anything: `pop` would remove the
# column in place from the frame produced by the step above, and in-place
# modification of intermediate results is not supported by skrub's lazily
# evaluated DataOps.
# NOTE(review): y is not marked with `.skb.mark_as_y()` — confirm whether a
# later cell relies on that.
y = X["rating"]
X = X.drop(columns="rating")

In [26]:
from sklearn.compose import ColumnTransformer

# Columns handed to the next step unchanged, in this order.
pass_through_features = [
    "score",
    "total_turns",
    "first_five_turns_points",
    "max_points_turn",
    "min_points_turn",
    "max_min_difference",
    "first",
    "winner",
    "initial_time_seconds",
    "increment_seconds",
    "max_overtime_minutes",
    "game_duration_seconds",
    "time_used",
    "points_per_turn",
    "points_per_second",
]
# Categorical columns that get one-hot encoded.
nominal_features = [
    "time_control_name",
    "game_end_reason",
    "lexicon",
    "rating_mode",
]

# NOTE(review): the first transformer is registered as "ordinal" although it
# is a OneHotEncoder. The name is kept as-is because it is part of the
# transformer's runtime configuration and may show up in output column names.
encoder = ColumnTransformer(
    [
        (
            "ordinal",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            nominal_features,
        ),
        ("passthrough", "passthrough", pass_through_features),
    ]
)

X = X.skb.apply(encoder)
