In [None]:
import sys
sys.path.append('..')

import pandas as pd
import sqlite3
import numpy as np
from typing import Any, Dict, List, Tuple
from itertools import chain
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from util.constants import *
from util.common import *

from collections import Counter
import numpy as np
import pandas as pd
from sklearn.utils import compute_class_weight
from typing import Any, Dict, List, Optional, Tuple

import tensorflow as tf

In [None]:
# Connect to the SQLite database. The URI form is used so the access mode can
# be given explicitly (`mode=rw`).
conn = sqlite3.connect("file:../data/db/soccer.db?mode=rw", uri=True)
cur = conn.cursor()

In [None]:
# Define constants
_seed = 99
_timesteps = 10
_epochs = 2
seasons = list(range(2010, 2023))  # 2010 .. 2022 inclusive
_features = FEATURES_COMMON + FEATURES_WD
# One independent [0, 1] min-max scaler per scalable feature that is actually used.
_scalers = dict(
    (f, MinMaxScaler(feature_range=(0, 1)))
    for f in FEATURES_TO_SCALE
    if f in _features
)


In [None]:
def query_teams_names() -> pd.DataFrame:
    """
    Read the names of all teams from the database.

    Uses the module-level ``conn`` connection; rows come back ordered by name.

    :return: DataFrame with one ``name`` column, one row per team.
    """
    sql = """
            SELECT t.name  
            FROM Teams t
            ORDER BY t.name
            """
    teams_names = pd.read_sql(sql, conn)
    return teams_names

# Query fixtures data
def query_fixtures_data(seasons: List[int]) -> pd.DataFrame:
    """
    Load one row per fixture for the given seasons, with both teams' names,
    the double-chance odds and the home/away team statistics joined in.

    Uses the module-level ``conn`` connection. Rows are ordered by date, then id.

    :param seasons: Seasons to load; each one is bound as a query parameter.
    :return: Fixtures dataframe with ``home_*`` / ``away_*`` prefixed stat columns.
    """
    # One "?" placeholder per season, so the values are bound rather than interpolated.
    placeholders = ",".join("?" * len(seasons))
    sql = """
            SELECT f.id, f.date, f.season, f.league, 
                   t1.name AS home, t2.name AS away, f.home_goals, f.away_goals, 
                   f.oddsDC_1X AS home_odds_wd, f.oddsDC_X2 AS away_odds_wd,
                   ts1.rating AS home_rating, ts2.rating AS away_rating,
                   ts1.errors AS home_errors, ts2.errors AS away_errors, 
                   ts1.red_cards AS home_red_cards, ts2.red_cards AS away_red_cards,
                   ts1.shots AS home_shots, ts2.shots AS away_shots,
                   ts1.opponent_goals AS home_opponent_goals, ts1.opponent_shots AS home_opponent_shots,
                   ts1.opponent_rating AS home_opponent_rating, ts2.opponent_rating AS away_opponent_rating,
                   ts2.opponent_goals As away_opponent_goals, ts2.opponent_shots AS away_opponent_shots 
            FROM Fixtures f
            JOIN Teams t1 ON f.homeTeamID = t1.id
            JOIN Teams t2 ON f.awayTeamID = t2.id
            JOIN TeamStats ts1 ON f.homeStatsID = ts1.id
            JOIN TeamStats ts2 ON f.awayStatsID = ts2.id
            WHERE f.season IN ({})
            ORDER BY f.date, f.id
            """.format(placeholders)

    return pd.read_sql(sql, conn, params=seasons)

def query_team_data(seasons: List[int], params: Tuple[Any, ...]) -> pd.DataFrame:
    """
    Load the fixtures of one team, with that team's own statistics, for the given seasons.

    Uses the module-level ``conn`` connection. Rows are ordered by date, then id.

    :param seasons: Seasons to query; only its length is used here, to size the
        ``IN (...)`` placeholder list.
    :param params: Values bound to the query placeholders, in order: the team id
        three times (stats owner, home side, away side), followed by the seasons.
    :return: Team fixtures dataframe.
    """
    season_placeholders = ",".join("?" * len(seasons))
    sql = """
            SELECT f.id, f.date, f.season, f.league, f.homeTeamID, f.awayTeamID,
                   t1.name AS home, t2.name AS away, f.home_goals, f.away_goals, f.winner,
                   ts.rating, ts.goals, ts.errors, ts.red_cards, ts.shots, f.oddsDC_1X, f.oddsDC_X2,
                   ts.opponent_goals, ts.opponent_shots, ts.opponent_rating 
            FROM TeamStats ts
            JOIN Fixtures f ON f.id = ts.fixtureID 
            JOIN Teams t1 ON f.homeTeamID = t1.id
            JOIN Teams t2 ON f.awayTeamID = t2.id
            WHERE ts.teamID = ? AND (f.homeTeamID = ? OR f.awayTeamID = ?) AND
                  f.season IN ({})
            ORDER BY f.date, f.id
            """.format(season_placeholders)

    return pd.read_sql(sql, conn, params=params)

def query_teams_ids_names_tuples() -> Dict[str, int]:
    """
    Build a lookup from team name to team id.

    Despite the name, the result is a dict, not a list of tuples. Uses the
    module-level ``conn`` connection.

    :return: Mapping ``{team name: team id}`` for every row of ``Teams``.
    """
    teams = pd.read_sql("""
            SELECT t.id, t.name
            FROM Teams t
            ORDER BY t.id
            """, conn)

    return {name: team_id for name, team_id in zip(teams["name"], teams["id"])}

# Check for missing columns
def _check_missing_columns(df: pd.DataFrame) -> None:
    """
    Verify that the dataframe contains every column listed in REQUIRED_COLUMNS.

    :param df: Dataframe to validate.
    :raises ValueError: If at least one required column is absent. The message
        names the missing columns and lists the present and the required ones.
    """
    missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
    if missing:
        # The message parts are separated explicitly; adjacent string literals
        # are concatenated without any whitespace.
        raise ValueError("Missing columns in dataset. "
                         f"Missing: {missing}. "
                         f"Columns: {list(df.columns)}. "
                         f"Required: {list(REQUIRED_COLUMNS)}.")

# Check for missing values
def _check_nan_values(df: pd.DataFrame, teams_fixtures_ids: Dict[str, List[int]]) -> pd.DataFrame:
    """
    Reject datasets with missing values.

    Each team's last fixture is treated as a target row: only the columns in
    REQUIRED_TARGET_COLUMNS must be present there. Every other row must be
    complete in all columns.

    :param df: Fixtures dataframe with an ``id`` column.
    :param teams_fixtures_ids: Fixture ids per team; the last id of each list
        marks that team's target row.
    :return: The unchanged input dataframe.
    :raises ValueError: If a non-target row has any missing value, or a target
        row has a missing value in a required target column.
    """
    # Teams with an empty fixture list have no last match; skipping them avoids
    # an IndexError on ids[-1].
    last_ids = {ids[-1] for ids in teams_fixtures_ids.values() if ids}

    is_last = df["id"].isin(last_ids)
    df_except_lastid = df[~is_last]
    df_lastid_only = df[is_last][list(REQUIRED_TARGET_COLUMNS)]

    # Check missing values in data except for teams' last matches (targets)
    if df_except_lastid.isna().any().any():
        print("Missing values found in the dataset:")
        print(df_except_lastid.isna().sum())
        raise ValueError("Dataset contains some missing values.")

    # Check if last matches (targets) contain any nans for required columns
    if df_lastid_only.isna().any().any():
        print(df_lastid_only.isna().sum())
        raise ValueError("Dataset contains some missing target values.")

    return df


def _mask_out_dataset(df: pd.DataFrame, _train_fixtures_ids, _test_fixtures_ids, _predict_fixtures_ids) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        mask_train = df["id"].isin(set(chain(*_train_fixtures_ids.values())))
        mask_test = df["id"].isin(set(chain(*_test_fixtures_ids.values())))
        mask_predict = df["id"].isin(set(chain(*_predict_fixtures_ids.values())))

        df_train = df[mask_train & ~mask_test & ~mask_predict].copy()
        df_test = df[mask_test & ~mask_predict].copy()
        df_predict = df[mask_predict].copy()

        print(f"Train dataset total samples: {len(df_train)}")
        print(f"Test dataset total samples: {len(df_test)}")
        print(f"Predict dataset total samples: {len(df_predict)}")

        # Additional argument checking which can be done only after computing individual datasets.
        # Check if datasets are empty (there is a check during parsing arguments that test/predict
        # split samples must be at least 1, but some dataset may become empty in the end due to some
        # restrictions and filtering).
        emsg = "Maybe try to specify the split of test and/or prediction samples more reasonably?"
        if df_train.empty:
            raise ValueError(f"Train dataset is empty. {emsg}")
        if df_test.empty:
            raise ValueError(f"Test dataset is empty. {emsg}")
        if df_predict.empty:
            raise ValueError(f"Predict dataset is empty. {emsg}")
        # Check if datasets are too large (e.g. predict dataset is larger than the rest, etc.)
        if len(df_predict) > len(df_test) + len(df_train):
            raise ValueError(f"Number of samples in predict dataset is too large. {emsg}")
        if len(df_test) > len(df_train):
            raise ValueError(f"Number of samples in test dataset is too large. {emsg}")

        return df_train, df_test, df_predict


def fit_scalers(df: pd.DataFrame) -> None:
    """
    Fit every scaler in ``_scalers`` on the distinct values of its feature.

    ``season`` is fit on the ``season`` column. Any other feature listed in
    FEATURES_TO_SCALE is fit on the pooled ``home_<feature>`` and
    ``away_<feature>`` columns, so both sides share one scale. Remaining
    features are fit on the column of the same name.

    :param df: Dataframe on which scalers should be fit.
    """
    for name, scaler in _scalers.items():
        if name != "season" and name in FEATURES_TO_SCALE:
            both_sides = np.concatenate((df[f"home_{name}"].values, df[f"away_{name}"].values))
            distinct = np.unique(both_sides)
        else:
            distinct = df[name].unique()
        # The scalers expect a 2-D (n_samples, 1) array.
        scaler.fit(distinct.reshape(-1, 1))

def get_fixtures_ids_from_df(df: pd.DataFrame, team: str) -> List[int]:
    """
    List the ids of all fixtures in which the team plays, home or away.

    :param df: Fixtures dataframe with ``id``, ``home`` and ``away`` columns.
    :param team: Team name to look for.
    :return: Fixture ids in the dataframe's row order.
    """
    involves_team = (df["home"] == team) | (df["away"] == team)
    return df.loc[involves_team, "id"].tolist()

def _fit_teams_names_lencoder() -> LabelEncoder:
    """
    Create a LabelEncoder fit on all team names in the database.

    An empty string is added to the fitted values so that a missing or unknown
    team name can be encoded as well.

    :return: Fit LabelEncoder.
    """
    names = query_teams_names()["name"].values.tolist()
    names.insert(0, "")
    encoder = LabelEncoder()
    encoder.fit(names)
    return encoder

def _transform_team_name(team_name: str) -> np.ndarray:
    """
    Encode a team name as a fixed-width vector of binary digits.

    The name is mapped to its integer label and that label is written in
    binary, padded to ``len(classes_).bit_length()`` digits.

    The label encoder is fit on first use and cached on the function object.
    This function is applied once per dataframe row, and refitting each time
    would query the database again for every row. Teams added to the database
    after the first call are not seen until the function is redefined.

    :param team_name: Team name; must be one of the encoder's classes.
    :return: 1-D integer array of 0/1 digits, most significant bit first.
    """
    lenc = getattr(_transform_team_name, "_lenc_cache", None)
    if lenc is None:
        lenc = _fit_teams_names_lencoder()
        _transform_team_name._lenc_cache = lenc

    label = int(lenc.transform([team_name])[0])
    width = len(lenc.classes_).bit_length()
    return np.array(list(np.binary_repr(label, width=width)), dtype=int)


def _scale_team_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode team-name columns and min-max scale numeric feature columns.

    Columns in FEATURES_TO_LENC are replaced by their binary vector encoding.
    Every feature with a scaler in ``_scalers`` is scaled, and so is its
    ``future_<feature>`` column when present. Null cells are left untouched.
    The dataframe is modified in place and also returned.

    :param df: Per-team dataframe; must contain all FEATURES_TO_LENC columns
        and all scaled feature columns.
    :return: The same dataframe object, transformed.
    """
    for f in FEATURES_TO_LENC:
        nonnull_index = df[f].notnull()
        df.loc[nonnull_index, f] = df.loc[nonnull_index, f].apply(_transform_team_name)

    for f, scaler in _scalers.items():
        # The feature column itself is mandatory; its "future_" twin is optional
        # and is scaled with the same scaler.
        columns = [f]
        futuref = f"future_{f}"
        if futuref in df.columns:
            columns.append(futuref)

        for column in columns:
            nonnull_index = df[column].notnull()
            # The scaler rejects an input with zero rows, so a column that is
            # entirely null is skipped.
            if not nonnull_index.any():
                continue
            # Scaled values are floats; converting first avoids writing floats
            # into an integer column (e.g. season).
            df[column] = df[column].astype(float)
            scaled = scaler.transform(df.loc[nonnull_index, column].values.reshape(-1, 1))
            # transform() returns shape (n, 1); flatten it for the column assignment.
            df.loc[nonnull_index, column] = scaled.ravel()

    return df

def load_and_process_team_data(dataset: Dataset, teamid: int, team_fixtures_idx: List[int]) -> pd.DataFrame:
    """
    Load one team's fixtures and derive the per-team feature and target columns.

    Steps:
      1. Query the team's fixtures for the module-level ``seasons``.
      2. Derive columns from the team's point of view: ``wd``, ``odds_wd``,
         ``team``, ``opponent``, ``ashome``, plus a numeric ``league``.
      3. Create ``future_*`` columns by shifting the known-in-advance features
         up one row, so each row carries its next match's values.
      4. Keep only the fixtures in ``team_fixtures_idx`` and reset the index.
      5. For the train split only, fill the remaining gaps.
      6. Encode and scale through ``_scale_team_data``.

    :param dataset: Split being prepared; only ``Dataset.Train`` triggers gap filling.
    :param teamid: Database id of the team.
    :param team_fixtures_idx: Fixture ids to keep for this split.
    :return: Processed dataframe with a fresh 0..n-1 index.
    """
    df = query_team_data(seasons, params=(teamid, teamid, teamid, *seasons))

    is_home = df["homeTeamID"] == teamid
    is_away = df["awayTeamID"] == teamid

    def _by_side(home_choice: Any, away_choice: Any) -> np.ndarray:
        # Pick a value by the side the team plays on; NaN if it is on neither.
        return np.select([is_home, is_away], [home_choice, away_choice], default=np.nan)

    # Current win-or-draw target from team's POV.
    # Presumably winner: 1 = home side won, 2 = away side won, 0 = draw.
    home_side_won = df["winner"] == 1
    away_side_won = df["winner"] == 2
    df["wd"] = np.select(
        condlist=[
            is_home & home_side_won,
            is_home & away_side_won,
            is_away & away_side_won,
            is_away & home_side_won,
            df["winner"] == 0,
        ],
        choicelist=[1, 0, 1, 0, 1],
        default=np.nan,
    )
    # Odds for wd from team's POV
    df["odds_wd"] = _by_side(df["oddsDC_1X"], df["oddsDC_X2"])
    # `team` is just name of current team
    df["team"] = _by_side(df["home"], df["away"])
    # `opponent` is the opposite team to `team`
    df["opponent"] = _by_side(df["away"], df["home"])
    # Whether the current team plays as home = 1, or away = 0
    df["ashome"] = _by_side(1, 0)
    # Encode league: PL = 1, CH = 0
    df["league"] = np.select(
        condlist=[df["league"] == "PL", df["league"] == "CH"],
        choicelist=[1, 0],
        default=np.nan,
    )

    # Create future values of features which are known in advance, by shifting
    # the original values up one row. The shift is done before the filtering
    # below, so the last kept row still sees its real next match.
    for column in ("season", "league", "ashome", "opponent", "wd", "odds_wd"):
        df[f"future_{column}"] = df[column].shift(-1)

    # Keep only the matches of the current split. The index is reset so that
    # rows can be addressed by position 0..n-1 afterwards.
    df = df.loc[df["id"].isin(team_fixtures_idx)].copy()
    df = df.reset_index(drop=True)

    # Train-split gaps (a team with no later matches) are forward-filled. The
    # filled values are not real data, but those rows are never used for testing.
    if dataset == Dataset.Train:
        # Fill the future opponent with an empty string first, so that forward
        # filling does not copy a previous opponent's name into it.
        # Plain assignment replaces the chained `fillna(..., inplace=True)`,
        # and `ffill()` replaces the deprecated `fillna(method="ffill")`.
        df["future_opponent"] = df["future_opponent"].fillna("")
        df = df.ffill()

    df = _scale_team_data(df)

    return df

def align_fixtures_ids(df: pd.DataFrame,
                       team: str,
                       fixtures_ids: List[int],
                       _timesteps: int) -> List[int]:
    """
    Prepend the team's most recent fixtures from ``df`` to ``fixtures_ids``.

    The prepended ids give a later split enough history to build its first
    input sequence.

    :param df: Dataframe holding the earlier fixtures, in chronological order.
    :param team: Team name.
    :param fixtures_ids: Fixture ids of the split being aligned.
    :param _timesteps: Number of earlier fixtures to prepend; fewer are used if
        the team has fewer. Zero or negative prepends nothing.
    :return: New list with the history ids followed by ``fixtures_ids``.
    """
    # Without this guard, `[-0:]` would select the whole history instead of nothing.
    if _timesteps <= 0:
        return list(fixtures_ids)

    team_matches = df[(df["home"] == team) | (df["away"] == team)]
    history_ids = team_matches["id"].iloc[-_timesteps:].tolist()

    return history_ids + list(fixtures_ids)


##### 神经网络

In [None]:
import numpy as np
from typing import Any, Dict, List, Tuple

import tensorflow as tf

import keras.backend as K
from keras.initializers import glorot_uniform
from keras.layers import Input, concatenate, Dense, LSTM, Layer
from keras.losses import SparseCategoricalCrossentropy
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

class SPNetwork:
    """
    Two-headed recurrent classifier built for a single team.

    Each head concatenates its feature inputs and feeds them through an LSTM
    followed by a 15-unit ELU dense layer. The two heads are then concatenated
    and classified by a 2-way softmax output layer.
    """

    def __init__(self, team_name: str, target_team: bool, lenc_bitlen: int, lrdecay: float = 1.0) -> None:
        """

        :param team_name: Team's name the network is created for.
        :param target_team: Whether the team is in test set.
        :param lenc_bitlen: Bitlength needed to encode all teams names.
        :param lrdecay: Factor the learning rate is multiplied by on each
            ``decay_learning_rate`` call. The default 1.0 keeps it constant.
        """
        self._lenc_bitlen = lenc_bitlen
        self._features = FEATURES_COMMON + FEATURES_WD
        self._team_name = team_name
        self._model = None
        self._seed = _seed
        # Read by decay_learning_rate(); it was never initialised before, so
        # that method always failed with AttributeError.
        self._lrdecay = lrdecay

        # Store names of main head and head2 layers for direct access to layers
        self._main_head_layers_names = []
        self._main_head_stateful_layers_names = []
        self._head2_layers_names = []
        self._head2_stateful_layers_names = []

        # Dropout and lr can differ for teams' models which are not used in testing
        if target_team:
            self._lr = LR
            self._dropout = DROPOUT
        else:
            self._lr = NONTEST_LR
            self._dropout = NONTEST_DROPOUT

    def build(self) -> None:
        """Assemble and compile the Keras model."""
        self._model = self._assemble_network()

    def _build_head(self, prefix: str, inputs: List[Any]) -> Any:
        """
        Build one head: concatenated inputs -> LSTM -> Dense(15, elu).

        :param prefix: Layer-name prefix, ``"head1"`` or ``"head2"``.
        :param inputs: Input layers of this head.
        :return: Output tensor of the head's dense layer.
        """
        input_concat = concatenate(inputs=inputs, name=f"{prefix}_input_concat")
        rnn1 = LSTM(UNITS,
                    dropout=self._dropout,
                    recurrent_dropout=0.5,
                    stateful=STATEFUL,
                    return_sequences=False,
                    kernel_regularizer=l2(0.01),
                    kernel_initializer=glorot_uniform(self._seed),
                    name=f"{prefix}_rnn1")(input_concat)
        return Dense(15,
                     activation="elu",
                     kernel_regularizer=l2(0.01),
                     kernel_initializer=glorot_uniform(self._seed),
                     name=f"{prefix}_fc1")(rnn1)

    def _assemble_network(self) -> Model:
        """
        Create the inputs, both heads and the output layer, then compile the model.

        Inputs are named ``input_team1_<feature>`` and ``input_team2_<feature>``.
        Team-name features are ``lenc_bitlen`` wide; all others have width 1.

        :return: Compiled Keras model, named after the team.
        """
        head1_inputs = []
        head2_inputs = []

        for f in self._features:
            width = self._lenc_bitlen if f in FEATURES_TO_LENC else 1
            head1_inputs.append(Input(batch_shape=(BATCH_SIZE, None, width), name=f"input_team1_{f}"))
            head2_inputs.append(Input(batch_shape=(BATCH_SIZE, None, width), name=f"input_team2_{f}"))

        # Main head
        head1_fc1 = self._build_head("head1", head1_inputs)
        # Head2
        head2_fc1 = self._build_head("head2", head2_inputs)

        joint_concat = concatenate([head1_fc1, head2_fc1], name="joint_concat")
        output = Dense(2,
                       activation="softmax",
                       kernel_initializer=glorot_uniform(self._seed),
                       name="output")(joint_concat)

        model = Model(inputs=head1_inputs+head2_inputs,
                      outputs=output,
                      name=self._team_name)

        model.compile(optimizer=Adam(learning_rate=self._lr, clipvalue=0.5, epsilon=1e-7),
                      loss=SparseCategoricalCrossentropy(),
                      metrics=["acc"])

        return model

    def train_on_batch(self,
                       x_input: Dict[str, np.ndarray],
                       y_input: Dict[str, np.ndarray],
                       class_weight: Optional[Dict[int, float]] = None,
                       ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Run a single gradient update on one batch.

        :param x_input: Input arrays keyed by input layer name.
        :param y_input: Target array keyed by output layer name.
        :param class_weight: Optional class index -> weight mapping, forwarded
            to Keras. ``None`` means no weighting.
        :return: Tuple of (loss, accuracy) for the batch.
        """
        loss, acc = self._model.train_on_batch(x_input, y_input, class_weight=class_weight)
        return loss, acc

    def test_on_batch(self,
                      x_input: Dict[str, np.ndarray],
                      y_input: Dict[str, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Evaluate the model on one batch without updating the weights.

        :param x_input: Input arrays keyed by input layer name.
        :param y_input: Target array keyed by output layer name.
        :return: Tuple of (loss, accuracy) for the batch.
        """
        loss, acc = self._model.test_on_batch(x_input, y_input)
        return loss, acc

    def predict_on_batch(self, x_input: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities for one batch.

        :param x_input: Input arrays keyed by input layer name.
        :return: Flattened 1-D array of the softmax outputs.
        """
        preds = self._model.predict_on_batch(x_input)

        return preds.flatten()

    def decay_learning_rate(self) -> None:
        """Multiply the optimizer's learning rate by the configured decay factor."""
        optimizer = self._model.optimizer
        # `learning_rate` is the name the optimizer was constructed with.
        current_lr = K.get_value(optimizer.learning_rate)
        K.set_value(optimizer.learning_rate, current_lr * self._lrdecay)




##### teams Model

In [None]:
from util.enums import TargetVariable


class SPModel:
    """
    Per-team model: owns the team's network and its prepared input sequences.

    ``matches_data[dataset]`` holds ``{"idx": int, "data": {i: chunk}}``. Chunk
    ``i`` is ``{"x_input": ..., "y_input": ...}`` built from rows
    ``i .. i + timesteps - 1`` of the team's dataframe. ``idx`` selects the
    chunk that ``form_input`` returns.
    """

    def __init__(self, team_name: str, test_teams: List[str], lenc_bitlen: int) -> None:
        """
        :param team_name: Name of the team this model belongs to.
        :param test_teams: Teams present in the test set; decides whether this
            team's network uses the test or the non-test hyperparameters.
        :param lenc_bitlen: Bitlength needed to encode all teams names.
        """
        self._team_name = team_name
        self._lenc_bitlen = lenc_bitlen
        self._target_variable = TargetVariable.FutureWD
        self.__timesteps = _timesteps
        self._features = FEATURES_COMMON + FEATURES_WD
        self._target_team = team_name in test_teams

        self.matches_data = {d: {"idx": 0, "data": {}} for d in Dataset}

        tf.compat.v1.experimental.output_all_intermediates(True)

        self.network = SPNetwork(self._team_name, self._target_team, lenc_bitlen)

    def build_model(self) -> None:
        """Assemble and compile the underlying network."""
        self.network.build()

    def train_on_batch(self,
                       x_input: Dict[str, np.ndarray],
                       y_input: Dict[str, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Train the network on one batch.

        :return: Tuple of (loss, accuracy).
        """
        return self.network.train_on_batch(x_input, y_input)

    def test_on_batch(self,
                      x_input: Dict[str, np.ndarray],
                      y_input: Dict[str, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Evaluate the network on one batch.

        :return: Tuple of (loss, accuracy).
        """
        return self.network.test_on_batch(x_input, y_input)

    def predict_on_batch(self, x_input: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Predict on one batch.

        :return: Flattened prediction array.
        """
        return self.network.predict_on_batch(x_input)

    def warm_up(self) -> None:
        """
        Run one training step on the current train chunk, using this team's own
        data for both heads. Does nothing when no usable chunk is available.
        """
        x_input, y_input = self.form_input(Dataset.Train, team2_model=self)
        if x_input is None or y_input is None:
            return
        # The network's train_on_batch takes exactly (x, y). The previous call
        # passed a third argument read from the undefined `self.class_weights`.
        self.network.train_on_batch(x_input, y_input)

    def prepare_matches_data(self, dataset: Dataset, matches_data: pd.DataFrame) -> None:
        """
        Slice the team's dataframe into overlapping input chunks of ``timesteps`` rows.

        One entry is stored per starting row ``i``. Its ``x_input`` / ``y_input``
        stay ``None`` when the chunk is shorter than ``timesteps``, when it is
        the last chunk (no following row), or, for ``y_input`` only, when the
        target is missing.

        :param dataset: Split the data belongs to.
        :param matches_data: Processed team dataframe. Rows are sliced by label,
            so it is expected to carry a 0..n-1 index.
        """
        # Start from a clean slate so that re-running (e.g. a notebook cell)
        # with shorter data does not leave stale chunks behind.
        self.matches_data[dataset] = {"idx": 0, "data": {}}
        chunks = self.matches_data[dataset]["data"]

        i = 0
        while True:
            iend = i + self.__timesteps - 1
            subset = matches_data.loc[i:iend]
            if subset.empty:
                break

            # Default x, y values are none; callers are expected to skip them.
            x_input = None
            y_input = None

            # Consider only chunks which length is equal to number of _timesteps and which are not the
            # last chunk of data in the dataset (the last chunk is skipped to properly align match sequences)
            if len(subset) == self.__timesteps and len(matches_data.loc[i:iend+1]) != self.__timesteps:
                x_input = {}

                # Reshape features to (1, timesteps, width)
                for f in self._features:
                    # Teams names are stored as arrays so they need to be stacked
                    if f in FEATURES_TO_LENC:
                        team1_names = np.vstack(subset.loc[:, f].values).reshape((1, -1, self._lenc_bitlen))
                        x_input[f"input_team1_{f}"] = team1_names
                    else:
                        x_input[f"input_team1_{f}"] = subset.loc[:, f].values.reshape(1, -1, 1)

                # Get target variable from last row and ignore it if it is none
                y = subset.loc[iend, self._target_variable.value]
                if y is not None and not np.isnan(y):
                    # np.asarray makes the reshape independent of the scalar type
                    # (numpy scalar or plain Python float).
                    y_input = {"output": np.asarray(y).reshape(-1, 1)}

            chunks[i] = {"x_input": x_input, "y_input": y_input}
            i += 1

    def form_input(self,
                   dataset: Dataset,
                   team2_model: "SPModel") -> Tuple[Optional[Dict[str, np.array]],
                                                    Optional[Dict[str, np.array]]]:
        """
        Combine this team's current chunk with the second team's current chunk.

        This team's inputs keep their ``input_team1_*`` keys; the second team's
        inputs are renamed to ``input_team2_*``. The target always comes from
        this team's chunk.

        :param dataset: Split to read the chunks from.
        :param team2_model: Model of the second team (may be ``self``).
        :return: ``(x_input, y_input)``. ``(None, None)`` when either index is
            past the end of its data; ``x_input`` is ``None`` when either chunk
            has no inputs.
        """
        # Get current data chunk based on index position for both models
        i = self.matches_data[dataset]["idx"]
        if i >= len(self.matches_data[dataset]["data"]):
            return None, None
        d1 = self.matches_data[dataset]["data"][i]
        j = team2_model.matches_data[dataset]["idx"]
        if j >= len(team2_model.matches_data[dataset]["data"]):
            return None, None
        d2 = team2_model.matches_data[dataset]["data"][j]

        x_input = None

        if d1["x_input"] and d2["x_input"]:
            d2_as_team2 = {k.replace("team1", "team2"): v for k, v in d2["x_input"].items()}

            # Unpack both inputs into a single dict
            x_input = {**d1["x_input"], **d2_as_team2}

        return x_input, d1["y_input"]


In [None]:
# Per-team models, keyed by team name
models: Dict[str, SPModel] = dict()
# Lookup from team name to database id
_teams_tuples = query_teams_ids_names_tuples()

# Fixture ids per team for each split
train_fixtures_ids, test_fixtures_ids, predict_fixtures_ids = {}, {}, {}
# Teams present in each split
train_teams, test_teams, predict_teams = [], [], []

#### 加载数据

In [None]:
# Query fixtures data
fixturesData = query_fixtures_data(seasons)

# Check for missing columns first, before any column is accessed below, so a
# schema problem is reported as such and not as a KeyError.
_check_missing_columns(fixturesData)

allTeams = get_unique_teams(fixturesData)
last_season_teams = get_last_season_unique_teams(fixturesData)
teams_fixtures_ids = {t: get_fixtures_ids_from_df(fixturesData, t) for t in allTeams}

# Check for missing values
_check_nan_values(fixturesData, teams_fixtures_ids)

# Split the fixture ids into train / test / predict.
# The split dicts are rebuilt here so that re-running this cell does not keep
# teams from an earlier run.
predict_fixtures_ids = {}
test_fixtures_ids = {}
for t in last_season_teams:
    ids = teams_fixtures_ids[t]

    # The last NPREDICT fixtures are predicted. Explicit cut positions are used
    # because `[-n:]` / `[:-n]` select the wrong halves when n is 0.
    predict_start = max(len(ids) - NPREDICT, 0)
    predict_fixtures_ids[t] = ids[predict_start:]
    ids = ids[:predict_start]

    # The NTEST fixtures before those are used for testing.
    test_start = max(len(ids) - NTEST, 0)
    test_fixtures_ids[t] = ids[test_start:]
    teams_fixtures_ids[t] = ids[:test_start]

# Whatever is left (all fixtures, for teams not in the last season) is for training.
train_fixtures_ids = teams_fixtures_ids

# Split the dataframe into train, test and predict sets
df_train, df_test, df_predict = _mask_out_dataset(fixturesData, train_fixtures_ids, test_fixtures_ids, predict_fixtures_ids)

train_teams = get_unique_teams(df_train)
test_teams = get_unique_teams(df_test)
predict_teams = get_unique_teams(df_predict)

In [None]:
# Data normalisation: fit the feature scalers on the training split only
fit_scalers(df_train)

#### 数据处理

In [None]:
# `_fit_teams_names_lencoder` is already defined alongside the other data
# helpers; the identical second definition that used to live in this cell only
# shadowed it and has been removed.
_teams_names_lenc = _fit_teams_names_lencoder()
# Width of the binary team-name encoding, computed the same way as in
# `_transform_team_name`, so the network inputs match the encoded data.
teams_names_bitlen = len(_teams_names_lenc.classes_).bit_length()

# One model per team
for t in allTeams:
    models[t] = SPModel(t, test_teams, teams_names_bitlen)


In [None]:
# Prepare the data of every team, one split at a time
for t in train_teams:
    fixtures_ids = get_fixtures_ids_from_df(df_train, t)
    team_matches_data = load_and_process_team_data(Dataset.Train, _teams_tuples[t], fixtures_ids)
    models[t].prepare_matches_data(Dataset.Train, team_matches_data)

# Test sequences are aligned with the last `_timesteps` train fixtures, so the
# first test sample has a full history.
for t in test_teams:
    fixtures_ids = get_fixtures_ids_from_df(df_test, t)
    aligned_fixtures_ids = align_fixtures_ids(df_train, t, fixtures_ids, _timesteps)
    team_matches_data = load_and_process_team_data(Dataset.Test, _teams_tuples[t], aligned_fixtures_ids)
    models[t].prepare_matches_data(Dataset.Test, team_matches_data)

# Use combined train+test dataset in case that there would be less test samples than _timesteps
# so the rest of sequence can be filled from train dataset.
# The frame does not depend on the team, so it is built once outside the loop.
combined_df_train = pd.concat((df_train, df_test), ignore_index=True)
for t in predict_teams:
    fixtures_ids = get_fixtures_ids_from_df(df_predict, t)
    aligned_fixtures_ids = align_fixtures_ids(combined_df_train, t, fixtures_ids, _timesteps)
    team_matches_data = load_and_process_team_data(Dataset.Predict, _teams_tuples[t], aligned_fixtures_ids)
    models[t].prepare_matches_data(Dataset.Predict, team_matches_data)



#### 构建模型

In [None]:
# Build the network of every team's model
print(f"Assembling {len(models)} models...")

for t in allTeams:
    models[t].build_model()

# Drop the old row labels so each split is indexed 0..n-1 again
for split_df in (df_train, df_test, df_predict):
    split_df.reset_index(inplace=True, drop=True)

#### 训练

In [None]:
# Not filled by this loop; kept because the names already exist at module level.
test_x_inputs = []
test_y_inputs = []
# One entry per (epoch, team) with the per-sample losses and accuracies.
metics = []

# Position of every fixture inside each team's chronological train list. It is
# used to find the second team's window that ends right before a target fixture.
train_positions = {
    team: {fid: pos for pos, fid in enumerate(get_fixtures_ids_from_df(df_train, team))}
    for team in train_teams
}

# range(1, _epochs + 1) runs exactly `_epochs` epochs, numbered from 1.
for epoch in range(1, _epochs + 1):
    for t in train_teams:
        test_loss = []
        test_acc = []
        team_fixtures = df_train[(df_train["home"] == t) | (df_train["away"] == t)]

        # Chunk `chunk_idx` covers the team's fixtures chunk_idx .. chunk_idx + _timesteps - 1
        # and its target is the fixture at position chunk_idx + _timesteps.
        # The range is empty for teams with too few fixtures.
        for chunk_idx in range(len(team_fixtures) - _timesteps):
            # Find the opponent (team2) in the target fixture of this chunk
            target_fixture = team_fixtures.iloc[_timesteps + chunk_idx]
            if target_fixture["home"] == t:
                team2 = target_fixture["away"]
            else:
                team2 = target_fixture["home"]

            # Use the opponent's window that ends right before the same target
            # fixture; skip the sample when the opponent has too little history.
            team2_pos = train_positions.get(team2, {}).get(target_fixture["id"])
            if team2_pos is None or team2_pos < _timesteps:
                continue

            # form_input() reads the chunk selected by "idx". Nothing else
            # advances it, so it is set explicitly for both models.
            models[t].matches_data[Dataset.Train]["idx"] = chunk_idx
            models[team2].matches_data[Dataset.Train]["idx"] = team2_pos - _timesteps

            x_input, y_input = models[t].form_input(Dataset.Train, models[team2])
            # Chunks without usable inputs or target come back as None; skip them.
            if x_input is None or y_input is None:
                continue

            loss, acc = models[t].train_on_batch(x_input, y_input)

            test_loss.append(loss)
            test_acc.append(acc)

        metics.append({"team": t, "epoch": epoch, "loss": test_loss, "acc": test_acc})