In [120]:
# 导入需要的包
import pandas as pd ##解决数据分析任务
import math  ## python 标准库 包含数学公式
import csv ## python 对csv文件的操作
import random ## 用于生成随机数
import numpy as np  ##支持大量的维度数组和矩阵运算
from sklearn import linear_model ##线性模型
from sklearn.model_selection import cross_val_score ## 交叉验证
from collections import defaultdict ## 创建字典
from sklearn.preprocessing import LabelEncoder, OneHotEncoder ## 对分类型特征值进行编码 每一个分类特征变量的m个可能的取值转变成m个二值特征
from sklearn.ensemble import RandomForestClassifier ## 随机森林分类器
from sklearn.model_selection  import GridSearchCV ##  参数优化  网格搜索
from sklearn.model_selection import train_test_split ## 训练集测试集
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report  
## 初始加载csv球队数据
def initialize_data(Mstat, Ostat, Tstat):
    # 综合统计数据表  RK:排名   Arena: 球馆  按列删除 RK和Arena
    new_Mstat = Mstat.drop(['Rk', 'Arena'], axis=1)  
    # 所遇到的对手平均每场比赛的统计信息表  G：比赛场数  MP：平均每场比赛进行的时间
    new_Ostat = Ostat.drop(['Rk', 'G', 'MP'], axis=1)
    # 每支队伍平均每场比赛的表现统计 
    new_Tstat = Tstat.drop(['Rk', 'G', 'MP'], axis=1)
    # 采用 左连接  以 Team 作为连接的列索引名称  将三个表合并
    team_stats1 = pd.merge(new_Mstat, new_Ostat, how='left', on='Team')
    team_stats1 = pd.merge(team_stats1, new_Tstat, how='left', on='Team')
    # 设置单索引并返回表
    return team_stats1.set_index('Team', inplace=False, drop=True)
#初始化球队信息
def initial(results):
    results.columns = ["Date", "Visitor Team", "VisitorPts", "Home Team", "HomePts","Score Type","OT?", "Notes"]
    results=results.drop(['Date', 'Score Type',"OT?","Notes"], axis=1)
    results["WLoc"]="H"
    results["HomeWin"] = 0
    for index, row in results.iterrows():    
        row["HomeWin"] = 1 if row["VisitorPts"] < row["HomePts"]  else 0
        results.loc[index]=row    
    for index, row in results.iterrows():    
        row["WLoc"] = "H" if row["HomeWin"]  else "L"
        results.loc[index]=row
    y_true = results["HomeWin"].values
    results["HomeLastWin"] = 0
    results["VisitorLastWin"] = 0
    results["HomeTeamWonLast"] = 0
    results["HomeTeamRanksHigher"] = 0
    results["Elo"] = 0
    results["Home Elo"] = 0
    results["Visitor Elo"] = 0
    return results
##  获取球队等级分数
def get_elo(team):
    try:
        return team_elos[team]
    except:
        # 当最初没有elo时，给每个队伍最初赋base_elo
        team_elos[team] = base_elo
        return team_elos[team]
    
# 计算每个球队的elo值
def calc_elo(win_team, lose_team):    
    winner_rank = get_elo(win_team)
    loser_rank = get_elo(lose_team)
    # 胜利球队 对 失败球队的 胜率期望
    rank_diff = winner_rank - loser_rank    
    exp = (rank_diff  * -1) / 400
    odds = 1 / (1 + math.pow(10, exp))    
    # 根据rank级别修改K值
    if winner_rank < 2100:
        k = 32
    elif winner_rank >= 2100 and winner_rank < 2400:
        k = 24
    else:
        k = 16
    # 更新 rank 数值
    new_winner_rank = round(winner_rank + (k * (1 - odds)))      
    new_loser_rank = round(loser_rank + (k * (0 - odds)))    
    return new_winner_rank, new_loser_rank
# 初始化主场胜利球队数据
def  build_dataSet(all_data):
    #print("Building data set..")
    X = []
    y = []
    skip = 0
    for index, row in all_data.iterrows():
        Wteam = row["Visitor Team"] if row["VisitorPts"] > row["HomePts"] else row["Home Team"]
        Lteam = row['Visitor Team'] if row["VisitorPts"] < row["HomePts"] else row["Home Team"]
        #获取最初的elo或是每个队伍最初的elo值
        team1_elo = get_elo(Wteam)
        team2_elo = get_elo(Lteam)
        # 给主场比赛的队伍加上100的elo值
        if row['WLoc'] == 'H':
            team1_elo += 100
        else:
            team2_elo += 100
        # 把elo当为评价每个队伍的第一个特征值
        team1_features = [team1_elo]
        team2_features = [team2_elo]
        # 添加我们从basketball reference.com获得的每个队伍的统计信息
        for key, value in team_stats.loc[Wteam].iteritems():
            team1_features.append(value)
        for key, value in team_stats.loc[Lteam].iteritems():
            team2_features.append(value)        
        team1_features.append(row["HomeLastWin"])
        team2_features.append(row["VisitorLastWin"])
        team1_features.append(row["HomeTeamWonLast"])
        team2_features.append(not row["HomeTeamWonLast"])
        team1_features.append(row["HomeTeamRanksHigher"])
        team2_features.append(not row["HomeTeamRanksHigher"])
        # 将两支队伍的特征值随机的分配在每场比赛数据的左右两侧
        # 并将对应的0/1赋给y值
        if random.random() > 0.5:
            X.append(team1_features + team2_features)
            y.append(0)
        else:
            X.append(team2_features + team1_features)
            y.append(1)
        if skip == 0:
           # print('X',X)
            skip = 1
        # 根据这场比赛的数据更新队伍的elo值
        new_winner_rank, new_loser_rank = calc_elo(Wteam, Lteam)  
        team_elos[Wteam] = new_winner_rank
        team_elos[Lteam] = new_loser_rank        
    return np.nan_to_num(X), y
def cal_team(results):
    ladder_filename = "data/123.csv"
    ladder = pd.read_csv(ladder_filename)
    #遍历每一行，查找主场队和客场队两支球队的战绩
    for index, row in results.iterrows():
        home_team = row["Home Team"]
        visitor_team = row["Visitor Team"]
        home_rank = ladder[ladder["Team"] == home_team]["Rk"].values[0]
        visitor_rank = ladder[ladder["Team"] == visitor_team]["Rk"].values[0]
        row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
        results.loc[index] = row
     #字典的键为球队，值为是否赢得上一场比赛
    won_last = defaultdict(int)
    #遍历所有行，在此过程中，更新每一行，为其增加两个特征值：两支球队在上场比赛有没有获胜。
    for index, row in results.iterrows():  # Note that this is not efficient
        home_team = row["Home Team"]
        visitor_team = row["Visitor Team"]
        row["HomeLastWin"] = 1 if won_last[home_team] else 0
        row["VisitorLastWin"] = 1 if won_last[visitor_team] else 0
        results.loc[index] = row    
        # Set current win
        won_last[home_team] = row["HomeWin"]
        won_last[visitor_team] = not row["HomeWin"]
      #字典的键为球队，值为主场球队是否赢得与客场球队的上一场比赛    
    last_match_winner = defaultdict(int)
    for index, row in results.iterrows():
        home_team = row["Home Team"]
        visitor_team = row["Visitor Team"]
        teams = tuple(sorted([home_team, visitor_team]))     
        row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
        results.loc[index] = row    
        winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
        last_match_winner[teams] = winner 
    
    #elo值
    for index, row in results.iterrows():
        Wteam = row["Visitor Team"] if row["VisitorPts"] > row["HomePts"] else row["Home Team"]
        Lteam = row['Visitor Team'] if row["VisitorPts"] < row["HomePts"] else row["Home Team"]
        new_winner_rank, new_loser_rank = calc_elo(Wteam, Lteam)
        team_elos[Wteam] = new_winner_rank
        team_elos[Lteam] = new_loser_rank        
        row["Home Elo"] = new_winner_rank if row["HomeWin"] else new_loser_rank
        row["Visitor Elo"] = new_loser_rank if row["HomeWin"] else new_winner_rank
        results.loc[index] = row 
        
    for index, row in results.iterrows():
        Wteam = row["Visitor Team"] if row["VisitorPts"] > row["HomePts"] else row["Home Team"]
        Lteam = row['Visitor Team'] if row["VisitorPts"] < row["HomePts"] else row["Home Team"]
        new_winner_rank, new_loser_rank = calc_elo(Wteam, Lteam)
        team_elos[Wteam] = new_winner_rank
        team_elos[Lteam] = new_loser_rank
        row["Elo"] = 1 if (new_winner_rank-new_loser_rank)>0 else 0
        results.loc[index] = row  
    return results
# 预测球队胜利概率
def predict_winner(team_1, team_2, model):    
        features = []
        team1_elo = get_elo(team_1)
        team2_elo = get_elo(team_2)+100        
        team1_features = [team1_elo]        
        for key, value in team_stats.loc[team_1].iteritems():
            team1_features.append(value)
        team2_features = [team2_elo]
        for key, value in team_stats.loc[team_2].iteritems():
            team2_features.append(value)        
        team1_features.append(1)
        team2_features.append(0)
        team1_features.append(0)
        team2_features.append(1)
        team1_features.append(0)
        team2_features.append(1)        
        features = np.nan_to_num(team1_features+team2_features)
        #  predict_prob返回的是对于测试样本中跟不同类别的匹配程
        # （概率，Probability estimates）, 对于 N 个类别，predict_prob返回的则是一个 N 维数组，
        #  按照类别顺序对应这个测试样本分别属于这些类别的可能性。
        return model.predict_proba([features])
    
    
# 初始化参数
# 当每支球队没有elo等级分时，赋予其基础elo等级分
base_elo = 1600  # 初始等级分数
team_elos = {}   # 球队等级分
team_stats = {}  # 球队数据
results={}
folder = 'data' #存放数据的目录
Mstat = pd.read_csv(folder + '/17-18Miscellaneous_Stat.csv')
Ostat = pd.read_csv(folder + '/17-18Opponent_Per_Game_Stat.csv')
Tstat = pd.read_csv(folder + '/17-18Team_Per_Game_Stat.csv')
results = pd.read_csv(folder + '/2018data.csv')
team_stats = initialize_data(Mstat, Ostat, Tstat)
results=initial(results)
results=cal_team(results)

<font color=red size=5>1.回归预测

In [125]:
X, y = build_dataSet(results)
X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size=0.3, random_state=14, stratify=y_true)
model = linear_model.LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
predictd=cross_val_score(model, X_test, y_test, cv =10,scoring='accuracy',n_jobs=-1).mean()
print("+++最佳效果: {0:.1f}%".format(predictd * 100))

+++最佳效果: 69.3%


<font color=red size=5>2.随机森林预测

In [4]:
X_home_higher =  results[["HomeTeamWonLast","Elo","HomeLastWin"]].values
encoding = LabelEncoder()
y_true = results["HomeWin"].values
encoding.fit(results["Home Team"].values)
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T
X_all = np.hstack([X_home_higher, X_teams]) 
parameter_space = {                   
  'n_estimators': [100,150,200,250],
  'criterion': ["gini", "entropy"],                   
  'max_depth': (130,150, 155, 160,),   
  'min_samples_split': (1.0, 2, 3),  
  'min_samples_leaf': (2,4,6,8),
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space,n_jobs=-1, scoring='f1',cv=4)
X_train, X_test, y_train, y_test = train_test_split(
    X_all,y_true, test_size=0.3, random_state=14, stratify=y_true)
grid.fit(X_train, y_train)
print('最佳效果：{0:.1f}%'.format(grid.best_score_ * 100))
print('最优参数：')
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameter_space.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid.predict(X_test)  
print (classification_report(y_test, predictions))  

最佳效果：78.4%
最优参数：
	criterion: 'entropy'
	max_depth: 130
	min_samples_leaf: 2
	min_samples_split: 2
	n_estimators: 150
              precision    recall  f1-score   support

           0       0.70      0.58      0.63       163
           1       0.73      0.83      0.78       231

   micro avg       0.72      0.72      0.72       394
   macro avg       0.72      0.70      0.71       394
weighted avg       0.72      0.72      0.72       394



In [5]:
schedule = pd.read_csv(folder + '/2018-2019Schedule.csv')
result = []
for index, row in schedule.iterrows():
    team1 = row['Vteam']
    team2 = row['Hteam']
    pred = predict_winner(team1, team2, model)
    #print(pred)
    prob = pred[0][0]
    if prob > 0.5:
        winner = team1
        loser = team2
        result.append([winner, loser, prob])
    else:
        winner = team2
        loser = team1
        result.append([winner, loser, 1 - prob])
with open('2018-2019_result.csv', 'w',newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['win', 'lose', 'probability'])
        writer.writerows(result)
        #print('+++done.')
pd.read_csv('2018-2019_result.csv',header=0)

Unnamed: 0,win,lose,probability
0,Boston Celtics,Philadelphia 76ers,0.522810
1,Golden State Warriors,Oklahoma City Thunder,0.511691
2,Milwaukee Bucks,Charlotte Hornets,0.736754
3,Detroit Pistons,Brooklyn Nets,0.684764
4,Houston Rockets,New Orleans Pelicans,0.601306
5,Indiana Pacers,Memphis Grizzlies,0.864983
6,Denver Nuggets,Los Angeles Clippers,0.723386
7,New York Knicks,Atlanta Hawks,0.667207
8,Miami Heat,Orlando Magic,0.806368
9,Dallas Mavericks,Phoenix Suns,0.687256
