## Scraping Last 7 Games (%) and Win/Lose Streaks
Source : https://www.baseball-reference.com/boxes/


In [248]:
import pandas as pd

In [249]:
game_logs = pd.read_csv("../data/game_logs_2024.csv")

In [250]:
games["game_datetime"] = pd.to_datetime(games["game_datetime"])

Retreiving game results from the <b> Original </b> dataset, dividing each match by home and away game

In [252]:
games["home_win"] = (games["home_score"] > games["away_score"]).astype(int)
games["away_win"] = (games["away_score"] > games["home_score"]).astype(int)

home_df = games[["game_id", "game_datetime", "home_name", "home_win"]].rename(
    columns={"home_name": "team", "home_win": "result", "game_datetime": "date"}
)
away_df = games[["game_id", "game_datetime", "away_name", "away_win"]].rename(
    columns={"away_name": "team", "away_win": "result", "game_datetime": "date"}
)

df = pd.concat([home_df, away_df], ignore_index=True)

In [253]:
df = pd.concat([home_df, away_df], ignore_index=True)

In [254]:
df = df.sort_values(["team", "date"]).reset_index(drop=True)

Rolling up the 7 matches to calculate the "Last 7 win percentage"

In [256]:
df["last_7win_pct"] = None
for team, group in df.groupby("team"):
    shifted = group["result"].shift()
    win_pct = shifted.rolling(window=7).mean()
    df.loc[group.index, "last_7win_pct"] = win_pct

In [257]:
df.head(20)

Unnamed: 0,game_id,date,team,result,last_7win_pct
0,747298,2024-07-17 00:00:00+00:00,American League All-Stars,1,
1,748264,2024-02-23 20:10:00+00:00,Arizona Diamondbacks,0,
2,748250,2024-02-24 20:10:00+00:00,Arizona Diamondbacks,1,
3,748229,2024-02-25 20:10:00+00:00,Arizona Diamondbacks,1,
4,748219,2024-02-26 20:05:00+00:00,Arizona Diamondbacks,0,
5,748200,2024-02-27 20:10:00+00:00,Arizona Diamondbacks,0,
6,748188,2024-02-28 20:05:00+00:00,Arizona Diamondbacks,1,
7,748168,2024-02-29 20:10:00+00:00,Arizona Diamondbacks,1,
8,748156,2024-03-02 01:05:00+00:00,Arizona Diamondbacks,1,0.571429
9,748142,2024-03-02 20:10:00+00:00,Arizona Diamondbacks,0,0.714286


### Retrieving the Streak Part

7 win streak : +7 <br/>
5 lose streak : -5

In [259]:
def compute_streak(results):
    streaks = []
    streak = 0
    for result in results:
        if result == 1: 
            streak = streak + 1 if streak > 0 else 1
        else:
            streak = streak - 1 if streak < 0 else -1
        streaks.append(streak)
    return streaks

In [260]:
df["streak"] = None
for team,group in df.groupby("team"):
    streaks = compute_streak(group["result"].tolist())
    df.loc[group.index, "streak"] = streaks

In [261]:
df

Unnamed: 0,game_id,date,team,result,last_7win_pct,streak
0,747298,2024-07-17 00:00:00+00:00,American League All-Stars,1,,1
1,748264,2024-02-23 20:10:00+00:00,Arizona Diamondbacks,0,,-1
2,748250,2024-02-24 20:10:00+00:00,Arizona Diamondbacks,1,,1
3,748229,2024-02-25 20:10:00+00:00,Arizona Diamondbacks,1,,2
4,748219,2024-02-26 20:05:00+00:00,Arizona Diamondbacks,0,,-1
...,...,...,...,...,...,...
5991,744796,2024-09-26 17:05:00+00:00,Washington Nationals,0,0.142857,-4
5992,744797,2024-09-27 22:45:00+00:00,Washington Nationals,1,0.142857,1
5993,744799,2024-09-28 20:05:00+00:00,Washington Nationals,1,0.285714,2
5994,744798,2024-09-29 19:05:00+00:00,Washington Nationals,0,0.428571,-1


In [288]:
df = df.drop_duplicates(subset=["game_id", "team"])

home = df.rename(columns={
    "last_7win_pct": "home_last7_win_pct",
    "streak": "home_streak"})[["game_id", "team", "home_last7_win_pct", "home_streak"]]

away = df.rename(columns={
    "last_7win_pct": "away_last7_win_pct",
    "streak": "away_streak"})[["game_id", "team", "away_last7_win_pct", "away_streak"]]


In [290]:
final_df = pd.read_csv("../data/final_game_logs.csv")

In [292]:
df["game_id"].value_counts().value_counts()

count
2    2959
Name: count, dtype: int64

In [294]:
final_df = final_df.merge(
    home, 
    left_on=["game_id", "home_name"], 
    right_on=["game_id", "team"], 
    how="left"
).drop(columns=["team"])

final_df = final_df.merge(
    away, 
    left_on=["game_id", "away_name"], 
    right_on=["game_id", "team"], 
    how="left"
).drop(columns=["team"])

In [296]:
final_df

Unnamed: 0.1,Unnamed: 0,game_id,home_name,away_name,home_probable_pitcher,away_probable_pitcher,away_score,home_score,home_ERA,home_ER,...,wind,condition_code,home_lineup_score,away_lineup_score,home_n_starters,away_n_starters,home_last7_win_pct,home_streak,away_last7_win_pct,away_streak
0,0,748266,San Diego Padres,Los Angeles Dodgers,Joe Musgrove,Gavin Stone,14,1,3.88,43.0,...,3.0,2,0.614000,0.833250,19,20,,-1,,1
1,1,748344,Boston Red Sox,Northeastern Huskies,Helcris Olivárez,Aiven Cabral,2,7,,,...,16.0,0,0.951000,0.453722,17,18,,1,,-1
2,2,748263,Texas Rangers,Kansas City Royals,Dane Dunning,Daniel Lynch IV,4,5,5.31,56.0,...,9.0,0,0.648111,0.583333,18,18,,1,,-1
3,3,748341,Chicago Cubs,Chicago White Sox,Jordan Wicks,Jesse Chavez,1,8,5.48,28.0,...,6.0,3,0.666556,0.287000,18,18,,1,,-1
4,4,748260,Los Angeles Dodgers,San Diego Padres,Landon Knack,Jhony Brito,1,4,3.65,28.0,...,5.0,3,0.962647,0.271895,17,19,,2,,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,3075,775300,Los Angeles Dodgers,New York Yankees,Jack Flaherty,Gerrit Cole,3,6,3.17,57.0,...,5.0,1,0.764100,0.692273,10,11,0.714286,2,0.857143,-1
3640,3076,775294,Los Angeles Dodgers,New York Yankees,Yoshinobu Yamamoto,Carlos Rodón,2,4,3.00,30.0,...,5.0,0,0.754400,0.733273,10,11,0.714286,3,0.714286,-2
3641,3077,775298,New York Yankees,Los Angeles Dodgers,Clarke Schmidt,Walker Buehler,4,2,2.85,27.0,...,5.0,1,0.693500,0.763900,10,10,0.571429,-3,0.714286,4
3642,3078,775297,New York Yankees,Los Angeles Dodgers,Luis Gil,Ben Casparius,4,11,3.50,59.0,...,10.0,3,0.743667,0.757300,9,10,0.428571,1,0.857143,-1


In [298]:
final_df.to_csv("../data/final_game_logs.csv")