In [11]:
import numpy as np
import pandas as pd
import os
import logging

In [12]:
logging.basicConfig(
    filename="data_processing_local.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

In [13]:
logging.info("Reading cbb22.csv.")
df_22 = pd.read_csv("../Final_Project_DE/archive/cbb22.csv")

logging.info("Renamed column 'EFGD_D' to 'EFG_D' in cbb22.csv.")
df_22.rename(columns={"EFGD_D": "EFG_D"}, inplace=True)

logging.info("Saved updated cbb22.csv.")
df_22.to_csv("../Final_Project_DE/archive/cbb22.csv", index=False)

In [14]:
logging.info("Reading cbb16.csv and cbb.csv.")
df_16 = pd.read_csv("../Final_Project_DE/archive/cbb16.csv")
df_all = pd.read_csv("../Final_Project_DE/archive/cbb.csv")

logging.info("Filtering rows for the year 2016 in cbb.csv.")
df_all_16 = df_all[df_all["YEAR"] == 2016]

logging.info(
    "Merging cbb16.csv with filtered cbb.csv for 'POSTSEASON' and 'SEED' columns."
)
df_16 = pd.merge(
    df_16, df_all_16[["TEAM", "POSTSEASON", "SEED"]], on="TEAM", how="left"
)

logging.info("Combined and cleaned 'POSTSEASON' and 'SEED' columns in cbb16.csv.")
df_16["POSTSEASON"] = df_16["POSTSEASON_x"].combine_first(df_16["POSTSEASON_y"])
df_16["SEED"] = df_16["SEED_x"].combine_first(df_16["SEED_y"])
# Drop unnecessary columns created during the merge
df_16.drop(columns=["POSTSEASON_x", "POSTSEASON_y", "SEED_x", "SEED_y"], inplace=True)

logging.info("Saved updated cbb16.csv.")
df_16.to_csv("../Final_Project_DE/archive/cbb16.csv", index=False)

In [15]:
df_16[df_16["POSTSEASON"].notna()].sort_values("SEED")

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED
196,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,32.3,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0
132,Kansas,B12,37,32,120.9,90.4,0.9662,55.7,45.1,17.8,...,38.6,37.3,52.7,43.4,41.3,32.5,70.1,11.6,E8,1.0
220,Oregon,P12,37,30,118.4,96.2,0.9163,52.3,48.9,16.1,...,40.3,32.0,52.6,46.1,34.4,36.2,69.0,6.7,E8,1.0
328,Virginia,ACC,37,29,119.9,91.0,0.9600,54.8,48.4,15.1,...,32.1,33.4,52.6,46.3,40.3,34.7,61.9,8.6,E8,1.0
216,Oklahoma,B12,37,29,118.2,94.1,0.9326,54.3,47.2,18.3,...,33.0,28.3,48.2,45.3,42.2,33.7,70.8,8.0,F4,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,Fairleigh Dickinson,NEC,32,17,103.0,115.6,0.2090,51.2,52.4,17.7,...,33.3,46.9,49.5,52.1,36.4,35.4,72.5,-9.9,R68,16.0
18,Austin Peay,OVC,34,16,103.4,109.1,0.3501,51.3,53.8,20.2,...,47.0,31.3,51.0,53.3,34.7,36.4,70.5,-11.3,R64,16.0
105,Hampton,MEAC,31,20,98.2,107.5,0.2615,46.7,48.6,18.7,...,41.3,35.7,47.3,47.5,30.5,33.8,71.9,-6.3,R64,16.0
111,Holy Cross,Pat,35,15,96.7,106.9,0.2398,47.9,53.2,16.8,...,36.1,33.4,47.2,52.8,32.6,35.7,64.6,-14.5,R64,16.0


In [16]:
input_folder = "../Final_Project_DE/archive"
output_file = "../Final_Project_DE/combined_cbb.csv"

csv_files = [
    os.path.join(input_folder, file)
    for file in os.listdir(input_folder)
    if file.endswith(".csv") and file not in ["cbb.csv", "cbb24.csv", "cbb20.csv"]
]

dataframes = []

for csv_file in csv_files:
    try:
        logging.info(f"Reading {csv_file}.")
        df = pd.read_csv(csv_file)
        dataframes.append(df)
    except Exception as e:
        logging.error(f"Error reading {csv_file}: {e}")

combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.to_csv(output_file, index=False)

In [17]:
combined_df[combined_df["POSTSEASON"].notna()]

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED
0,Michigan,B10,24,20,118.1,91.1,0.9521,54.9,44.9,16.3,...,28.9,24.5,53.3,42.3,38.7,33.5,66.9,7.2,E8,1.0
1,Baylor,B12,24,22,123.2,94.5,0.9548,57.5,49.1,17.6,...,27.0,31.7,54.1,48.1,41.8,34.0,68.8,6.6,Champions,1.0
2,Illinois,B10,29,23,117.7,90.4,0.9539,55.6,46.6,18.2,...,39.2,30.5,55.3,45.4,37.6,32.7,70.7,8.9,R32,1.0
3,Gonzaga,WCC,26,26,125.4,89.8,0.9791,61.0,47.5,16.1,...,36.7,25.9,64.0,46.8,36.5,32.5,74.6,8.5,2ND,1.0
4,Iowa,B10,29,21,123.5,95.7,0.9491,54.6,48.3,13.3,...,32.0,22.6,52.4,45.8,38.6,34.8,70.0,5.6,R32,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3239,North Carolina A&T,MEAC,36,19,90.6,95.7,0.3459,45.7,45.2,22.7,...,42.6,41.1,45.4,43.4,30.7,32.0,66.9,-11.5,R64,16.0
3240,Southern,SWAC,30,20,95.9,96.7,0.4750,49.1,41.9,16.8,...,40.3,34.9,46.0,40.5,36.7,30.0,65.0,-5.7,R64,16.0
3241,LIU Brooklyn,NEC,34,20,108.1,111.2,0.4195,54.4,52.6,20.3,...,45.6,31.6,53.1,51.1,38.0,37.2,69.3,-8.6,R68,16.0
3242,James Madison,CAA,36,21,98.5,99.4,0.4736,47.6,48.7,17.2,...,36.8,39.2,46.2,46.9,33.6,34.7,64.2,-8.9,R64,16.0


In [18]:
cbb24 = pd.read_csv("../Final_Project_DE/archive/cbb24.csv")

In [19]:
logging.info("Processing cbb24 dataframe.")
cbb24["EFG_O"] = cbb24["EFG%"]
cbb24["EFG_D"] = cbb24["EFGD%"]
cbb24 = cbb24[
    [
        "TEAM",
        "CONF",
        "G",
        "W",
        "ADJOE",
        "ADJDE",
        "BARTHAG",
        "EFG_O",
        "EFG_D",
        "TOR",
        "TORD",
        "ORB",
        "DRB",
        "FTR",
        "FTRD",
        "2P_O",
        "2P_D",
        "3P_O",
        "3P_D",
        "ADJ_T",
        "WAB",
        "SEED",
    ]
]

In [20]:
logging.info("Saved train_data.csv and test_data.csv successfully.")
combined_df.to_csv("train_data.csv", index=False)
cbb24.to_csv("test_data.csv", index=False)