In [1]:
import pandas as pd

# File locations, named once so they are easy to retarget.
RAW_ODDS_CSV = "oddsData.csv"
CLEANED_ODDS_CSV = "cleaned_odds_data.csv"

# Columns written to the cleaned file, in output order.
CLEANED_COLUMNS = [
    'date', 'season', 'teamA', 'teamB', 'teamA_home',
    'teamA_score', 'teamB_score',
    'teamA_moneyLine', 'teamB_moneyLine',
    'total', 'spread', 'secondHalfTotal',
]


def build_game_table(odds):
    """Collapse per-row odds records into one row per game, oriented teamA/teamB.

    teamA is whichever of ``team`` / ``opponent`` sorts first as a string and
    teamB is the other.  Rows that share the same (teamA, teamB) pair and the
    same ``date`` are treated as the same game; only the first such row is
    kept, and the original index labels of the kept rows are preserved.

    Parameters
    ----------
    odds : pandas.DataFrame
        Must provide the columns ``team``, ``opponent``, ``date``,
        ``home/visitor``, ``score``, ``opponentScore``, ``moneyLine`` and
        ``opponentMoneyLine``.  Any other columns are carried through as-is.

    Returns
    -------
    pandas.DataFrame
        A new frame (``odds`` itself is not modified) with the added columns
        ``team_pair``, ``game_id``, ``teamA``, ``teamB``, ``teamA_home``
        (1/0), ``teamA_score``, ``teamB_score``, ``teamA_moneyLine`` and
        ``teamB_moneyLine``.
    """
    # Vectorized equivalent of sorting each (team, opponent) pair row by row.
    team_sorts_first = odds['team'] <= odds['opponent']
    team_a = odds['team'].where(team_sorts_first, odds['opponent'])
    team_b = odds['opponent'].where(team_sorts_first, odds['team'])
    team_pair = pd.Series(list(zip(team_a, team_b)), index=odds.index, dtype=object)

    games = (
        odds.assign(
            team_pair=team_pair,
            # Consistent game key: sorted team pair + date.
            game_id=team_pair.astype(str) + '_' + odds['date'].astype(str),
            teamA=team_a,
            teamB=team_b,
        )
        # Only one row per game is needed.
        .drop_duplicates(subset='game_id', keep='first')
        # Explicit copy: the column assignments below must write to an
        # independent frame, never to a selection of another one.
        .copy()
    )

    # The surviving row may be written from either team's point of view.
    is_teamA = games['team'] == games['teamA']
    is_teamB = games['team'] == games['teamB']

    # teamA is home when it is the row's `team` marked 'vs', or when the
    # row's `team` is teamB and is marked '@'.  Any other marker yields 0.
    games['teamA_home'] = (
        (is_teamA & (games['home/visitor'] == 'vs'))
        | (is_teamB & (games['home/visitor'] == '@'))
    ).astype(int)

    # Re-orient the per-row (team, opponent) values to (teamA, teamB).
    games['teamA_score'] = games['score'].where(is_teamA, games['opponentScore'])
    games['teamB_score'] = games['opponentScore'].where(is_teamA, games['score'])

    games['teamA_moneyLine'] = games['moneyLine'].where(is_teamA, games['opponentMoneyLine'])
    games['teamB_moneyLine'] = games['opponentMoneyLine'].where(is_teamA, games['moneyLine'])

    # NOTE(review): `spread` is carried over unchanged from the kept row.  If
    # the source quotes it from `team`'s perspective it would need a sign flip
    # whenever `team` is teamB -- confirm against the data dictionary.
    return games


# In a notebook kernel __name__ is "__main__", so this always runs there; the
# guard only keeps build_game_table importable without touching the disk.
if __name__ == "__main__":
    # Load data
    raw_df = pd.read_csv(RAW_ODDS_CSV)

    # One row per game, with teamA/teamB in alphabetical order
    df = build_game_table(raw_df)

    # Final cleaned dataframe (independent copy, safe to modify later)
    cleaned_df = df[CLEANED_COLUMNS].copy()

    # Optional save or display
    cleaned_df.to_csv(CLEANED_ODDS_CSV, index=False)
    print(cleaned_df.head())


         date  season         teamA        teamB  teamA_home  teamA_score  \
0  2007-10-30    2008  Golden State         Utah           1           96   
1  2007-10-30    2008       Houston    LA Lakers           0           95   
3  2007-10-30    2008      Portland  San Antonio           0           97   
6  2007-10-31    2008       Chicago   New Jersey           0          103   
7  2007-10-31    2008     Cleveland       Dallas           1           74   

   teamB_score  teamA_moneyLine  teamB_moneyLine  total  spread  \
0          117           -120.0            100.0  212.0     1.0   
1           93           -230.0            190.0  199.0     5.0   
3          106            900.0          -1400.0  189.5   -13.0   
6          112            105.0           -125.0  186.0    -1.5   
7           92            120.0           -140.0  184.0    -2.5   

   secondHalfTotal  
0            105.5  
1             99.0  
3             95.0  
6             94.0  
7             91.5  
