In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from textblob import TextBlob
from nba_api.stats.endpoints import leaguedashteamstats
import re

In [9]:
# List of seasons to loop through
seasons = [f"{year}-{str(year+1)[-2:]}" for year in range(2016, 2025)]

# Dictionary to hold data for each season
season_data_frames = {}

# Loop through each season and fetch data
for season in seasons:
    print(f"Fetching data for season: {season}")
    df = leaguedashteamstats.LeagueDashTeamStats(
        season=season,
        per_mode_detailed='PerGame'
    ).get_data_frames()[0]
    season_data_frames[season] = df

df_2016_17 = season_data_frames['2016-17']
df_2016_17.head()


Fetching data for season: 2016-17
Fetching data for season: 2017-18
Fetching data for season: 2018-19
Fetching data for season: 2019-20
Fetching data for season: 2020-21
Fetching data for season: 2021-22
Fetching data for season: 2022-23
Fetching data for season: 2023-24
Fetching data for season: 2024-25


Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,...,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK
0,1610612737,Atlanta Hawks,82,43,39,0.524,48.5,38.1,84.4,0.451,...,9,10,28,7,14,22,6,3,22,19
1,1610612738,Boston Celtics,82,53,29,0.646,48.2,38.6,85.1,0.454,...,27,4,8,18,23,23,21,11,7,8
2,1610612751,Brooklyn Nets,82,20,62,0.244,48.2,37.8,85.2,0.444,...,10,20,29,21,18,30,25,9,12,29
3,1610612766,Charlotte Hornets,82,36,46,0.439,48.4,37.7,85.4,0.442,...,16,11,1,27,17,28,1,15,16,15
4,1610612741,Chicago Bulls,82,41,41,0.5,48.2,38.6,87.1,0.444,...,3,14,12,15,16,12,2,25,23,14


In [25]:
team_name_to_abbr = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BKN",
    "Charlotte Hornets": "CHA",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "LA Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Orleans Pelicans": "NOP",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHX",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Washington Wizards": "WAS"
}

# Assuming 'season_data_frames' is a dictionary of DataFrames for each season
for season, df in season_data_frames.items():
    df['TEAM_ABBREVIATION'] = df['TEAM_NAME'].map(team_name_to_abbr)


In [26]:
def clean_and_extract_team_id(team_name):
    # Remove seed numbers and parentheses
    cleaned_name = re.sub(r'\s\(\d+\)', '', team_name)
    # Extract first three letters and convert to uppercase
    team_id = cleaned_name[:3].upper()
    return team_id

relabeled_dict = {
    "Team": "Team1",
    "Team.1": "Team2",
    "W": "Team1wins",
    "W.1": "Team2wins"
}
playoff_results = pd.read_csv('data/playoff_results.csv') 
playoff_results = playoff_results.rename(columns=relabeled_dict)
playoff_results['Team1'] = playoff_results['Team1'].apply(clean_and_extract_team_id)
playoff_results['Team2'] = playoff_results['Team2'].apply(clean_and_extract_team_id) 
playoff_results.replace('CHA', 'CHO', inplace=True)


# Reshape the DataFrame so that both Team1 and Team2 are treated as teams with their respective wins
team1_wins = playoff_results[['Yr', 'Team1', 'Team1wins']].rename(columns={'Team1': 'Team', 'Team1wins': 'wins'})
team2_wins = playoff_results[['Yr', 'Team2', 'Team2wins']].rename(columns={'Team2': 'Team', 'Team2wins': 'wins'})

# Combine both team1 and team2 data into one DataFrame
combined_wins = pd.concat([team1_wins, team2_wins])

# Group by Year and Team, and sum the wins for each team
team_totals = combined_wins.groupby(['Yr', 'Team'], as_index=False)['wins'].sum()
team_totals.sample(10)

Unnamed: 0,Yr,Team,wins
21,2017,HOU,6
56,2019,MIL,10
125,2024,BOS,16
75,2020,PHI,0
49,2019,BRO,1
16,2017,ATL,2
116,2023,LOS,9
37,2018,MIA,1
50,2019,DEN,7
47,2018,WAS,2


In [60]:
team_totals['Yr'] = team_totals['Yr'].astype(str)

# dictionary to hold the merged DataFrames
merged_season_data = {}

# loop through each season in season_data_frames
for season, df in season_data_frames.items():
    # Extract the year from the season string (ex. '2016-17' -> '2016')
    year = season.split('-')[0]
    
    # Merge the current season DataFrame with team_totals 
    merged_df = df.merge(team_totals, how='left', left_on=['TEAM_NAME'], right_on=['Team'])
    merged_df = merged_df[merged_df['Yr'] == year] # Filter rows where 'Yr' matches the current year
    
    merged_df = merged_df.drop(columns=['Yr', 'Team'], errors='ignore') # Drop unnecessary columns and duplicates
    
    merged_season_data[season] = merged_df #add merged DataFrame to the dictionary

all_merged_data = pd.DataFrame()
for season, df in season_data_frames.items():
    merged_df = df.merge(team_totals, how='left', left_on='TEAM_ABBREVIATION', right_on='Team')
    all_merged_data = pd.concat([all_merged_data, merged_df], ignore_index=True)

all_merged_data.head()
all_merged_data.columns

Index(['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS',
       'PLUS_MINUS', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK',
       'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK',
       'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK',
       'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK',
       'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK',
       'TEAM_ABBREVIATION', 'Yr', 'Team', 'wins'],
      dtype='object')

In [41]:
#CORRELATIONS FOR EACH FEATURE AND PLAYOFF WINS

# List of columns to exclude (categorical columns)
exclude_columns = ['TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'Yr', 'Team']

# Select only numerical columns (excluding categorical ones)
numerical_columns = all_merged_data.drop(columns=exclude_columns)

# Apply absolute value to all rank columns, excluding TOV_RANK
for col in numerical_columns.columns:
    if col.endswith('_RANK') and col != 'TOV_RANK':
        numerical_columns[col] = numerical_columns[col].abs()

# Calculate correlation between each numerical column and 'wins'
correlation_with_wins = numerical_columns.corr()['wins']

# Sort the correlations from highest to lowest
sorted_correlation = correlation_with_wins.sort_values(ascending=False)

# Print or store the sorted correlation results
print(sorted_correlation)


wins               1.000000
W_PCT              0.197795
PLUS_MINUS         0.196656
W                  0.158908
FG3M               0.142577
FG3_PCT            0.122367
FG3A               0.116746
FG_PCT             0.096183
MIN                0.087486
PTS                0.073994
FTA_RANK           0.064780
DREB               0.064030
AST                0.060469
FGA_RANK           0.059587
BLK_RANK           0.054659
FTM_RANK           0.041975
FGM                0.037087
REB                0.035076
STL_RANK           0.031223
OREB_RANK          0.029431
GP_RANK            0.024879
FT_PCT             0.022826
PFD_RANK           0.004420
GP                -0.001187
PFD               -0.001286
FT_PCT_RANK       -0.025254
FTM               -0.025305
STL               -0.031195
FTA               -0.035996
OREB              -0.039076
FGA               -0.042099
FGM_RANK          -0.044630
REB_RANK          -0.052202
AST_RANK          -0.070324
MIN_RANK          -0.071369
BLK               -0

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Assuming your dataframe is called 'all_merged_data'
df = all_merged_data.copy()

# fill missing values in the wins col with the mean
imputer = SimpleImputer(strategy='mean')
df['wins'] = imputer.fit_transform(df[['wins']])


X = df.drop(columns=['wins', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'Yr', 'Team'])  # Drop non-numeric columns
y = df['wins']

# split the data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# training initial random forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# evaluating the initial model
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Initial Model - Mean Squared Error: {mse}")
print(f"Initial Model - R²: {r2}")

# selecting most correlated features 
selector = SelectFromModel(rf_model, threshold="mean", max_features=10, importance_getter="auto")
X_train_selected = selector.fit_transform(X_train, y_train)

# training the model again with selected features
rf_model.fit(X_train_selected, y_train)

# evaluating model with selected features
X_test_selected = selector.transform(X_test)
y_pred_selected = rf_model.predict(X_test_selected)

# evaluation of the selected feature model
mse_selected = mean_squared_error(y_test, y_pred_selected)
r2_selected = r2_score(y_test, y_pred_selected)

print(f"Selected Features - Mean Squared Error: {mse_selected}")
print(f"Selected Features - R²: {r2_selected}")




Initial Model - Mean Squared Error: 23.131884256508275
Initial Model - R²: -0.06274061287831723
Selected Features - Mean Squared Error: 23.10732171581938
Selected Features - R²: -0.061612144083639686


In [31]:
reddit_summary = pd.read_csv('data/reddit_nbathunder.csv')

In [32]:
print(f"reddit_summary_columns: {reddit_summary.columns}")

reddit_summary_columns: Index(['Title', 'ID', 'Author', 'Name', 'Author Flair Text', '# Comments',
       'Time', '# Upvotes', 'Link', 'Upvote Ratio'],
      dtype='object')


In [33]:
reddit_summary.head()

Unnamed: 0,Title,ID,Author,Name,Author Flair Text,# Comments,Time,# Upvotes,Link,Upvote Ratio
0,Daily Discussion Thread + Game Thread Index,1hybybh,NBA_MOD,t3_1hybybh,r/NBA,5,1736536000.0,9,https://www.reddit.com/r/nba/comments/1hybybh/...,0.92
1,Weekly Friday Self-Promotion and Fan Art Thread,1hy3vjo,NBA_MOD,t3_1hy3vjo,r/NBA,0,1736514000.0,6,https://www.reddit.com/r/nba/comments/1hy3vjo/...,0.88
2,Jimmy Butler on his Instagram story to his Big...,1hy64xk,YujiDomainExpansion,t3_1hy64xk,,1476,1736521000.0,11827,https://streamable.com/9zqmf3,0.93
3,"Jarrett Allen explains Ethical Basketball: ""Fa...",1hxwrre,2131andBeyond,t3_1hxwrre,:cle-5: Cavaliers,409,1736485000.0,9988,https://streamable.com/uo94x4,0.97
4,Joe Mazzulla goes in depth on what his typical...,1hxy23z,SliMShady55222,t3_1hxy23z,:sea-1: Supersonics,252,1736490000.0,6170,https://streamable.com/u9yy46,0.98


In [37]:
import praw
from textblob import TextBlob
import pandas as pd

# Setup PRAW Reddit API client with your credentials
reddit = praw.Reddit(client_id='your_client_id', 
                     client_secret='your_client_secret',
                     user_agent='your_user_agent')


# Function to fetch Reddit post content
def get_reddit_post_content(url):
    try:
        post = reddit.submission(url=url)
        post_content = post.selftext  # Fetches the main content of the post
        return post_content
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to perform sentiment analysis using TextBlob
def get_sentiment_score(text):
    if text:
        blob = TextBlob(text)
        sentiment_score = blob.sentiment.polarity  # Returns a score between -1 (negative) and 1 (positive)
        return sentiment_score
    return None

# Extract content and perform sentiment analysis
reddit_summary['Post Content'] = reddit_summary['Link'].apply(get_reddit_post_content)
reddit_summary['Sentiment Score'] = reddit_summary['Post Content'].apply(get_sentiment_score)

# Show the DataFrame with sentiment scores
print(reddit_summary[['Link', 'Sentiment Score']])


Error fetching https://www.reddit.com/r/nba/comments/1hybybh/...: received 401 HTTP response
Error fetching https://www.reddit.com/r/nba/comments/1hy3vjo/...: received 401 HTTP response
Error fetching https://www.reddit.com/r/nba/comments/1hy64xk/...: received 401 HTTP response
                                                Link Sentiment Score
0  https://www.reddit.com/r/nba/comments/1hybybh/...            None
1  https://www.reddit.com/r/nba/comments/1hy3vjo/...            None
2  https://www.reddit.com/r/nba/comments/1hy64xk/...            None
