In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import os

# Set display options
sns.set(style="whitegrid")

In [3]:
# Directory containing the game log files
data_dir = '../data/'

# List all files in the directory
files = os.listdir(data_dir)

In [4]:
# Filter out the game log files
game_log_files = [f for f in files if f.endswith('_game_log.csv')]

# Initialize an empty list to hold DataFrames
df_list = []

# Loop through each file and read it into a DataFrame
for file in game_log_files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    df_list.append(df)
    
# Concatenate all DataFrames into a single DataFrame
game_log = pd.concat(df_list, ignore_index=True)

# Load the total team scoring data
total_team_scoring = pd.read_csv(os.path.join(data_dir, 'total_team_scoring.csv'))

# Load sentiment data
sentiment_data = pd.read_csv(os.path.join(data_dir, 'nba_tweets.csv'))

EmptyDataError: No columns to parse from file

In [5]:
# Set display options
sns.set(style="whitegrid")

# Display the first few rows of each dataset for inspection
print("Game Log Data:")
display(game_log.head())

print("Total Team Scoring Data:")
display(total_team_scoring.head())

print("Sentiment Data:")
display(sentiment_data.head())

Game Log Data:


Unnamed: 0,Date,Opponent,OffPoss,Points,FG2M,FG2A,Fg2Pct,FG3M,FG3A,Fg3Pct,...,Assisted3sPct,FG3APct,ShotQualityAvg,EfgPct,TsPct,PtsPutbacks,Fg2aBlocked,FG2APctBlocked,Fg3aBlocked,FG3APctBlocked
0,2023-10-25,CHA,104,110,34,64,0.53125,5,29,0.172414,...,1.0,0.311828,0.534682,0.446237,0.514019,6,3,0.046875,0,0.0
1,2023-10-27,NYK,103,120,30,55,0.545455,12,32,0.375,...,0.833333,0.367816,0.587675,0.551724,0.594059,8,6,0.109091,0,0.0
2,2023-10-29,MIL,102,127,32,56,0.571429,15,37,0.405405,...,0.8,0.397849,0.534029,0.586022,0.61165,8,3,0.053571,0,0.0
3,2023-10-30,MIN,97,127,34,56,0.607143,14,30,0.466667,...,0.714286,0.348837,0.561111,0.639535,0.672043,4,5,0.089286,0,0.0
4,2023-11-01,WAS,110,130,37,60,0.616667,9,32,0.28125,...,1.0,0.347826,0.567445,0.548913,0.619048,14,3,0.05,0,0.0


Total Team Scoring Data:


Unnamed: 0,Name,GamesPlayed,OffPoss,Points,FG2M,FG2A,Fg2Pct,FG3M,FG3A,Fg3Pct,...,Assisted3sPct,FG3APct,ShotQualityAvg,EfgPct,TsPct,PtsPutbacks,Fg2aBlocked,FG2APctBlocked,Fg3aBlocked,FG3APctBlocked
0,DEN,82,7913,9418,2652,4719,0.561983,958,2560,0.374219,...,0.864301,0.351697,0.535417,0.561753,0.588331,496,385,0.081585,9,0.003516
1,GSW,82,8174,9657,2371,4324,0.548335,1211,3191,0.379505,...,0.810074,0.424617,0.5199,0.557219,0.586236,390,378,0.087419,29,0.009088
2,PHX,82,8076,9532,2462,4392,0.560565,1020,2671,0.381879,...,0.863725,0.378168,0.531708,0.565199,0.601447,290,349,0.079463,23,0.008611
3,MIA,82,7891,9032,2244,4257,0.527132,1022,2765,0.36962,...,0.892368,0.393762,0.520038,0.537881,0.575577,326,358,0.084097,30,0.01085
4,LAL,82,8301,9679,2611,4605,0.566992,969,2572,0.37675,...,0.822497,0.358367,0.534564,0.566323,0.601949,372,373,0.080999,17,0.00661


Sentiment Data:


NameError: name 'sentiment_data' is not defined

In [6]:
# Data Cleaning
# Drop duplicates
game_log.drop_duplicates(inplace=True)
total_team_scoring.drop_duplicates(inplace=True)
sentiment_data.drop_duplicates(inplace=True)

# Convert date columns to datetime
game_log['date'] = pd.to_datetime(game_log['date'])
total_team_scoring['date'] = pd.to_datetime(total_team_scoring['date'])
sentiment_data['created_at'] = pd.to_datetime(sentiment_data['created_at']).dt.date

# Handle missing data
total_team_scoring.fillna({'points_scored': 0}, inplace=True)

NameError: name 'sentiment_data' is not defined

In [7]:
# Aggregate sentiment data by date
sentiment_agg = sentiment_data.groupby('created_at').agg({
    'sentiment': lambda x: (x == 'positive').sum(),
    'sentiment': lambda x: (x == 'negative').sum(),
    'sentiment': lambda x: (x == 'neutral').sum()
}).reset_index()

sentiment_agg.rename(columns={'sentiment': 'positive', 'sentiment': 'negative', 'sentiment': 'neutral'}, inplace=True)
sentiment_agg['total'] = sentiment_agg[['positive', 'negative', 'neutral']].sum(axis=1)
sentiment_agg['positive_percentage'] = sentiment_agg['positive'] / sentiment_agg['total'] * 100
sentiment_agg['negative_percentage'] = sentiment_agg['negative'] / sentiment_agg['total'] * 100

# Merge sentiment data with team scoring data
merged_data = pd.merge(total_team_scoring, sentiment_agg, left_on='date', right_on='created_at', how='left')

# Data Analysis
# Calculate correlations
correlations = merged_data[['points_scored', 'positive_percentage', 'negative_percentage']].corr()

# Plot correlations
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix: Points Scored vs Sentiment Percentages')
plt.show()

print("Correlation Matrix:")
display(correlations)

NameError: name 'sentiment_data' is not defined