## nba

In [None]:
import pandas as pd

data = pd.read_csv('./04/nba_all_elo.csv')
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data["lg_id"].value_counts()

In [None]:
data["team_id"].value_counts()

In [None]:
data["fran_id"].value_counts()

In [None]:
data.loc[data["fran_id"] == "Lakers", "team_id"].value_counts()

In [None]:
data.loc[data["team_id"] == "MNL", "year_id"].max()

In [None]:
data_for_huskies = data[data["fran_id"] == "Huskies"]
data_for_huskies.loc[data_for_huskies["year_id"] == 1947, "pts"].sum()

In [None]:
data_for_huskies.axes

## loc / iloc

In [None]:
# Five letters stored under the odd integer labels 1, 3, 5, 7, 9, so that a
# label and a position never coincide for the same element.
vocals = pd.Series(data=list("abcde"), index=list(range(1, 10, 2)))
print(vocals)

In [None]:
# .loc selects by index label: the element stored under label 5.
vocals.loc[5]

In [None]:
# .iloc selects by integer position: the element at position 2 (the third one).
vocals.iloc[2]

In [None]:
# Negative positions count from the end, so this is the last row of the frame.
data.iloc[-1]

In [None]:
# Label-based slice of rows 1000 through 1005, limited to two columns.
# Unlike positional slicing, a .loc slice includes both end labels.
data.loc[1000:1005, ["fran_id", "team_id"]]

In [None]:
data_after_2012 = data[data["year_id"] > 2012]
data_after_2012.shape

In [None]:
data_with_notes = data[data["notes"].notnull()]
data_with_notes.shape

In [None]:
# game_id termina en CLE, cuantos son

data_game_id_ends_in_CLE = data[data["game_id"].str.endswith("CLE")]
data_game_id_ends_in_CLE.shape

In [None]:
data[
    (data["pts"] > 100) &
    (data["fran_id"] == "Lakers") & 
    (data["_iscopy"] == 0)
]

In [None]:
data["pts"].min()

In [None]:
data.groupby("fran_id", sort=True)["pts"].sum()

In [None]:
data_2 = data.copy()
data_2.shape

In [None]:
data["difference_in_points"] = data.pts - data.opp_pts

In [None]:
data.difference_in_points.head()

In [None]:
data_with_better_names = data.rename(
    columns = {
        "game_result": "result",
        "game_location": "location",
    }
)

In [None]:
data_with_better_names.info()

In [None]:
data_without_missing_data = data.dropna()

In [None]:
data_without_missing_data.shape

In [None]:
# Work on a copy so that `data` keeps its missing notes.
data_with_default_values = data.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on an intermediate object,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
data_with_default_values["notes"] = data_with_default_values["notes"].fillna(value="NO DATA")

In [None]:
data_with_default_values["notes"].describe()

In [None]:
%matplotlib inline
data[data["fran_id"] == "Lakers"].groupby("year_id")["pts"].sum().plot()

In [None]:
%matplotlib inline
data[data["fran_id"] == "Lakers"].groupby("year_id")["pts"].sum().plot(kind="bar")

## model

In [None]:
import pandas as pd

data = pd.read_csv('./04/nba_all_elo.csv')
data.head()

In [None]:
data.info()

In [None]:
data = data.drop(columns=['notes'])
data.info()

In [None]:
data = data.drop(columns=['gameorder', 'game_id'])
data.info()

In [None]:
data["lg_id"].value_counts()

In [None]:
data = data.drop(columns=['_iscopy', 'seasongame', 'fran_id', 'opp_fran'])
data.info()

In [None]:
data = data.drop(columns=['year_id'])
data.info()

In [None]:
data['date_game'] = pd.to_datetime(data['date_game'])

In [None]:
data.info()

In [None]:
print(data.select_dtypes(include="object").columns)

In [None]:
data['lg_id'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for x in ['lg_id', 'game_location', 'game_result']:
    data[x] = label_encoder.fit_transform(data[x].astype(str))

In [None]:
data.info()

In [None]:
print(data.select_dtypes(include="object").columns)

In [None]:
# data = pd.get_dummies(data, columns=['team_id', 'opp_id'], drop_first=True)
# data.head()

In [None]:
data = data.drop(columns=['team_id', 'opp_id'])
data.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the frame's pairwise column correlations, drawn on an
# explicitly created Axes rather than through implicit pyplot state.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
features = data[['pts', 'win_equiv', 'game_location', 'forecast']]
target = data.game_result

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

reg_log_model = LogisticRegression()
reg_log_model.fit(x_train, y_train)
reg_log_model_predictions = reg_log_model.predict(x_test)

print(confusion_matrix(y_test, reg_log_model_predictions))

In [None]:
print(classification_report(y_test, reg_log_model_predictions))

## exercises

In [None]:
import pandas as pd

data = pd.read_csv('./04/nba_all_elo.csv')
data.head()

In [None]:
%matplotlib inline
data[data['fran_id'] == "Celtics"].groupby("year_id")["pts"].min().plot()

In [None]:
data[(data['fran_id'] == "Celtics") & (data['game_result'] == "L")].groupby("year_id").size()

In [None]:
data[(data['fran_id'] == "Celtics") & (data['game_result'] == "W")].groupby("year_id").size()

In [None]:
data['date_game'] = pd.to_datetime(data['date_game'])
data.groupby('fran_id')["date_game"].max()