# Feature Engineering

In [None]:
import pandas as pd
import numpy as np

### Initialize Columns for our Features

In [None]:
# Load the game log. The "Unnamed: 0" column is dropped because it carries no
# feature information (presumably a row index written by an earlier export).
df = pd.read_csv("ArtifactDataset")
df = df.drop(columns="Unnamed: 0")

# Every team starts from a rating of 1500. The columns are created as floats
# because the Elo update writes fractional ratings back one cell at a time;
# starting from an integer column would force pandas to upcast on the first
# write, which is deprecated behaviour in recent pandas releases.
df['v_elo'] = 1500.0
df['h_elo'] = 1500.0

# Win streaks are whole-number counters, so integer columns are appropriate.
df['h_winstreak'] = 0
df['v_winstreak'] = 0


#### Creating ELO Feature as a Weighted Measure of Performance

In [None]:
def find_last_game_elo(df, index, team):
    """Look up the Elo recorded for ``team`` on its most recent earlier game.

    Rows are visited by position, walking backwards from ``index - 1`` to 0.
    The first row in which ``team`` appears decides the result: its ``v_elo``
    when the team was the visitor, its ``h_elo`` when it was the home side.

    Parameters:
    df (pandas.DataFrame): games with 'Visitor', 'Home', 'v_elo', 'h_elo'.
    index (int): position of the current game; only earlier rows are searched.
    team: the team identifier to look for.

    Returns:
    The stored rating from the matching row, or 1500 when the team has no
    earlier game.
    """
    for position in reversed(range(index)):
        previous = df.iloc[position]
        # The visitor side is checked first, mirroring the column order.
        if previous['Visitor'] == team:
            return previous['v_elo']
        if previous['Home'] == team:
            return previous['h_elo']
    # No earlier appearance: fall back to the starting rating.
    return 1500


In [None]:
def calculate_elo_game(df, index, game, visitor, home):
    """Compute both teams' ratings after a single game.

    Each team's rating going into the game is taken from its most recent
    earlier row (see ``find_last_game_elo``). The size of the adjustment grows
    with the goal difference and is damped slightly by the rating gap
    (visitor minus home); the same amount is added to one side and removed
    from the other.

    Parameters:
    df (pandas.DataFrame): the full game table, used for the rating lookup.
    index (int): position of ``game`` within ``df``.
    game: the row being scored; reads 'v_goals', 'h_goals', 'Winner', 'Visitor'.
    visitor: identifier of the visiting team.
    home: identifier of the home team.

    Returns:
    tuple: (visitor rating, home rating) after the game.
    """
    visitor_elo = find_last_game_elo(df, index, visitor)
    home_elo = find_last_game_elo(df, index, home)

    goal_margin = abs(game['v_goals'] - game['h_goals'])
    rating_gap = visitor_elo - home_elo
    mov_const = ((20 * (goal_margin + 1.5)) ** .8) / (7.5 + rating_gap * 0.0001)
    elo_change = mov_const / 100

    # Anything other than a visitor win is credited to the home team.
    visitor_won = game['Winner'] == game['Visitor']
    signed_change = elo_change if visitor_won else -elo_change

    return visitor_elo + signed_change, home_elo - signed_change


In [None]:
def update_elo(df):
    """Fill 'v_elo' and 'h_elo' with each team's rating after every game.

    Games are processed top to bottom. For every row the post-game ratings
    returned by ``calculate_elo_game`` are written back into that row, so that
    later rows can look them up as the teams' most recent ratings.

    Parameters:
    df (pandas.DataFrame): games in chronological row order, with 'Visitor',
        'Home', 'Winner', 'v_goals', 'h_goals', 'v_elo' and 'h_elo' columns.

    Returns:
    pandas.DataFrame: the same ``df`` object, modified in place.
    """
    # Ratings are fractional. Make the target columns float up front so the
    # cell-by-cell writes below never have to upcast an integer column.
    for column in ('v_elo', 'h_elo'):
        df[column] = df[column].astype(float)

    # ``iterrows`` yields index labels, while the rating lookup walks rows by
    # position; track the position separately so both stay correct even when
    # the index is not the default 0..n-1 range.
    for position, (label, game) in enumerate(df.iterrows()):
        visitor_elo, home_elo = calculate_elo_game(
            df, position, game, game['Visitor'], game['Home']
        )
        df.at[label, 'v_elo'] = visitor_elo
        df.at[label, 'h_elo'] = home_elo
    return df
# Compute the Elo columns for the whole table; df is modified in place.
# NOTE(review): in a notebook the returned frame is also echoed as cell output.
update_elo(df)

In [None]:
def calculate_average(df, column1, column2):
    """
    Average two columns of a DataFrame into a single number.

    Each column is averaged on its own first (missing values are skipped),
    and the two column means are then averaged with equal weight.

    Parameters:
    df (pandas.DataFrame): the DataFrame holding both columns.
    column1 (str): the name of the first column.
    column2 (str): the name of the second column.

    Returns:
    float: the mean of the two per-column means.
    """
    per_column_means = df[[column1, column2]].mean()
    return per_column_means.mean()


#### Creating Winstreak Feature to Measure Recent Performance

In [None]:
def find_last_game(df, index, team):
    """Return the most recent row before ``index`` in which ``team`` played.

    Rows are examined by position from ``index - 1`` down to 0; a row matches
    when ``team`` is either its 'Visitor' or its 'Home'.

    Parameters:
    df (pandas.DataFrame): games with 'Visitor' and 'Home' columns.
    index (int): position of the current game; only earlier rows are searched.
    team: the team identifier to look for.

    Returns:
    pandas.Series or None: the matching row, or None when there is none.
    """
    position = index - 1
    while position >= 0:
        candidate = df.iloc[position]
        if any(candidate[side] == team for side in ('Visitor', 'Home')):
            return candidate
        position -= 1
    return None


In [None]:
def calculate_winstreak(df):
    """Fill 'h_winstreak' and 'v_winstreak' with each team's streak entering the game.

    A team's streak entering a game is the number of consecutive wins it has
    immediately before that game: 0 for its first game or after a loss, and
    one more than its previous entering streak after a win.

    The streak is tracked per team, independent of whether the team played
    its previous game at home or away. Reading the previous row's
    'v_winstreak' / 'h_winstreak' according to the team's side in the
    *current* game would pick up the opponent's streak whenever the team
    switched sides between games.

    Parameters:
    df (pandas.DataFrame): games in chronological row order, with 'Home',
        'Visitor' and 'Winner' columns.

    Returns:
    pandas.DataFrame: the same ``df`` object, with both streak columns set.
    """
    current_streak = {}  # team -> consecutive wins after its latest game
    home_streaks = []
    visitor_streaks = []

    for home, visitor, winner in zip(df['Home'], df['Visitor'], df['Winner']):
        # Record the streaks as they stand before this game is played.
        home_streaks.append(current_streak.get(home, 0))
        visitor_streaks.append(current_streak.get(visitor, 0))

        # Then fold this game's result in: a win extends the streak, anything
        # else resets it.
        for team in (home, visitor):
            if team == winner:
                current_streak[team] = current_streak.get(team, 0) + 1
            else:
                current_streak[team] = 0

    df['h_winstreak'] = home_streaks
    df['v_winstreak'] = visitor_streaks
    return df
# Compute the win-streak columns for the whole table; df is modified in place.
# NOTE(review): in a notebook the returned frame is also echoed as cell output.
calculate_winstreak(df)

In [None]:
# Persist the engineered features. The index is written too (to_csv default),
# which is why the file gains an unnamed leading column when read back.
df.to_csv("EngineeredData")

### Principal Component Analysis

#### Note: Didn't end up using PCA in creating the model, but it was fun to learn about

#### Preparation of Data for PCA

In [None]:
# Reload the saved features, then strip the team names, the outcome, the date
# and the saved index column before handing the table to the scaler.
engineered_data = pd.read_csv("EngineeredData")
raw_data = engineered_data.drop(
    ['Home', 'Visitor', 'Winner', 'Unnamed: 0', 'Date'],
    axis=1,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA



# scale the features
# Standardizing puts every column on a comparable scale before PCA, so that no
# feature dominates the components purely because of its units.
# NOTE: despite its name, scaled_df holds whatever fit_transform returns
# (a NumPy array by default), not a pandas DataFrame.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(raw_data)

# perform PCA
# n_components=10 requires raw_data to provide at least 10 feature columns
# (and at least 10 rows); principal_components has one column per component.
pca = PCA(n_components=10)
principal_components = pca.fit_transform(scaled_df)


#### Outputs the Fraction of Variance Explained by each Principal Component

In [None]:
# Share of the total variance captured by each fitted component, as fractions
# between 0 and 1 (not percentages), in component order.
explained_variance = pca.explained_variance_ratio_
explained_variance


#### Short Script Outputting the Original Feature with the Largest Absolute Coefficient (Loading) for each Principal Component

In [None]:
# Number of fitted principal components (one row of components_ each).
n_pcs = pca.components_.shape[0]

# For every component, the column position of the loading with the largest
# magnitude; the sign is ignored because only the strength matters here.
most_important = list(np.abs(pca.components_).argmax(axis=1))

initial_feature_names = list(raw_data.columns)
# get the names
most_important_names = [initial_feature_names[feature_idx] for feature_idx in most_important]

# Map each component to its dominant feature. Components are numbered from 1
# (PC1, PC2, ...) so the table agrees with the plot axis labels and the
# exported column names used elsewhere in this notebook.
dic = {'PC{}'.format(i + 1): name for i, name in enumerate(most_important_names)}

# build the dataframe (one row per component: label, dominant feature)
out = pd.DataFrame(list(dic.items()))
out

#### Color Coded Plot in 3 Dimensions - Blue is Home Team WIN, Red Visitor Team WIN

In [None]:
import matplotlib.pyplot as plt

# Coordinates of every game along the first three principal components.
x, y, z = (principal_components[:, axis] for axis in range(3))

# One colour per game: blue ('b') when the home team won, red ('r') otherwise.
colors = [
    'b' if winner == home else 'r'
    for winner, home in zip(engineered_data['Winner'], engineered_data['Home'])
]

# Set up a single 3-D axes and draw the games as a coloured scatter.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(x, y, z, c=colors, marker='o')

# Name each axis after the component it shows.
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')


#### Again, not used in the final model: this projects the original data onto the principal component axes and saves the result

In [None]:
# Wrap the projected coordinates in a DataFrame with one column per component,
# named PC1, PC2, ... The names are derived from the array's width so they
# always match the number of components that were actually fitted.
projected_data = pd.DataFrame(
    data=principal_components,
    columns=['PC{}'.format(i + 1) for i in range(principal_components.shape[1])],
)
# index=False keeps the row index out of the file, so no unnamed column
# appears when it is read back.
projected_data.to_csv("ProjectedData", index=False)