In [89]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier


In [None]:
#function to solve the math-expression problem (optional)

def average_damage(dice_notation):
    
    parts = dice_notation.split('d')
    rolls = int(parts[0]) if parts[0] else 1
    sides_modifier = parts[1].split('+')

    sides = int(sides_modifier[0])
    modifier = int(sides_modifier[1]) if len(sides_modifier) > 1 else 0

    total_damage = 0
    num_simulations = 333  

    for _ in range(num_simulations):
        damage = sum(random.randint(1, sides) for _ in range(rolls))
        total_damage += damage

    average = (total_damage / num_simulations) + modifier
    return average

In [None]:
# dice = '6d6+2'
# avg = average_damage(dice)
# print(f'Average damage for {dice} is {avg}')


In [4]:
def wrangler(df):

    df['Rarity'] = df['Rarity'].str.replace('Rank ', '').astype(float) # take out the word 'Rank', the whitespace, and turn into integers
    df = df.drop(axis=1, columns=['Timestamp', 'Unnamed: 0', 'Damage', 'Name', 'Type']) # drop the Timestamp, Unnamed, Name, Type, and Damage columns
    # df['Damage'] = df['Damage'].apply(average_damage) # apply the average_damage function to the Damage column (optional)

    return df

In [5]:
data = pd.read_csv('C:/users/fradi/onedrive/desktop/bandersnatch.csv')
df = wrangler(data)

df

Unnamed: 0,Level,Rarity,Health,Energy,Sanity
0,6,2.0,36.69,37.55,35.67
1,3,1.0,11.37,10.09,11.48
2,2,2.0,14.79,12.92,11.12
3,7,1.0,26.34,28.46,27.39
4,4,4.0,36.96,44.71,40.04
...,...,...,...,...,...
3995,14,0.0,28.06,28.43,28.31
3996,11,0.0,21.14,21.42,21.57
3997,8,0.0,15.58,15.81,16.74
3998,16,1.0,62.97,64.46,62.49


In [6]:
#SPLIT

target = 'Rarity'
X = df.drop(columns = target, axis = 1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [52]:
baseline_acc = y.value_counts(normalize = True).max()
print('Baseline Accuracy:', baseline_acc)

Baseline Accuracy: 0.30525


In [64]:
model_bag = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    RandomForestClassifier(
        random_state=6, 
        n_estimators=9, 
        n_jobs=-1,
        max_depth=19
    )
)               

model_bag.fit(X_train, y_train);

print('Bag Model')
print('Training accuracy:', model_bag.score(X_train, y_train))
print('Validation accuracy:', model_bag.score(X_test, y_test))

Bag Model
Training accuracy: 0.339375
Validation accuracy: 0.28875


In [76]:
model_boost = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    XGBClassifier(
        random_state=6, 
        n_estimators=9, 
        n_jobs=-1,
        max_depth=3
    )
)
model_boost.fit(X_train, y_train);

print('Boost Model')
print('Training accuracy:', model_boost.score(X_train, y_train))
print('Validation accuracy:', model_boost.score(X_test, y_test))

Boost Model
Training accuracy: 0.3203125
Validation accuracy: 0.2875


In [93]:
model_boost2 = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    GradientBoostingClassifier(
        random_state=11, 
        n_estimators=11, 
        max_depth=88
    )
)
model_boost2.fit(X_train, y_train);

print('Boost2 Model')
print('Training accuracy:', model_boost2.score(X_train, y_train))
print('Validation accuracy:', model_boost2.score(X_test, y_test))

Boost2 Model
Training accuracy: 0.99125
Validation accuracy: 0.30625


In [97]:
param_grid = {
    'gradientboostingclassifier__n_estimators': [1, 5, 11],
    'gradientboostingclassifier__max_depth': [20, 50, 90]
}

grid_search = GridSearchCV(model_boost2, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
