In [6]:
import numpy as np
import pandas as pd
import pickleshare
import sklearn
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
import importlib
import climbing_ticks_helper as helper
importlib.reload(helper)
pd.set_option('display.expand_frame_repr', False) # display full data in terminal
%matplotlib inline


In [7]:

df = pd.read_csv('/app/model_ready_ticks.csv')

# first take at a Random Forest model:
X = df.drop(columns=['Attempts', 'Lead Style'])
y = df[['Attempts', 'Lead Style']]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_pred_df = pd.DataFrame(y_pred, columns=y_test.columns)

# compute accuracy, recall, precision, f1 score for multi class predictions
# Generate classification reports for each response variable
for resp in y_test.columns:
    print(f"Classification report for response variable {resp}:")
    # don't print macro avg or weighted avg
    cr = classification_report(y_test[resp], y_pred_df[resp], zero_division=0, output_dict=True)
    cr.pop('macro avg', None)
    cr.pop('weighted avg', None)
    prettytable = pd.DataFrame(cr).T
    print(prettytable.round(2))
    print("\n");


Classification report for response variable Attempts:
          precision  recall  f1-score  support
1              0.88    0.98      0.93   182.00
2              0.00    0.00      0.00    18.00
3              0.00    0.00      0.00     3.00
4              0.00    0.00      0.00     2.00
5              0.00    0.00      0.00     1.00
accuracy       0.86    0.86      0.86     0.86


Classification report for response variable Lead Style:
          precision  recall  f1-score  support
0              0.87    0.91      0.89   153.00
1              0.00    0.00      0.00     8.00
2              0.62    0.62      0.62    45.00
accuracy       0.81    0.81      0.81     0.81




In [8]:
df_readable = pd.read_csv('/app/grouped_ticks.csv')
df_readable = df_readable[df_readable['RouteID'].isin(X_test['RouteID'])].set_index('RouteID').loc[X_test['RouteID']].reset_index()
df_combined = helper.combine_predictions_with_data(df_readable, y_pred_df)

# print all rows that got Lead Style wrong
wrong_preds = df_combined[df_combined['Lead Style'] != df_combined['Predicted Lead Style']]
right_preds = df_combined
print(wrong_preds)

                           Route    RouteID        Date Route Type  Alpine Safety  Avg Stars  Pitches   Rating Predicted Lead Style     Lead Style  Predicted Attempts  Attempts
0             Bolt From the Blue  106784951  2024-03-07      Sport       0      G        3.7        1    5.12a            Fell/Hung  Onsight/Flash                   1         1
4                       Fantasia  105753880  2024-11-30      Sport       0      G        3.1        1    5.11c            Fell/Hung  Onsight/Flash                   1         1
5         The Richness of It All  107019607  2023-11-05      Sport       0      G        3.7        4    5.12a            Fell/Hung       Redpoint                   1         1
6                    Moscow Mule  114140845  2022-06-19      Sport       0      G        2.3        1    5.11d            Fell/Hung  Onsight/Flash                   1         1
8         Mrs. Hen Places a Peck  105748591  2023-03-19      Sport       0      G        2.7        1    5.11d     

In [10]:
# TODO:
# * categorize predictions that are close to the correct answer. EG: predicting an Onsight/Flash when the actual is a Redpoint with only 2 attempts isn't that bad. but predicting an Onsight/Flash when the actual is a Fell/Hung with 5 attempts is bad.
# * categorize predictions that though I woudl do better, vs predictions that thought I would do worse

def store_vars_for_sharing():
    %store rf
    %store X_train
    %store X_test
    %store y_train
    %store y_test
    %store df
    %store df_readable

store_vars_for_sharing()

Stored 'rf' (RandomForestClassifier)
Stored 'X_train' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_train' (DataFrame)
Stored 'y_test' (DataFrame)
Stored 'df' (DataFrame)
Stored 'df_readable' (DataFrame)
