### Introduction
The survey data is very messy and requires cleaning. A lot of questions were "open-ended", and users often formatted their responses differently, for example using different units. This notebook cleans the messy survey data, so that it can be easily analysed (see analysis.ipynb).

### Imports
All imports are included here:

In [1]:
import numpy as np
import pandas as pd

### Read in data

In [2]:
# Raw survey export. NOTE: this is an absolute path on the original author's
# machine; it has to be adjusted before the notebook can run anywhere else.
survey_xlsx_path = "C:/Users/Danie/OneDrive/Documents/jupyter_notebooks/climb_harder_analysis/climbharder_survey.xlsx"
df = pd.read_excel(survey_xlsx_path)

### Data cleaning
#### Rename columns

In [3]:
# Maps the raw survey question headers to short snake_case column names.
# Keys must match the spreadsheet headers character for character: several of
# them end in a space or contain a double space, and that whitespace is
# significant. With its default settings DataFrame.rename ignores keys that are
# not present, so a mismatched header is left un-renamed without any error.
colname_dict = {
    # Respondent details
    "Timestamp": "timestamp",
    "Sex": "sex",
    "Height (cm)": "height_cm",
    "Weight (KG)": "weight_kg",
    "Arm Span (cm)": "arm_span_cm",
    "How long have you been climbing for?": "climbing_years",
    "Where do you climb?": "indoor_outdoor",
    # Bouldering and route grades
    "Hardest V Grade ever climbed ": "max_boulder_grade",
    "Hardest V Grade climbed in the Last 3 months": "max_boulder_grade_last_3_months",
    "The V grade you can send 90-100% of routes ": "consistently_send_boulder_grade",
    "Hardest Route grade climbed (Ewbank grade) ": "max_route_grade",
    "Hardest route climbed last 3 months (ewbank)": "max_route_grade_last_3_months",
    "Route grade you can send 90-100% of climbs": "consistently_send_route_grade",
    # Climbing and training volume
    "Frequency of climbing sessions per week": "climbing_frequency",
    "Average hours climbing per week (not including training)": "climbing_hours",
    "Average hours Training for climbing per week ": "training_hours",
    # Hangboarding
    "Hangboard Frequency per week ": "hangboard_frequency",
    "Hangboard grips used ": "hangboard_grips_trained",
    "Style of Hangboarding chosen ": "hangboarding_style",
    "Max Weight hangboard 18mm edge - Half crimp (KG)  (10 seconds) (added weight only)": "max_18_mm_hang_half_crimp_kg",
    "Max Weight hangboard 18mm edge - open crimp (KG) (10 seconds)  (added weight only)": "max_18_mm_hang_open_crimp_kg",
    "Min Edge used (mm, +kg if weight added ) - Half Crimp (10 seconds)": "min_edge_half_crimp_mm",
    "Min Edge used (mm, +kg if weight added) - Open crimp (10 seconds) ": "min_edge_open_crimp_mm",
    # Campus board, endurance and general strength training
    "Campus Board frequency per week ": "campus_frequency",
    "Campus Board time per week (hours)": "campus_hours",
    "Frequency of Endurance training sesions per week": "endurance_frequency",
    "Endurance training ": "endurance_style",
    # NOTE: "general_stength_frequency" (sic) is the name that ends up in the
    # cleaned data; correcting the spelling here would rename the output column.
    "General Strength Training frequency per week ": "general_stength_frequency",
    "Time spent General strength training (hours)": "general_strength_hours",
    "Type of Strength training": "general_strength_style",
    "Other activities (ie yoga, cardio)": "other_training",
    # Strength benchmarks
    "Max pull up reps": "max_pull_ups",
    "5 rep max weighted pull ups": "pull_up_5_rep_max_kg",
    "max push ups reps": "max_push_ups",
    "max L-sit time ": "max_l_sit_s",
}

# Apply the mapping; columns not listed above keep their original names.
df = df.rename(columns = colname_dict)

#### Replacing typos, and forcing consistent formatting.

In [4]:
def _climbing_years_midpoint(answer):
    """Convert one climbing-experience answer into a number of years.

    A range written as "<low> - <high> years" becomes the mean of its bounds,
    "More than 15 years" becomes 15.25, an answer that is already numeric is
    returned as a float and a missing answer stays missing (NaN).

    Raises ValueError for text that is not a number or a " - " separated range,
    so unexpected answers are surfaced instead of being silently discarded.
    """
    if not isinstance(answer, str):
        # Missing answers stay NaN; values already stored as numbers are kept.
        return np.nan if pd.isna(answer) else float(answer)
    text = answer.strip()
    # Remove the unit as a whole suffix. str.rstrip(" years") would instead strip
    # any trailing run of the characters " ", "y", "e", "a", "r" and "s".
    for suffix in ("years", "year"):
        if text.endswith(suffix):
            text = text[: -len(suffix)].rstrip()
            break
    if text == "More than 15":
        return 15.25  # Set >15 to 15.25 (not ideal, but sensible)
    # Lots of categories, so treat as a regression variable using the midpoint.
    return np.mean([float(bound) for bound in text.split(" - ")])


# Encode sex as a boolean. Every answer other than exactly "Male" (including a
# missing one) becomes False.
df["sex"] = df["sex"] == "Male"  # Males == 1/True, Females == 0/False

# Clean height_cm: fix the one free-text answer, drop "cm" units, convert to float.
df["height_cm"] = df["height_cm"].replace("5 ft 8inches. Im amurican i dont know what centimeters are", 173) # Help the confused American.
df["height_cm"] = df["height_cm"].replace("cm", "", regex = True).astype(str).str.strip()
df["height_cm"] = df["height_cm"].astype(float)

# Clean weight_kg: same approach as height.
replacements_dict = {"135 pounds....so....65 kg?": 61, "82,5": 83, "51-53...": 52, "~55": 55} # The same confused American
df["weight_kg"] = df["weight_kg"].replace(replacements_dict)
df["weight_kg"] = df["weight_kg"].replace("kg", "", regex = True).astype(str).str.strip()
df["weight_kg"] = df["weight_kg"].astype(float)

# Clean arm_span_cm. The column is left as stripped text here (missing values
# become the string "nan"); it is converted to float later in the notebook.
replacements_dict = {"161??": 161, "5 ft 10 inches": 178}
df["arm_span_cm"] = df["arm_span_cm"].replace(replacements_dict)
# Probably better to use an explicit list rather than extracting using
# isnumeric() == False, as it's better to throw an error than accidentally
# convert data that could be useful to np.nan.
unknown_arm_span_answers = ["-", "Dont know", "no idea", "?", "Not sure", "don't know",
                            "unknown", "Unknown", "???", "**", "idk", "dunno", "Don't know"]
df["arm_span_cm"] = df["arm_span_cm"].replace(unknown_arm_span_answers, np.nan)
df["arm_span_cm"] = df["arm_span_cm"].replace("cm", "", regex = True).astype(str).str.strip()

# Clean climbing_years: turn each categorical answer into a number of years.
df["climbing_years"] = df["climbing_years"].apply(_climbing_years_midpoint)

# Strip leading V in V grades and add np.nan for those that don't boulder
for col in ["max_boulder_grade", "max_boulder_grade_last_3_months", "consistently_send_boulder_grade"]:
    df[col] = df[col].str.lstrip("V")
    df[col] = df[col].replace("I don't boulder", np.nan)
    df[col] = pd.to_numeric(df[col]).astype("Int32")  # nullable integer keeps the missing values

# Add np.nan for those that don't route climb
for col in ["max_route_grade", "max_route_grade_last_3_months", "consistently_send_route_grade"]:
    df[col] = df[col].replace("I don't climb routes", np.nan)
    df[col] = pd.to_numeric(df[col]).astype("Int32")

# Deal with messy max pull ups data
replacements_dict = {
    ">20": 20, "12?  I don't work on bodyweight pullups for reps.": 12,
    "3 x 8": 8, "15-20": 18, "20?": 20, "maybe 5": 5, "15?": 15,
    "5, maybe, not sure": 5,"15+": 15, "Not sure... probably 12-15?": 14,
    "approx 25": 25, "20+": 20, "25ish": 25, "8-12": 10, "7?? ": 7}

df['max_pull_ups'] = df['max_pull_ups'].replace(replacements_dict)
# Blank out any remaining text answer that is not purely numeric. Entries that
# are not strings give NaN from .str.isnumeric(), and NaN == False is False, so
# values already stored as numbers are left untouched.
df.loc[df['max_pull_ups'].str.isnumeric() == False, 'max_pull_ups'] = np.nan

# Lots of missing cases (and a measure of strength not a training technique, so less interested)
df = df.drop(columns=["max_18_mm_hang_half_crimp_kg", "max_18_mm_hang_open_crimp_kg", "min_edge_half_crimp_mm",
                      "min_edge_open_crimp_mm", "pull_up_5_rep_max_kg", "max_push_ups", "max_l_sit_s"])

### Extract some basic features
A few columns are comma-separated strings listing the training activities each individual does. We'll write a function that expands such a column into a separate boolean column for each activity.

In [5]:
def comma_sep_to_bool_cols(df, col_to_split, check_string_contains):
    """Expand a column of comma-separated text into one flag column per item.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is copied, so the caller's frame is not modified.
    col_to_split : pandas.Series
        The text column to search. Its ``name`` must be a column of ``df``,
        because that column is dropped from the result.
    check_string_contains : list of str
        Items to look for. Each one is matched as a literal, case-sensitive
        substring of the text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the ``col_to_split`` column and with one new
        column per item, named ``trains_<item>`` (item lower-cased, spaces
        replaced by underscores). Missing entries in ``col_to_split`` are not
        filled here; they get whatever ``Series.str.contains`` yields by default.
    """
    df = df.copy()  # Small dataset so not worried about speed hit
    for label in check_string_contains:
        colname = "trains_{}".format(label.lower().replace(" ", "_"))
        # regex=False: labels are plain text, so characters such as "+", "(" or
        # "." must not be interpreted as regular-expression syntax.
        df[colname] = col_to_split.str.contains(label, regex=False)
    df = df.drop(columns=col_to_split.name)
    return df

In [6]:
# Separate out the comma-separated answer columns into one boolean column per option
grip_list = ["Full Crimp", "Half Crimp", "Open Crimp", "Front 3", "Back 3", "Front 2", "Middle 2", "Back 2", "Slopers", "Pinch", "Monos"]
df = comma_sep_to_bool_cols(df, df["hangboard_grips_trained"], grip_list)

hangboarding_style = ["Repeaters", "Max weight", "Min Edge", "One arm hang", "no hangs", "Other protocol"]
df = comma_sep_to_bool_cols(df, df["hangboarding_style"], hangboarding_style)

df["endurance_style"] = df["endurance_style"].fillna("None") # Assume those that didn't answer did no training
endurance_style = ["Laps of routes", "4x4", "ARC", "systems boards", "max moves", "route climbing intervals", "threshold intervals"]
df = comma_sep_to_bool_cols(df, df["endurance_style"], endurance_style)

strength_style = ["Antagonists", "Legs", "Core", "Upper body pulling", "Upper body pushing"]
df = comma_sep_to_bool_cols(df, df["general_strength_style"], strength_style)

# Convert indoor_outdoor into two booleans: climbs_indoors and climbs_outdoors
df["climbs_indoors"] = df["indoor_outdoor"].isin(["Indoor and outdoor climbing", "Indoor Climbing only"])
df["climbs_outdoors"] = df["indoor_outdoor"].isin(["Indoor and outdoor climbing", "Outdoor Climbing only"])
df = df.drop(columns = "indoor_outdoor")

# Extract whether someone consistently said they bouldered.
# The earlier cleaning cell has already replaced "I don't boulder" with a
# missing value and cast these columns to nullable integers, so comparing
# against that text here could never be False. A respondent counts as a
# boulderer when all three grade questions hold a grade. Caveat: a question
# that was simply left unanswered is indistinguishable from "I don't boulder"
# at this point and is treated the same way.
boulders_bool_df = df[["max_boulder_grade", "max_boulder_grade_last_3_months", "consistently_send_boulder_grade"]].notna()
df["boulders"] = boulders_bool_df.all(axis="columns")

# Extract whether someone consistently said they route climbed (same reasoning
# and caveat as for bouldering: "I don't climb routes" is already a missing value).
routes_bool_df = df[["max_route_grade", "max_route_grade_last_3_months", "consistently_send_route_grade"]].notna()
route_climbs = routes_bool_df.all(axis="columns")
df["route_climbs"] = route_climbs

# other_training had open answers, we'll extract some of the most common ones into cardio and yoga
df["other_training"] = df["other_training"].str.lower()
df["other_training"] = df["other_training"].fillna("None")  # Assume those that didn't answer did no other activities
df["other_training"] = df["other_training"].replace("n/a", "None")
df["cardio"] = df["other_training"].str.contains("cardio|jogging|cycling|running|soccer|mountain biking|bike|badminton", regex=True, na=False)
df["yoga"] = df["other_training"].str.contains("yoga|stretch", regex=True, na=False)
df = df.drop(columns = "other_training")

# Typos and wrong units
df["height_cm"] = df["height_cm"].replace({1.67: 167, 1.68: 168}) # Answered in meters rather than cm.
df["arm_span_cm"] = df["arm_span_cm"].astype(float)

# A height of 62 is clearly wrong, but that respondent had a reasonable arm
# span, so use it as the height. Done with a boolean mask and .loc: passing a
# Series as the replacement value is not a supported form of Series.replace.
height_is_62 = df["height_cm"] == 62
df.loc[height_is_62, "height_cm"] = df.loc[height_is_62, "arm_span_cm"]

# Not sure what units the person answering 1295 was using. Fill with the mean
# of the other heights; the outlier itself is excluded so it cannot inflate
# the value used to replace it.
height_is_1295 = df["height_cm"] == 1295
df.loc[height_is_1295, "height_cm"] = df.loc[~height_is_1295, "height_cm"].mean()

df["arm_span_cm"] = df["arm_span_cm"].replace(1.68, 168) # Answered in meters

# Some people clearly measured arm span wrong (perhaps measuring one arm):
# discard arm spans that differ from height by more than 50 cm.
bool_list = abs(df["height_cm"].astype(float) - df["arm_span_cm"].astype(float)) > 50
df.loc[bool_list, "arm_span_cm"] = np.nan

# Some people definitely measured weight in pounds unfortunately. We'll try and
# correct for this: values below 120 are kept, anything else is treated as
# pounds and divided by 2.20462 (missing weights stay missing).
weight = df["weight_kg"]
df["weight_kg"] = weight.where(weight < 120, weight / 2.20462)

# Not interested in following variables, I want to predict max bouldering ability from training strategy, height and sex etc. factors.
df = df.drop(columns= ["timestamp", "max_route_grade", "max_route_grade_last_3_months", "consistently_send_route_grade",
                       "max_boulder_grade_last_3_months", "consistently_send_boulder_grade"])

In [7]:
# Save the cleaned frame, without the index, to a path relative to the
# current working directory.
cleaned_csv_path = "cleaned_data.csv"
df.to_csv(cleaned_csv_path, index=False)