# Boulder recommendation  
## Similarity calculation taking into account ascents and boulder features (grade, styles)  
Ascents: Jaccard similarity  
Grades: Cosine similarity  
Styles: Dot product normalized with the highest style sharing count 

# Status: Recommendation system satifying, can be put in production

## SQLAlchemy session creation

In [1]:
import numpy as np
import pandas as pd

from sqlalchemy.orm import Session
from sqlalchemy import create_engine

DB_URL = "sqlite:///../../bleau_info-17-09-2025.db"

engine = create_engine(DB_URL, echo=False)

session = Session(engine)

In [2]:
import sys

sys.path.append("../../")

## Similarity matrix training **based on similar repetitors**

### Database Query

In [None]:
from sqlalchemy import select
from models.ascent import Ascent


ascents = session.execute(
    select(Ascent.user_id, Ascent.boulder_id)
).all()
ascents_df = pd.DataFrame(data=ascents, columns=["user_id", "id"])

OperationalError: (sqlite3.OperationalError) no such table: boulder_repetition
[SQL: SELECT boulder_repetition.user_id, boulder_repetition.boulder_id 
FROM boulder_repetition]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

### boulder_user matrix (Pivot table)

In [None]:
boulder_user_matrix = ascents_df.pivot_table(
    index="id",
    columns="user_id",
    aggfunc="size",
    fill_value=0,
    dropna=True,
)
# boulder_user_matrix = boulder_user_matrix[boulder_user_matrix.index < 20]
boulder_ids = boulder_user_matrix.index

In [None]:
# display(boulder_user_matrix)

user_id,1,2,3,4,5,6,7,8,9,10,...,7361,7362,7363,7364,7365,7366,7367,7368,7369,7371
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40231,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40232,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40234,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Conversion to sparse matrix

In [None]:
from scipy.sparse import coo_matrix

boulder_user_matrix = coo_matrix(boulder_user_matrix)
print(boulder_user_matrix)

<COOrdinate sparse matrix of dtype 'int64'
	with 520448 stored elements and shape (27665, 6853)>
  Coords	Values
  (0, 2)	1
  (1, 3)	1
  (1, 4)	1
  (1, 5)	1
  (1, 6)	1
  (2, 9)	1
  (2, 10)	1
  (3, 11)	1
  (3, 12)	1
  (3, 13)	1
  (3, 14)	1
  (3, 15)	1
  (3, 16)	1
  (4, 7)	1
  (4, 18)	1
  (4, 19)	1
  (5, 2)	1
  (6, 2)	1
  (6, 11)	1
  (6, 13)	1
  (6, 14)	1
  (6, 15)	1
  (6, 20)	1
  (6, 21)	1
  (6, 22)	1
  :	:
  (27654, 1284)	1
  (27655, 65)	1
  (27656, 65)	1
  (27656, 366)	1
  (27656, 961)	1
  (27656, 1171)	1
  (27656, 1284)	1
  (27656, 3175)	1
  (27656, 3820)	1
  (27656, 5378)	1
  (27657, 65)	1
  (27657, 1284)	1
  (27658, 65)	1
  (27658, 1284)	1
  (27659, 65)	1
  (27659, 1171)	1
  (27659, 1284)	1
  (27660, 65)	1
  (27660, 1284)	1
  (27661, 1284)	1
  (27662, 65)	1
  (27662, 1284)	1
  (27663, 1284)	1
  (27664, 65)	1
  (27664, 1284)	1


### Ascents similarity training

In [None]:
def jaccard_pairwise_similarity(X):
    # CSR matrix storing the number of shared ascents for each pair of
    # boulders sharing at least one ascent

    intersection = X @ X.T

    # 1D array storing the total number of ascent for each boulder
    row_sums = np.asarray(X.sum(axis=1)).ravel()

    # intersection decomposition for calculation on 1D arrays
    rows, cols = intersection.nonzero()
    intersection_data = intersection.data

    union = row_sums[rows] + row_sums[cols] - intersection_data

    jaccard = intersection_data / union

    # Index remapping based on the boulder ids
    new_rows = boulder_ids[rows]
    new_cols = boulder_ids[cols]

    return coo_matrix(
        (jaccard, (new_rows, new_cols)),
        dtype=np.float32,
    )

similarity_ascents = jaccard_pairwise_similarity(boulder_user_matrix)

sparsity = 1.0 - similarity_ascents.nnz / (
    similarity_ascents.shape[0] * similarity_ascents.shape[1]
)

print(f"Sparsity: {sparsity:.2%}")

Sparsity: 90.61%


In [None]:
# similarity_ascents_df = pd.DataFrame(similarity_ascents.toarray())
# display(similarity_ascents_df)

## Similarity matrix training **based on similar features**

### Database query and dataframe creation

In [None]:
from sqlalchemy.orm import joinedload
from models.boulder import Boulder

# Database request
boulders = (
    session.scalars(
        select(Boulder).options(
            joinedload(Boulder.grade), joinedload(Boulder.styles)
        )
    )
    .unique()
    .all()
)

# Data extraction
boulders = [
    {
        "id": boulder.id,
        "grade": boulder.grade.correspondence,
        "styles": [style.style for style in boulder.styles],
    }
    for boulder in boulders
]

# Dataframe setup
boulders_df = pd.DataFrame(boulders)
# boulders_df = boulders_df[boulders_df.id < 20]
boulder_ids = boulders_df.id
boulders_df.head()


Unnamed: 0,id,grade,styles
0,1,32,"[dévers, départ assis]"
1,2,0,[mur]
2,3,30,"[mur, aplats, réglettes]"
3,4,30,"[dévers, départ assis, traversée]"
4,5,29,"[boucle, surplomb]"


### Style

#### Binarizing

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
styles = mlb.fit_transform(boulders_df.styles)

# styles_df = pd.DataFrame(styles, columns=mlb.classes_)
# display(styles_df)

#### Conversion to sparse matrix

In [None]:
styles = coo_matrix(styles, dtype=np.float32)

#### Style similarity training

In [None]:
# Dot product
similarity_style = (styles @ styles.T).tocoo()

# Normalization
off_diag_max = similarity_style.data[
    similarity_style.row != similarity_style.col
].max()
similarity_style.data /= off_diag_max

# Re-indexing tp match database
new_shape = (similarity_style.shape[0] + 1, similarity_style.shape[1] + 1)
similarity_style = coo_matrix(
    (
        similarity_style.data,
        (similarity_style.row + 1, similarity_style.col + 1),
    ),
    shape=new_shape,
)


# Sparcity
sparsity = 1.0 - similarity_style.nnz / (
    similarity_style.shape[0] * similarity_style.shape[1]
)
print(f"Sparsity: {sparsity:.2%}")

# Dataframe tranformation
# similarity_style_df = pd.DataFrame(similarity_style.toarray())
# display(similarity_style_df)

Sparsity: 76.70%


### Grade

#### Fuzzy one-hot grade vector

In [None]:
max_grade = boulders_df.grade.max()
grade_df = pd.get_dummies(boulders_df.grade, dtype=np.float32)
grade_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def grade_update(row, max_grade):
    grade_index = row.idxmax()

    if grade_index == 0:
        return row

    values = np.array([0.5, 0.5], dtype=np.float32)
    offsets = np.array([-1, 1])

    for offset, value in zip(offsets, values):
        current_column = grade_index + offset
        if 0 < current_column <= max_grade:
            row[current_column] = value
    return row

grade_df = grade_df.apply(lambda row: grade_update(row, max_grade=max_grade), axis=1)
grade_df.fillna(0, inplace=True)
display(grade_df)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,1.0,0.5,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Conversion to sparse matrix

In [None]:
grade = coo_matrix(grade_df)

#### Grade similarity training

In [None]:
from time import perf_counter
from sklearn.metrics.pairwise import cosine_similarity

# Cosine training
start = perf_counter()
similarity_grade = cosine_similarity(grade)
end = perf_counter()
print(f"Cosine calculation time: {end - start:.4f}")


# Re-indexing to match database
start = perf_counter()
coo = coo_matrix(similarity_grade)
end = perf_counter()
print(f"COO conversion time: {end - start:.4f}")

new_shape = (coo.shape[0] + 1, coo.shape[1] + 1)

similarity_grade = coo_matrix(
    (
        coo.data,
        (coo.row + 1, coo.col + 1),
    ),
    shape=new_shape,
)

# Sparcity
sparsity = 1 - similarity_grade.nnz / (
    similarity_grade.shape[0] * similarity_grade.shape[1]
)
print(f"Sparcity: {sparsity:.2f}")

# similarity_grade_df = pd.DataFrame(similarity_grade.toarray())
# display(similarity_grade_df)

Cosine calculation time: 10.6748
COO conversion time: 46.0865
Sparcity: 0.80


## Coherence check

In [None]:
print(type(similarity_ascents))
print(type(similarity_style))
print(type(similarity_grade))

print(similarity_ascents.shape)
print(similarity_style.shape)
print(similarity_grade.shape)

print(similarity_ascents.nnz)
print(similarity_style.nnz)
print(similarity_grade.nnz)

print(similarity_ascents.dtype)
print(similarity_style.dtype)
print(similarity_grade.dtype)

<class 'scipy.sparse._coo.coo_matrix'>
<class 'scipy.sparse._coo.coo_matrix'>
<class 'scipy.sparse._coo.coo_matrix'>
(40236, 40236)
(40236, 40236)
(40236, 40236)
151977473
377238439
331867373
float32
float32
float32


## Data cleaning
Removing all values in similarity_grade < 0.5 (grade difference >= 2)  
Removing of all non zero values in similarity_ascents and similarity_style where similarity_grade == 0

In [None]:
similarity_grade_cleaned = similarity_grade.copy()
similarity_grade_cleaned.data[similarity_grade_cleaned.data < 0.5] = 0
similarity_grade_cleaned.eliminate_zeros()

In [None]:
similarity_grade_cleaned = similarity_grade_cleaned.tocsr()

In [None]:
from scipy.sparse import csr_matrix

def matrix_cleaning(cleaning_matrix: csr_matrix, matrix_to_clean: coo_matrix):
    """Remove all values in matrix_to_clean that are not indexed in cleaning_matrix
    
    :parameters:
    cleaning_matrix: CSR matrix that serves as reference for data existence
    matrix_to_clean: COO matrix from which some indexes are removed"""
    
    # Boolean mask creation (1D array) - Fancy indexing converted to boolean
    # Check if cleaning_matrix contains the indexes of matrix_to_clean
    mask = cleaning_matrix[matrix_to_clean.row, matrix_to_clean.col].A1 != 0

    # Fancy indexing to remove values from matrix_to_clean that are equal to 0 
    # in cleaning_matrix
    new_rows = matrix_to_clean.row[mask]
    new_cols = matrix_to_clean.col[mask]
    new_data = matrix_to_clean.data[mask]

    return csr_matrix(
        (new_data, (new_rows, new_cols)), shape=matrix_to_clean.shape
    )

similarity_ascents_cleaned = matrix_cleaning(similarity_grade_cleaned, similarity_ascents)
similarity_style_cleaned = matrix_cleaning(similarity_grade_cleaned, similarity_style)

In [None]:
print(
    1
    - similarity_ascents.nnz
    / (similarity_ascents.shape[0] * similarity_ascents.shape[1])
)
print(
    1
    - similarity_ascents_cleaned.nnz
    / (
        similarity_ascents_cleaned.shape[0]
        * similarity_ascents_cleaned.shape[1]
    )
)
print(
    1
    - similarity_style.nnz
    / (similarity_style.shape[0] * similarity_style.shape[1])
)
print(
    1
    - similarity_style_cleaned.nnz
    / (similarity_style_cleaned.shape[0] * similarity_style_cleaned.shape[1])
)
print(
    1
    - similarity_grade.nnz
    / (similarity_grade.shape[0] * similarity_grade.shape[1])
)
print(
    1
    - similarity_grade_cleaned.nnz
    / (similarity_grade_cleaned.shape[0] * similarity_grade_cleaned.shape[1])
)

0.906125071319695
0.9780836983904517
0.7669836795049579
0.9606686373292495
0.7950089223309089
0.8766977332742684


## Matrix saving

In [None]:
from scipy.sparse import save_npz

save_npz("similarity_ascent.npz", similarity_ascents_cleaned)
save_npz("similarity_style.npz", similarity_style_cleaned)
save_npz("similarity_grade.npz", similarity_grade_cleaned)

## Recommendation example


In [None]:
def recommend_boulders(
    input_boulders, top_n=5, alpha=0.5, beta=0.25, gamma=0.25
):

    ascents = similarity_ascents_cleaned[:, input_boulders].sum(axis=1).A1
    style = similarity_style_cleaned[:, input_boulders].sum(axis=1).A1
    grade = similarity_grade_cleaned[:, input_boulders].sum(axis=1).A1

    ascents[input_boulders] = 0
    style[input_boulders] = 0
    grade[input_boulders] = 0

    sim_scores = alpha * ascents + beta * style + gamma * grade
    
    best_boulders = np.argsort(-sim_scores)[:top_n]
    
    return best_boulders.tolist()


recommendations = recommend_boulders([6735], top_n=10)
print(recommendations)

[23530, 3820, 25502, 28315, 3639, 26303, 14233, 11651, 13458, 17857]
