# SYSTÈME DE RECOMMANDATION DE MÉDICAMENTS PAR CLUSTERING
*Réalisé par Théo EWBANK, Corentin CUI, Khadidiatou DIOKH, Destinée MOUELY.*

Étudiants ING3 DS - Année 2024-2025

# Imports

In [59]:
import re
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, silhouette_score

df_name = "/content/medical_data.csv"

In [4]:
# Load the CSV at `df_name` into `df_raw`, then print its first rows and its
# column names as a quick sanity check.
df_raw = pd.read_csv(df_name)
print(df_raw.head())
print(df_raw.columns)

          Name DateOfBirth  Gender             Symptoms               Causes  \
0     John Doe  15-05-1980    Male         Fever, Cough      Viral Infection   
1   Jane Smith  10-08-1992  Female    Headache, Fatigue               Stress   
2  Michael Lee  20-02-1975    Male  Shortness of breath            Pollution   
3   Emily Chen  03-11-1988  Female     Nausea, Vomiting       Food Poisoning   
4    Alex Wong  12-06-2001    Male          Sore Throat  Bacterial Infection   

           Disease           Medicine  
0      Common Cold    Ibuprofen, Rest  
1         Migraine        Sumatriptan  
2           Asthma  Albuterol Inhaler  
3  Gastroenteritis   Oral Rehydration  
4     Strep Throat         Penicillin  
Index(['Name', 'DateOfBirth', 'Gender', 'Symptoms', 'Causes', 'Disease',
       'Medicine'],
      dtype='object')


In [58]:
# Essential helper functions
def train_binarizers(X, mlbs):
    """Fit the first three encoders of ``mlbs`` on the feature columns of ``X``.

    ``mlbs[0]`` is fitted on ``X['Symptoms']``, ``mlbs[1]`` on ``X['Disease']``
    and ``mlbs[2]`` on ``X['Causes']``. The encoders are fitted in place and
    the same ``mlbs`` container is returned.
    """
    feature_columns = ('Symptoms', 'Disease', 'Causes')
    for position, column in enumerate(feature_columns):
        mlbs[position].fit(X[column])
    return mlbs

def encode_features(X, mlbs):
    """Encode the three feature columns of ``X`` and join them side by side.

    ``X['Symptoms']``, ``X['Disease']`` and ``X['Causes']`` are transformed by
    ``mlbs[0]``, ``mlbs[1]`` and ``mlbs[2]`` respectively; the three results
    are stacked horizontally, in that order, and returned.
    """
    encoded_blocks = [
        mlbs[0].transform(X['Symptoms']),
        mlbs[1].transform(X['Disease']),
        mlbs[2].transform(X['Causes']),
    ]
    return np.hstack(encoded_blocks)

# Data Cleaning & Feature creation

In [5]:
# Missing-value map: 1 where a cell of the raw data is NaN, 0 otherwise.
data_for_colors = df_raw.isna().astype(int)

# Two-stop colour scale: 0 (valid cell) -> purple, 1 (NaN cell) -> yellow.
colorscale = [[0, 'purple'], [1, 'yellow']]

# One heatmap cell per (row, column) of the raw data.
missing_values_trace = go.Heatmap(
    z=data_for_colors.values,
    x=df_raw.columns,
    y=df_raw.index,
    colorscale=colorscale,
    showscale=False,  # hide the colour bar
)
fig = go.Figure(data=missing_values_trace)
fig.show()

In [6]:
# Work on a copy of the raw data, without the 'Name' column.
df = df_raw.copy()
df.drop(columns=['Name'], inplace=True)
# Drop the rows that have fewer than 4 non-missing values.
df.dropna(thresh=4, inplace=True)

######## 1. AGE
# Dates are day-first; unparseable dates become NaT.
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce', dayfirst=True)
raw_age = 2025 - df['DateOfBirth'].dt.year
mean_age = raw_age.mean()  # missing ages are filled with the mean age
df['Age'] = raw_age.fillna(mean_age).astype(int)

# 0/1 age-group indicator columns.
ages = df['Age']
df['is_young'] = (ages < 25).astype(int)
df['is_old'] = (ages > 45).astype(int)
df['is_middleaged'] = ages.between(25, 45).astype(int)

######## 2. GENDER
# Standardise the 'Gender' column: fix the truncated spelling.
df['Gender'] = df['Gender'].replace('Femal', 'Female')

######## 3. SYMPTOMS, MEDICINE, CAUSES, DISEASE
re1 = r"shortness(\s*of\s*brea(th|t)?)?"
re2 = r'[a-z_-]*covid(-19)?_exposure[a-z_-]*'


def process_row(row):
    """Split a comma-separated cell into a list of normalised tokens.

    The text is lower-cased and split on commas. Each token then has its
    spaces replaced by underscores, stray prefixes/suffixes removed, and a few
    known misspellings or truncations mapped to one canonical spelling.

    Args:
        row: the raw cell text (a string).

    Returns:
        The list of cleaned tokens, in their original order.
    """
    cleaned = []
    for val in row.lower().split(','):
        val = re.sub(re1, 'shortness_of_breath', val)
        val = val.replace(' ', '_')
        val = re.sub(r'^e_', '', val)
        val = val.strip('_')
        # Remove a stray trailing "_o" token (or a bare "o"). This must not be
        # `rstrip('_o')`: that strips a *set of characters* and truncates any
        # word ending in "o" ("vertigo" -> "vertig").
        val = re.sub(r'(?:(?:^|_+)o)+$', '', val)
        val = val.replace('on___', '')
        val = re.sub(re2, 'covid_exposure', val)
        # Try the longer alternative first: with "(i|is)" the regex stops after
        # "i" and a correctly spelled value ends up as "rheumatoid_arthritiss".
        val = re.sub(r'rheumatoid_arthrit(is|i)?', 'rheumatoid_arthritis', val)
        val = val.replace('chronic_fatiguesyndrome', 'chronic_fatigue_syndrome')
        cleaned.append(val)
    return cleaned

# Turn each comma-separated text cell into a list of normalised tokens.
for text_column in ('Symptoms', 'Medicine', 'Causes'):
    df[text_column] = df[text_column].apply(process_row)
# Wrap each disease in a one-element list, like the other list columns.
df['Disease'] = df['Disease'].apply(lambda disease: [disease])

def set_uncommon_to_other(series, min_count=6):
    """Replace rare tokens of a list-valued series with ``'other'``.

    Args:
        series: a pandas Series whose cells are lists of tokens.
        min_count: tokens that occur fewer than this many times over the whole
            series are replaced. Defaults to 6.

    Returns:
        A new Series of lists, with every rare token replaced by ``'other'``.
    """
    value_counts = series.explode().value_counts()
    # A set gives constant-time membership tests in the per-token check below.
    uncommon_values = set(value_counts[value_counts < min_count].index)
    return series.apply(
        lambda values: ['other' if val in uncommon_values else val for val in values]
    )

def set_rare_diseases_to_other(series):
    """Replace every value occurring fewer than 4 times in ``series`` with ``'other'``.

    ``series`` holds one scalar label per row; a new Series is returned.
    """
    counts = series.value_counts()
    rare_labels = counts[counts < 4].index.tolist()

    def relabel(label):
        if label in rare_labels:
            return 'other'
        return label

    return series.apply(relabel)

# Relabel the rare tokens of each list column, one column at a time.
list_columns = ['Symptoms', 'Causes', 'Medicine', 'Disease']
df[list_columns] = df[list_columns].apply(set_uncommon_to_other, axis=0)
# Disabled alternative for the disease column:
#df[['Disease']] = df[['Disease']].apply(lambda x: set_rare_diseases_to_other(x), axis=0)

# EDA

In [None]:
# Frequency analysis
# 'Disease' holds one-element lists, which are unhashable: flatten it to one
# label per row before counting, cross-tabulating or grouping on it.
disease_labels = df['Disease'].explode()

top_diseases = disease_labels.loc[lambda x: x != 'other'].value_counts().head(10)
top_symptoms = df['Symptoms'].explode().loc[lambda x: x != 'other'].value_counts().head(10)
top_medicines = df['Medicine'].explode().loc[lambda x: x != 'other'].value_counts().head(10)
gender_distribution = df['Gender'].value_counts().head(10)

# Crosstabs
gender_disease = pd.crosstab(df['Gender'], disease_labels)
# The five most frequent diseases: the same selection drives both the heatmap
# cells and its x-axis labels, so the two cannot get out of step.
top5_diseases = gender_disease.sum().sort_values(ascending=False).head(5).index
gender_top_disease = gender_disease.loc[:, top5_diseases]

age_disease_table = df[['is_young', 'is_middleaged', 'is_old']].groupby(disease_labels).sum()
age_disease_table.columns = ['Young', 'Middle-aged', 'Old']

# Create a subplot figure
fig = make_subplots(
    rows=3, cols=2,
    subplot_titles=(
        "Top 10 maladies les plus fréquentes",
        "Top 10 symptômes les plus fréquents",
        "Top 10 médicaments les plus prescrits",
        "Répartition par sexe",
        "Sexe vs Maladies principales",
        "Groupe d’âge vs Maladies principales"
    ),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "pie"}],
           [{"type": "heatmap"}, {"type": "heatmap"}]]
)

# 1-3. Top-10 horizontal bar charts
fig.add_trace(go.Bar(x=top_diseases.values, y=top_diseases.index, orientation='h', marker=dict(color='blue')), row=1, col=1)
fig.add_trace(go.Bar(x=top_symptoms.values, y=top_symptoms.index, orientation='h', marker=dict(color='red')), row=1, col=2)
fig.add_trace(go.Bar(x=top_medicines.values, y=top_medicines.index, orientation='h', marker=dict(color='green')), row=2, col=1)

# 4. Gender distribution
fig.add_trace(go.Pie(text=gender_distribution.index, values=gender_distribution.values, hole=0.4), row=2, col=2)

# 5. Gender vs the five most frequent diseases
fig.add_trace(go.Heatmap(
        z=gender_top_disease.values,
        x=top5_diseases,
        y=gender_top_disease.index,
        colorscale=px.colors.sequential.Plasma,
        showscale=False, text=gender_top_disease.values,
        texttemplate="%{text}", hovertemplate=None), row=3, col=1)

# 6. Age group vs diseases
fig.add_trace(go.Heatmap(
        z=age_disease_table.values,
        x=age_disease_table.columns,
        y=age_disease_table.index,
        colorscale=px.colors.sequential.Plasma,
        showscale=False, text=age_disease_table.values,
        texttemplate="%{text}", hovertemplate=None), row=3, col=2)

# Update layout
fig.update_layout(
    height=900, width=1200,
    title_text="Visualisation des données",
    showlegend=False
)

fig.show()

# Kmeans

In [41]:
# K-means helper functions
def train_kmeans(X, nb_clusters, mlbs):
    """Fit the encoders and a K-means model on ``X``.

    The encoders in ``mlbs`` are fitted on ``X``, ``X`` is encoded with them,
    and a ``KMeans`` with ``nb_clusters`` clusters (``random_state=113``) is
    fitted on the encoded matrix.

    Returns:
        ``(kmeans, mlbs, clusters)``: the fitted model, the fitted encoders and
        the cluster label of every row of ``X``.
    """
    fitted_mlbs = train_binarizers(X, mlbs)
    features = encode_features(X, fitted_mlbs)
    model = KMeans(n_clusters=nb_clusters, random_state=113)
    model.fit(features)
    return model, fitted_mlbs, model.labels_

def infer_kmeans(X, kmeans, mlbs):
    """Return the cluster predicted by ``kmeans`` for every row of ``X``.

    ``X`` is encoded with the already-fitted encoders in ``mlbs`` before being
    passed to ``kmeans.predict``.
    """
    return kmeans.predict(encode_features(X, mlbs))

def score_kmeans(nb_clusters, X_train, X_test, mlbs):
    """Train K-means with ``nb_clusters`` clusters and score the clustering.

    The model is trained on ``X_train`` and then used to assign ``X_test`` to
    clusters.

    Returns:
        ``[train_score, test_score]``: the silhouette score of each split,
        computed on its encoded features and its cluster labels.
    """
    kmeans, mlbs, train_labels = train_kmeans(X_train, nb_clusters, mlbs)
    test_labels = infer_kmeans(X_test, kmeans, mlbs)
    return [
        silhouette_score(encode_features(X_train, mlbs), train_labels),
        silhouette_score(encode_features(X_test, mlbs), test_labels),
    ]

In [43]:
# Initialisation and search for the best number of clusters
X_train, X_test = train_test_split(df[['Symptoms', 'Disease', 'Causes', 'Medicine']],
                                   test_size=0.2, random_state=113)
mlb_symptoms = MultiLabelBinarizer()
mlb_disease = MultiLabelBinarizer()
mlb_causes = MultiLabelBinarizer()
# Order matters: train_binarizers/encode_features use index 0 for Symptoms,
# index 1 for Disease and index 2 for Causes, so each named binarizer must sit
# at the position of the column it is named after.
mlbs = [mlb_symptoms, mlb_disease, mlb_causes]

# Silhouette scores [train, test] for every candidate number of clusters.
scores = {nb_clusters: score_kmeans(nb_clusters, X_train, X_test, mlbs)
          for nb_clusters in range(2, 20)}
print(scores)

# Final model, with 15 clusters.
kmeans, mlbs, clusters_train = train_kmeans(X_train, 15, mlbs)
clusters_test = infer_kmeans(X_test, kmeans, mlbs)

{2: [np.float64(0.11877327036291335), np.float64(0.1165030655380192)], 3: [np.float64(0.14657375103070755), np.float64(0.160328507989064)], 4: [np.float64(0.19573892881869356), np.float64(0.19186502914932874)], 5: [np.float64(0.23323481015007919), np.float64(0.2075622689727798)], 6: [np.float64(0.270740549803389), np.float64(0.2640492249443905)], 7: [np.float64(0.28232179922119954), np.float64(0.17426710194510558)], 8: [np.float64(0.32007310326681315), np.float64(0.20565804037161187)], 9: [np.float64(0.36190789999806433), np.float64(0.2207109042470533)], 10: [np.float64(0.4036817721051415), np.float64(0.21895381944645798)], 11: [np.float64(0.42341065600756905), np.float64(0.29942581906326626)], 12: [np.float64(0.4568453888581027), np.float64(0.3464054876941432)], 13: [np.float64(0.4969831257867126), np.float64(0.40352624451705055)], 14: [np.float64(0.5334882459680306), np.float64(0.47745657834543326)], 15: [np.float64(0.5656284394953541), np.float64(0.4826981190967035)], 16: [np.float6

In [33]:
# Extract the most frequent medicine of each cluster
X_train['Cluster'] = clusters_train

cluster_top_medecine = {}

# Iterate over the fitted model's own cluster count rather than a hard-coded
# 15, so the mapping stays complete if the number of clusters changes.
for cluster in range(kmeans.n_clusters):
    cluster_df = X_train[X_train['Cluster'] == cluster]
    nb_patients = len(cluster_df)
    # value_counts sorts by decreasing frequency: the first entry is the top one.
    medicine_counts = cluster_df['Medicine'].explode().value_counts()
    if medicine_counts.empty:
        # A cluster with no medicine has nothing to recommend; skip it instead
        # of failing on the empty index.
        continue
    top_medicine = medicine_counts.index[0]
    cluster_top_medecine[cluster] = top_medicine
    print(f"Cluster {cluster}: {top_medicine} - {nb_patients} patients")

Cluster 0: nsaids - 13 patients
Cluster 1: other - 18 patients
Cluster 2: dimenhydrinate - 11 patients
Cluster 3: oral_rehydration - 18 patients
Cluster 4: other - 38 patients
Cluster 5: antibiotics - 10 patients
Cluster 6: rest - 13 patients
Cluster 7: antihistamine - 10 patients
Cluster 8: rest - 8 patients
Cluster 9: pain_relievers - 8 patients
Cluster 10: rest - 5 patients
Cluster 11: therapy - 12 patients
Cluster 12: rest - 14 patients
Cluster 13: eye_exercises - 7 patients
Cluster 14: antacids - 7 patients


In [34]:
# Evaluation on the test set
# Attach each test row's predicted cluster, then look up that cluster's entry
# in `cluster_top_medecine` (clusters missing from the mapping give NaN).
X_test['Cluster'] = clusters_test
X_test['Cluster_medicine'] = X_test['Cluster'].map(cluster_top_medecine)

def calculate_precision_recall(row):
    """Return ``'Yes'`` if the cluster's medicine is one of the row's medicines.

    Despite its name, this does not compute precision or recall: it returns a
    per-row hit flag.

    Args:
        row: a mapping with a ``'Medicine'`` entry (sequence of medicine names)
            and a ``'Cluster_medicine'`` entry (the recommended medicine).

    Returns:
        ``'Yes'`` when ``row['Cluster_medicine']`` equals at least one element
        of ``row['Medicine']``, otherwise ``'No'``.
    """
    recommended = row['Cluster_medicine']
    is_hit = any(medicine == recommended for medicine in row['Medicine'])
    return 'Yes' if is_hit else 'No'

# Flag each test row 'Yes'/'No' depending on whether its cluster's medicine is
# among the medicines of that row, then count both outcomes.
X_test['Predicted_Medicine_Is_Correct'] = X_test.apply(calculate_precision_recall, axis=1)
X_test['Predicted_Medicine_Is_Correct'].value_counts()

Unnamed: 0_level_0,count
Predicted_Medicine_Is_Correct,Unnamed: 1_level_1
Yes,42
No,7


In [37]:
# Find the cluster of a new patient (the basis for recommending a medicine)
new_patient_symptoms = ['headache', 'fever', 'cough']
new_patient_disease = ['COVID-19']
new_patient_causes = ['infection']

# One-row frame whose cells are the three lists above.
new_X = pd.DataFrame([{
    'Symptoms': new_patient_symptoms,
    'Disease': new_patient_disease,
    'Causes': new_patient_causes,
}])

cluster_new = infer_kmeans(new_X, kmeans, mlbs)
print("\nLe nouveau patient est le plus semblable au cluster", cluster_new[0])


Le nouveau patient est le plus semblable au cluster 5


In [39]:
# Display the details of one particular cluster
def show_cluster_features(df, cluster_id, top_n=5):
    """Print a summary of the rows of ``df`` that belong to ``cluster_id``.

    The summary lists the ``top_n`` most frequent symptoms and diseases of the
    cluster (both columns hold lists and are exploded before counting) and the
    number of rows in the cluster. Nothing is returned.
    """
    members = df.loc[df['Cluster'] == cluster_id]
    symptom_counts = members['Symptoms'].explode().value_counts().head(top_n)
    disease_counts = members['Disease'].explode().value_counts().head(top_n)

    print(f" Cluster {cluster_id} Summary:")
    for heading, counts in (("\n Top Symptoms:", symptom_counts),
                            ("\n Top Diseases:", disease_counts)):
        print(heading)
        print(counts)
    print(f"\n Total Patients in Cluster: {len(members)}")

# Print the summary of cluster 1 of the training set.
show_cluster_features(X_train, cluster_id=1)

 Cluster 1 Summary:

 Top Symptoms:
Symptoms
other       18
fatigue      6
nausea       4
itching      4
headache     3
Name: count, dtype: int64

 Top Diseases:
Disease
other    18
Name: count, dtype: int64

 Total Patients in Cluster: 18


# KNN

In [68]:
def train_knn(X, y, nb_neig, encoders):
    """Fit the encoders and a cosine-distance KNN classifier.

    ``encoders[0:3]`` are fitted on the feature columns of ``X`` and
    ``encoders[3]`` on the labels ``y``; a ``KNeighborsClassifier`` with
    ``nb_neig`` neighbours is then fitted on the encoded features and labels.

    Returns:
        ``(knn, encoders)``: the fitted classifier and the fitted encoders.
    """
    encoders = train_binarizers(X, encoders)
    label_encoder = encoders[3]
    label_encoder.fit(y)
    features = encode_features(X, encoders)
    targets = label_encoder.transform(y)
    knn = KNeighborsClassifier(n_neighbors=nb_neig, metric='cosine')
    knn.fit(features, targets)
    return knn, encoders

def infer_knn(X, y, knn, encoders):
    """Predict the encoded medicine label of every row of ``X``.

    Args:
        X: frame with the feature columns read by ``encode_features``.
        y: unused; kept so existing callers keep working. The true labels are
            not needed to predict, and encoding them here made inference fail
            whenever ``y`` held a label the label encoder had not seen.
        knn: the fitted classifier.
        encoders: the fitted encoders; only the feature encoders are used.

    Returns:
        The labels predicted by ``knn.predict``, still in encoded form.
    """
    X_transformed = encode_features(X, encoders)
    return knn.predict(X_transformed)

def score_knn(nb_neig, data, encoders):
    """Train a KNN with ``nb_neig`` neighbours and score it on both splits.

    Args:
        nb_neig: number of neighbours.
        data: indexable holding the train features at 0, the train labels at 1,
            the test features at 2 and the test labels at 3.
        encoders: the three feature encoders followed by the label encoder.

    Returns:
        ``[score_train, score_test]`` as returned by ``knn.score``.
    """
    knn, encoders = train_knn(data[0], data[1], nb_neig, encoders)
    label_encoder = encoders[3]
    # knn.score predicts internally, so no separate prediction pass is needed.
    score_train = knn.score(encode_features(data[0], encoders),
                            label_encoder.transform(data[1]))
    score_test = knn.score(encode_features(data[2], encoders),
                           label_encoder.transform(data[3]))
    return [score_train, score_test]

In [82]:
raw_X = df[['Symptoms', 'Disease', 'Causes', 'Medicine']]
# The target is the first medicine listed for each patient.
raw_y = df['Medicine'].apply(lambda x: x[0])

# `data` layout expected by score_knn: 0 = train features, 1 = train labels,
# 2 = test features, 3 = test labels.
knn_X_train, knn_X_test, knn_y_train, knn_y_test = train_test_split(
    raw_X, raw_y, test_size=0.2, random_state=113)
data = {0: knn_X_train, 1: knn_y_train, 2: knn_X_test, 3: knn_y_test}

mlb_symptoms = MultiLabelBinarizer()
mlb_disease = MultiLabelBinarizer()
mlb_causes = MultiLabelBinarizer()
y_le = LabelEncoder()
# Order matters: train_binarizers/encode_features use index 0 for Symptoms,
# index 1 for Disease and index 2 for Causes; train_knn uses index 3 for the
# label encoder.
encoders = [mlb_symptoms, mlb_disease, mlb_causes, y_le]

# Accuracy [train, test] for every candidate number of neighbours.
scores = {nb_neig: score_knn(nb_neig, data, encoders) for nb_neig in range(2, 5)}
print(scores)

# Final model with 2 neighbours; decode the predicted ids back to medicine names.
knn, encoders = train_knn(data[0], data[1], 2, encoders)
predicted_ids = infer_knn(data[2], data[3], knn, encoders)
predicted_medicines = encoders[3].inverse_transform(predicted_ids)

{2: [0.9635416666666666, 0.9183673469387755], 3: [0.9583333333333334, 0.9183673469387755], 4: [0.9479166666666666, 0.9183673469387755]}


In [83]:
# Display the decoded medicine names predicted for the test split.
predicted_medicines

array(['nsaids', 'other', 'other', 'other', 'therapy', 'isolation',
       'rest', 'antacids', 'fluids', 'other', 'rest', 'other',
       'antibiotics', 'fluids', 'isolation', 'nsaids', 'nsaids', 'other',
       'oral_rehydration', 'rest', 'oral_rehydration', 'rest', 'other',
       'rest', 'rest', 'rest', 'other', 'rest', 'antibiotics',
       'isolation', 'antihistamine', 'pain_relievers', 'pain_relievers',
       'antihistamine', 'pain_relievers', 'rest', 'nsaids', 'fluids',
       'therapy', 'rest', 'other', 'other', 'fluids', 'antibiotics',
       'rest', 'isolation', 'other', 'other', 'other'], dtype=object)