In [1]:
import os
import re

import pandas as pd

In [2]:
# Relative location of the raw clinical grading workbook. Note that despite
# its name, `data_dir` points at the .xlsx file itself, not at a directory.
path_parts = ("..", "..", "raw-data", "clinical", "grading.xlsx")
data_dir = os.path.join(*path_parts)

# Sanity check: displays True when the workbook exists at that path.
os.path.isfile(data_dir)

True

In [3]:
# List the worksheets available in the workbook. Using ExcelFile as a context
# manager closes the underlying file handle, which a bare
# `pd.ExcelFile(data_dir).sheet_names` expression would leave open.
with pd.ExcelFile(data_dir) as workbook:
    sheet_names = workbook.sheet_names

sheet_names

['Sheet1', 'Sheet2']

In [4]:
def filter_ids(row):
    """Build the identifier for one row of the grading sheet.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) exposing the
            "S/N" and "Unnamed: 1" entries.

    Returns:
        ``row["S/N"]`` with the "Unnamed: 1" value appended when that value
        is a string; otherwise ``row["S/N"]`` unchanged.
    """
    serial = row["S/N"]
    suffix = row["Unnamed: 1"]

    # A non-string suffix (e.g. NaN from an empty cell) means there is
    # nothing to append.
    if not isinstance(suffix, str):
        return serial

    return serial + suffix

In [5]:
# Load the first worksheet of the grading workbook.
df = pd.read_excel(data_dir, sheet_name="Sheet1")

# "S/N" is blank on some rows, so carry the last seen value forward.
# Series.ffill() replaces the deprecated fillna(method="ffill") spelling,
# which emits a FutureWarning on recent pandas versions.
# NOTE(review): presumably blank rows are additional specimens of the same
# case (merged cells in the sheet) — confirm against the workbook.
df["S/N"] = df["S/N"].ffill()

# Normalise the serial number to an integer-formatted string ("16425", not
# "16425.0") so that filter_ids can append the string suffix to it.
df["S/N"] = df["S/N"].map(int).map(str)

# Combine the serial number with the optional suffix held in "Unnamed: 1".
df["id"] = df.apply(filter_ids, axis=1)

# The two source columns are now folded into "id".
df = df.drop(columns=["S/N", "Unnamed: 1"])

df.head()

  df["S/N"] = df["S/N"].fillna(method="ffill")


Unnamed: 0,Diagnosis,Microscopic Description,id
0,"\nDura/ brain, left frontal convexity tumour, ...",MICROSCOPIC DESCRIPTION_x000D__x000D_Section s...,16425
1,"Dura/ brain, olfactory groove tumour, excision...",MICROSCOPIC DESCRIPTION_x000D__x000D_Sections ...,16421
2,"\n(A, B) Dura/ leptomeninges, right orbital an...","MICROSCOPIC DESCRIPTION_x000D__x000D_(A, B) Se...",16223
3,"(A-B) Meninges/ brain, left frontotemporal tum...",MICROSCOPIC DESCRIPTION_x000D__x000D_(A-B) Per...,16089
4,"_x000D_\n(A) Anterior skull base tumor, excisi...",MICROSCOPIC DESCRIPTION_x000D__x000D_(A) Secti...,16026


In [6]:
# Number of rows loaded from the sheet.
df.shape[0]

132

In [6]:
# Lookup that normalises every accepted grade spelling to a digit string:
# Roman numerals I-IV map to "1"-"4", and Arabic digits map to themselves.
roman_numerals = ("I", "II", "III", "IV")
mapping = {numeral: str(rank) for rank, numeral in enumerate(roman_numerals, start=1)}
mapping.update({digit: digit for digit in "1234"})

In [7]:
# Pull the tumour grade out of the free-text diagnosis, e.g. "grade II" or
# "Grade 2".
#
# Fixes over the previous pattern r"grade\s*(I{1,4}|1|2|3|4)":
#   * "IV" is listed explicitly (and first), so "grade IV" is no longer
#     captured as "I" and silently labelled grade 1; "IIII" is not accepted.
#   * The trailing lookahead rejects a numeral that runs straight into
#     another letter or digit (e.g. the "i" of "grade in ..."). A lookahead is
#     used instead of \b because the text contains "_x000D_" artefacts and
#     "_" counts as a word character.
pattern = r"grade\s*(IV|I{1,3}|[1-4])(?![A-Za-z0-9])"

# expand=False returns a Series (one capture group) rather than a DataFrame.
extracted = df["Diagnosis"].str.extract(pattern, flags=re.IGNORECASE, expand=False)

# The match is case-insensitive but `mapping` only has upper-case keys, so
# upper-case the capture before the lookup; unmatched rows stay NaN.
df["grade"] = extracted.str.upper().map(mapping)

df.head()

Unnamed: 0,Diagnosis,Microscopic Description,id,grade
0,"\nDura/ brain, left frontal convexity tumour, ...",MICROSCOPIC DESCRIPTION_x000D__x000D_Section s...,16425,1
1,"Dura/ brain, olfactory groove tumour, excision...",MICROSCOPIC DESCRIPTION_x000D__x000D_Sections ...,16421,1
2,"\n(A, B) Dura/ leptomeninges, right orbital an...","MICROSCOPIC DESCRIPTION_x000D__x000D_(A, B) Se...",16223,1
3,"(A-B) Meninges/ brain, left frontotemporal tum...",MICROSCOPIC DESCRIPTION_x000D__x000D_(A-B) Per...,16089,1
4,"_x000D_\n(A) Anterior skull base tumor, excisi...",MICROSCOPIC DESCRIPTION_x000D__x000D_(A) Secti...,16026,1


In [8]:
# Show the rows whose diagnosis text yielded no grade.
grade_missing = df["grade"].isna()
df.loc[grade_missing]

Unnamed: 0,Diagnosis,Microscopic Description,id,grade
21,"_x000D_\n(A) Brain, frontal lobe, tumour; exci...",MICROSCOPIC DESCRIPTION_x000D__x000D_(A) Secti...,14572B,
67,_x000D_\n(A and B) Right parietal tumour: Meni...,MICROSCOPIC DESCRIPTION_x000D__x000D_Sections ...,12230,
75,"\nCP angle brain tumor, biopsy: Favor meningio...",MICROSCOPIC DESCRIPTION_x000D__x000D_Sections ...,12050,
78,"_x000D_\nLeft parietal tumour, excision: _x000...",MICROSCOPIC DESCRIPTION_x000D__x000D_Sections ...,12010B,
87,"Brain, cavernous sinus mass, biopsy: - Mening...",,11785A,
88,"Cavernous sinus, excision biopsies : Meningioma",MICROSCOPIC DESCRIPTION_x000D__x000D_Sections ...,11785B,


In [9]:
# Keep only the label columns, discarding any row where either one is missing.
label_columns = ["id", "grade"]
cleaned = df.dropna(subset=label_columns).loc[:, label_columns].copy()

cleaned.head()

Unnamed: 0,id,grade
0,16425,1
1,16421,1
2,16223,1
3,16089,1
4,16026,1


In [10]:
# Class balance of the extracted grades before they are binarised.
cleaned.loc[:, "grade"].value_counts()

grade
1    79
2    43
3     4
Name: count, dtype: int64

In [11]:
# Collapse the grades into a binary label: "1" is kept as-is and every other
# value becomes "2+".
is_grade_one = cleaned["grade"] == "1"
cleaned["grade"] = cleaned["grade"].where(is_grade_one, "2+")

cleaned["grade"].value_counts()

grade
1     79
2+    47
Name: count, dtype: int64

In [12]:
# Write the id/grade labels to ../data/labels.csv, creating the folder first
# if it does not exist yet.
dest_dir = os.path.join("..", "data")
os.makedirs(dest_dir, exist_ok=True)

labels_path = os.path.join(dest_dir, "labels.csv")
cleaned.to_csv(labels_path, index=False)