In [2]:

# data manipulation
import pandas as pd
import numpy as np

# data viz
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.patheffects as path_effects
import seaborn as sns

import re,string

# plotting defaults: ggplot theme and a 12x6 default figure size
plt.style.use("ggplot")
rcParams.update({'figure.figsize': (12, 6)})

# silence every warning so cell output stays uncluttered
import warnings
warnings.filterwarnings("ignore")

# render all columns when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)

In [4]:
# load the primary dataset, using the file's "id" column as the initial index
data = pd.read_csv("../data/Supply_Chain_Shipment_Pricing_Dataset_20240302.csv", index_col="id")

# normalise header names: trim, drop "/" and parentheses, spaces -> "_",
# "#" -> "num", collapse one level of "__", then lowercase
data.columns = (
    data.columns.str.strip()
    .str.replace("/", "", regex=False)
    .str.replace(" ", "_", regex=False)
    .str.replace("#", "num", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("__", "_", regex=False)
    .str.lower()
)

# the old "id" index was invalid: discard it for a fresh positional index,
# then drop rows that are exact duplicates of an earlier row
data = data.reset_index(drop=True).drop_duplicates()

# names of the columns stored as signed-integer or float dtypes
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics_cols = list(data.select_dtypes(include=numerics).columns)

# placeholder strings in the PO / PQ date columns become missing values
data['po_sent_to_vendor_date'] = data['po_sent_to_vendor_date'].replace(
    ['N/A - From RDC', 'Date Not Captured'], np.nan
)
data['pq_first_sent_to_client_date'] = data['pq_first_sent_to_client_date'].replace(
    ['Pre-PQ Process', 'Date Not Captured'], np.nan
)


In [5]:
# independent deep copy of the loaded frame; derived columns are added to this copy
data_cleaned = data.copy(deep=True)

In [6]:
#convert to lowercase, strip and remove punctuations

# patterns are compiled once here instead of on every call
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise a free-text label for vectorisation.

    Steps, in order: lowercase, remove ``<...>`` tags, replace every ASCII
    punctuation character with a space, delete any remaining character that
    is neither a word character nor whitespace, collapse whitespace runs to a
    single space, and trim both ends.

    Parameters
    ----------
    text : str or None or float
        Raw label. ``None`` and NaN are treated as missing; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned label, or ``''`` for missing input.
    """
    # missing cells would otherwise fail on .lower(); NaN is the only float != itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = _HTML_TAG_RE.sub('', text)
    # punctuation becomes a space so "a/b" splits into two tokens
    text = _PUNCT_RE.sub(' ', text)
    # symbols outside string.punctuation (e.g. non-ASCII marks) are dropped outright
    text = _NON_WORD_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    # trim last: the substitutions above can leave a space at either end
    return text.strip()

# cleaned version of each molecule/test-type label, stored alongside the raw column
data_cleaned['moleculetest_type_processed'] = data_cleaned['moleculetest_type'].map(preprocess)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans, KMeans


# clustering settings
N_CLUSTERS = 10     # number of groups to form
RANDOM_STATE = 42   # fixed seed: k-means++ initialisation is random, so without it
                    # the label assigned to each row changes between runs

# TF-IDF over 1- to 3-word n-grams of the cleaned labels, English stop words removed
vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))
features = vec.fit_transform(data_cleaned['moleculetest_type_processed'])

clust = KMeans(init='k-means++', n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE)

# fit once and take the training assignments directly (no second predict pass);
# `yhat` is kept because cells outside this view may reference it
yhat = clust.fit_predict(features)
data_cleaned['cluster_labels'] = clust.labels_

# for each cluster, in order of first appearance, show the distinct raw labels it contains
for c in data_cleaned['cluster_labels'].unique():
    unique_mol_types = data_cleaned.loc[
        data_cleaned['cluster_labels'] == c,
        ['moleculetest_type', 'cluster_labels'],
    ].drop_duplicates()
    display(unique_mol_types)

Unnamed: 0,moleculetest_type,cluster_labels
0,"HIV, Reveal G3 Rapid HIV-1 Antibody Test",4
2,"HIV 1/2, Determine Complete HIV Kit",4
11,"HIV 1/2, Uni-Gold HIV Kit",4
12,"HIV 1/2, InstantChek HIV 1+2 Kit",4
15,Didanosine,4
...,...,...
4838,"Malaria, Antigen P.f., HRP2 CareStart Kit",4
5135,"HIV 1/2, HEXAGON Rapid HIV Kit",4
6093,Quinine,4
6207,"Malaria Antigen P.f , HRP2, Kit",4


Unnamed: 0,moleculetest_type,cluster_labels
1,Nevirapine,5


Unnamed: 0,moleculetest_type,cluster_labels
3,Lamivudine,9


Unnamed: 0,moleculetest_type,cluster_labels
4,Stavudine,6
89,Lamivudine/Nevirapine/Stavudine,6
269,Lamivudine/Stavudine,6


Unnamed: 0,moleculetest_type,cluster_labels
5,Zidovudine,3
13,Lamivudine/Zidovudine,3
401,Abacavir/Lamivudine/Zidovudine,3
786,Lamivudine/Zidovudine+Nevirapine,3
4613,Lamivudine/Zidovudine+Efavirenz,3


Unnamed: 0,moleculetest_type,cluster_labels
6,Efavirenz,0


Unnamed: 0,moleculetest_type,cluster_labels
9,Lopinavir/Ritonavir,7
113,Ritonavir,7


Unnamed: 0,moleculetest_type,cluster_labels
33,Tenofovir Disoproxil Fumarate,2
73,Efavirenz/Emtricitabine/Tenofovir Disoproxil F...,2
115,Emtricitabine/Tenofovir Disoproxil Fumarate,2
1586,Lamivudine/Tenofovir Disoproxil Fumarate,2
2695,Efavirenz/Lamivudine/Tenofovir Disoproxil Fuma...,2


Unnamed: 0,moleculetest_type,cluster_labels
62,Abacavir,8
2870,Abacavir/Lamivudine,8


Unnamed: 0,moleculetest_type,cluster_labels
123,Lamivudine/Nevirapine/Zidovudine,1
