In [8]:
"""
This file clusters the device and manufacturer columns within the desired table.

prereq:
pip install scikit-learn nltk matplotlib cleanco levenshtein name_matching seaborn
"""
import time
from get_data import get_dfs
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import re
import unicodedata
from cleanco import basename
import time
import Levenshtein
from name_matching.name_matcher import NameMatcher
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage

# Fetch the NLTK resources this notebook relies on: the stop-word corpus and the
# 'punkt_tab' tokenizer data. Already-installed packages are reported as up-to-date.
nltk.download('stopwords')
nltk.download('punkt_tab')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prpar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prpar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# Read the source tables through the project helper `get_dfs` (from get_data).
# It returns three DataFrames, unpacked here as the recall, device-event and device tables.
df_recall, df_device_event, df_device = get_dfs()

loading table: recall....


  df = pd.read_sql_query(f'select * from {table}', conn)


                                     id cfres_id k_number pma_number  \
0  400cd411-6ba1-4360-9fea-b55e13468d00   212115     None       None   
1  1d35718a-6bc8-4706-bd3d-31741cd98324   212318     None       None   
2  224a2ab8-97ef-4799-879e-6083898ab2e7   211119     None       None   
3  b264b469-7c7f-48d5-bea0-57f38377569a   208478     None       None   
4  0aad8582-1a51-4374-998c-50cc0f4b4466   207500     None       None   

  event_date_initiated event_date_created event_date_posted  \
0           2025-01-15               None        2025-02-05   
1           2025-01-28               None        2025-02-03   
2           2024-08-08               None        2024-12-03   
3           2024-03-28               None        2024-07-01   
4           2024-04-16               None        2024-05-30   

  event_date_terminated     recall_status                   recalling_firm  \
0                  None  Open, Classified        Philips North America Llc   
1                  None  Open, C

  df = pd.read_sql_query(f'select * from {table}', conn)


                               event_id adverse_event_flag  \
0  137fc417-abd2-4c92-af98-f816d9d1b01c                  N   
1  7d7e75df-69d0-4344-a609-d8294fcaf9e1                  N   
2  b112bfb0-4306-498a-8ab9-76076641f4f9                  N   
3  ea373448-a219-4494-9cb6-2b3fbb403c8f                  N   
4  52eeaa61-643d-4452-aaa6-0b831af004c5                  N   

  date_facility_aware date_manufacturer_received date_of_event date_received  \
0                None                 2024-01-22    2024-01-22    2024-02-12   
1                None                 2024-01-18    2024-01-01    2024-02-12   
2                None                 2023-12-22    2023-12-22    2024-01-19   
3                None                 2024-01-24    2024-01-14    2024-02-12   
4          2024-02-19                       None          None    2024-03-15   

  date_report date_report_to_fda date_report_to_manufacturer  \
0  2024-02-12               None                        None   
1  2024-03-13     

  df = pd.read_sql_query(f'select * from {table}', conn)


                               event_id  \
0  137fc417-abd2-4c92-af98-f816d9d1b01c   
1  137fc417-abd2-4c92-af98-f816d9d1b01c   
2  7d7e75df-69d0-4344-a609-d8294fcaf9e1   
3  7d7e75df-69d0-4344-a609-d8294fcaf9e1   
4  b112bfb0-4306-498a-8ab9-76076641f4f9   

                                          brand_name catalog_number  \
0  T:SLIM X2 INSULIN PUMP WITH INTEROPERABLE TECH...        1005011   
1  T:SLIM X2 INSULIN PUMP WITH INTEROPERABLE TECH...        1005011   
2                        CONSTELLATION VISION SYSTEM     8065751150   
3                        CONSTELLATION VISION SYSTEM     8065751150   
4                                       AESPIRE 7100           None   

  date_received date_removed_flag date_returned_to_manufacturer  \
0    2024-02-12              None                          None   
1    2024-02-12              None                          None   
2    2024-02-12              None                          None   
3    2024-02-12              None             

In [10]:
# Sanity check: (rows, columns) of each loaded table, one per line.
print(df_recall.shape, df_device_event.shape, df_device.shape, sep='\n')

(54520, 36)
(100000, 78)
(199943, 30)


## preprocess relevant columns

From df_recall, we want to pull recalling_firm.
From df_device, we want to pull manufacturer_d_name.

In [11]:
def preprocess(df, col):
    """
    Normalize the text in one column of ``df`` and return it as a new Series.

    Steps applied to every value:
    0. replace NaN/None with an empty string (other non-string values are cast to str)
    1. lower-case
    2. remove non-ASCII chars (accented letters are reduced to their base letter)
    3. remove punctuation (anything that is not a word character or whitespace)
    4. remove common legal business strings (like 'corp') via cleanco's ``basename``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is not modified.
    col : str
        Name of the column to clean.

    Returns
    -------
    pandas.Series
        The cleaned strings, sharing ``df``'s index and the name ``col``.
    """
    def clean(text):
        text = text.lower()  # step 1
        # step 2: NFKD splits an accented char into base letter + combining mark;
        # the ASCII encode/decode round-trip then drops everything non-ASCII.
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode()
        text = re.sub(r'[^\w\s]', '', text)  # step 3 (note: '_' counts as a word char and is kept)
        return basename(text)  # step 4

    # step 0: fill only the requested column instead of copying the whole frame,
    # and cast to str so a stray numeric value cannot break .lower().
    values = df[col].fillna('').astype(str)
    return values.apply(clean)
    
    

In [14]:
# Add cleaned copies of the firm/device name columns to the recall table;
# each new column carries a 'preproc_' prefix next to its raw source column.
df_recall['preproc_recalling_firm'] = preprocess(df_recall, 'recalling_firm')
df_recall['preproc_device_name'] = preprocess(df_recall, 'device_name')

# Same treatment for the manufacturer and generic device name in the device table.
df_device['preproc_manufacturer_d_name'] = preprocess(df_device, 'manufacturer_d_name')
df_device['preproc_generic_name'] = preprocess(df_device, 'generic_name')

In [None]:
# function to vectorize two columns within desired df
def vectorize_text(df, col1, col2, method='tfidf'):
    """
    Convert two text columns into one numerical document-term matrix.

    The two columns are joined row-wise as ``"<col1> | <col2>"`` and the joined
    text is stored on ``df`` as a new/overwritten ``'combined'`` column, which is
    then fed to the vectorizer.

    Possible methods:
    - 'tfidf' (default): tf-idf weights via ``TfidfVectorizer``
    - anything else: bag of words (raw counts) via ``CountVectorizer``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns; gains a ``'combined'`` column as a side effect.
    col1, col2 : str
        Names of the text columns to combine.
    method : str
        Vectorization scheme, see above.

    Returns
    -------
    (X, vectorizer)
        ``X`` is whatever ``fit_transform`` returns for the combined text (one row
        per row of ``df``); the fitted vectorizer is returned so it can be used
        later to assign cluster ids to dfs.
    """
    if method == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        # Imported locally: the notebook's import cell only brings in TfidfVectorizer,
        # so this branch used to fail with a NameError.
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer()

    # Fill missing values per column *before* joining. Filling afterwards discards
    # the field that was present, and a chained `fillna(..., inplace=True)` on
    # df['combined'] does not reliably write back to the frame.
    left = df[col1].fillna('').astype(str)
    right = df[col2].fillna('').astype(str)
    df['combined'] = left + ' | ' + right

    X = vectorizer.fit_transform(df['combined'].values)

    return X, vectorizer # return vectorizer so it can be used later to assign cluster id to dfs
        
        

## cluster using linkage (agglomerative clustering)
General procedure:
1. given a specified df and two of its columns (col1 and col2), perform vectorization and hierarchical clustering
2. create cluster map


In [None]:
# this function vectorizes two text columns and builds the hierarchical (Ward) linkage
# that show_cluster_map later uses to draw the cluster map
def cluster(df, col1, col2, max_rows=None, random_state=0):
    """
    Vectorize ``col1``/``col2`` of ``df`` and compute a Ward linkage over the rows.

    The sparse matrix from ``vectorize_text`` (called with its default method) is
    converted to a dense array before ``linkage`` is run. Both the dense array and
    the pairwise distances grow quickly with the number of rows (the distances
    quadratically), so running this on a full table with tens of thousands of rows
    can exhaust memory. Pass ``max_rows`` to cluster a random subset instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two text columns.
    col1, col2 : str
        Names of the columns to combine and vectorize.
    max_rows : int or None
        If given and ``df`` has more rows than this, a random sample of
        ``max_rows`` rows is clustered. ``None`` (default) uses every row.
    random_state : int
        Seed for the row sample, so repeated runs pick the same rows.

    Returns
    -------
    (X, link)
        ``X`` is the matrix returned by ``vectorize_text`` for the rows that were
        clustered; ``link`` is the linkage matrix computed from its dense form.
    """
    if max_rows is not None and len(df) > max_rows:
        # copy so the 'combined' column vectorize_text adds lands on an independent frame
        df = df.sample(n=max_rows, random_state=random_state).copy()

    # Missing values are left to vectorize_text; nothing is dropped here.
    X, _vectorizer = vectorize_text(df, col1, col2)
    link = linkage(X.toarray(), 'ward')
    return X, link

def show_cluster_map(X, link):
    """
    Draw a seaborn cluster map of ``X`` and display it.

    ``X`` must expose ``toarray()``; its dense form is plotted with the rows
    ordered by the precomputed linkage ``link``. Columns are not clustered.
    """
    dense_rows = X.toarray()
    plot_options = {'row_linkage': link, 'col_cluster': False}
    sns.clustermap(dense_rows, **plot_options)
    plt.show()
    


In [None]:
# Time the vectorize + linkage step on the recall table.
# NOTE(review): this passes the raw 'recalling_firm' / 'device_name' columns, not the
# 'preproc_*' columns added to df_recall earlier — confirm which was intended.
start = time.time()
X, link = cluster(df_recall, 'recalling_firm', 'device_name')
print(f'------------ elapsed train time: {time.time() - start} seconds -------------')

In [None]:
# Time the rendering of the cluster map; relies on X and link from the clustering cell.
start = time.time()
show_cluster_map(X, link)
print(f'------------ elapsed plot time: {time.time() - start} seconds -------------')

In [None]:
########## I DID NOT GET THIS TO RUN #########