In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class ParenteralAnalyzer:
    def __init__(self, image_dir):
        self.image_dir = Path(image_dir)
        self.metadata = None
        self.image_data = {}
        
    def load_metadata(self, metadata_path):
        """Load and process metadata CSV file"""
        self.metadata = pd.read_csv(metadata_path)
        print("Metadata Summary:")
        print(f"Total medications: {len(self.metadata)}")
        print("\nColumns:", self.metadata.columns.tolist())
        return self.metadata.head()
        
    def analyze_image_properties(self):
        """Analyze basic properties of images in the directory"""
        image_properties = []
        
        for img_path in self.image_dir.glob("*.[jJ][pP][gG]"):
            try:
                with Image.open(img_path) as img:
                    properties = {
                        'filename': img_path.name,
                        'size': img.size,
                        'mode': img.mode,
                        'format': img.format,
                        'file_size': os.path.getsize(img_path) / 1024  # KB
                    }
                    image_properties.append(properties)
            except Exception as e:
                print(f"Error processing {img_path}: {e}")
                
        return pd.DataFrame(image_properties)
    
    def find_similar_names(self, threshold=0.7):
        """Find medications with similar names using TF-IDF and cosine similarity"""
        if self.metadata is None or 'medication_name' not in self.metadata.columns:
            raise ValueError("Metadata not loaded or missing medication_name column")
            
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
        tfidf_matrix = vectorizer.fit_transform(self.metadata['medication_name'])
        
        similarity_matrix = cosine_similarity(tfidf_matrix)
        similar_pairs = []
        
        for i in range(len(similarity_matrix)):
            for j in range(i + 1, len(similarity_matrix)):
                if similarity_matrix[i, j] > threshold:
                    similar_pairs.append({
                        'med1': self.metadata['medication_name'].iloc[i],
                        'med2': self.metadata['medication_name'].iloc[j],
                        'similarity': similarity_matrix[i, j]
                    })
                    
        return pd.DataFrame(similar_pairs).sort_values('similarity', ascending=False)
    
    def visualize_metadata_distribution(self, column):
        """Visualize the distribution of a metadata column"""
        if self.metadata is None:
            raise ValueError("Metadata not loaded")
            
        plt.figure(figsize=(12, 6))
        if self.metadata[column].dtype in ['int64', 'float64']:
            sns.histplot(data=self.metadata, x=column)
        else:
            value_counts = self.metadata[column].value_counts()
            sns.barplot(x=value_counts.index, y=value_counts.values)
            plt.xticks(rotation=45, ha='right')
        
        plt.title(f'Distribution of {column}')
        plt.tight_layout()
        plt.show()                                                                                                             

In [3]:
analyzer = ParenteralAnalyzer("./Parenterals")
# 
# metadata file path
# metadata = analyzer.load_metadata("path/to/metadata.csv")
image_properties = analyzer.analyze_image_properties()
similar_meds = analyzer.find_similar_names(threshold=0.7)

ValueError: Metadata not loaded or missing medication_name column