In [None]:
import pandas as pd
import pickle
import csv
import matplotlib.pyplot as plt
from simpletransformers.language_representation import RepresentationModel
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import pairwise_distances
import numpy as np

In [None]:
# Read the pickled prediction table into a DataFrame.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
data = pd.read_pickle('./predictions/all_primary_10.pkl')

# Uncomment to rename the prediction column if the file uses a different name
# data = data.rename(columns={'pred_lqhq_data3' : 'predictions'})

# Show the available columns so the structure of the loaded data is visible
print(data.columns)

# Category labels; the position of a label is used below to index each row's
# prediction and confidence sequences
categories = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM', 'O']

# Lookup table: category label -> position in `categories`
cat_indices = dict(zip(categories, range(len(categories))))

# Expand the per-row sequences into one flat column per category
for cat in cat_indices:
    index = cat_indices[cat]

    # '<CAT>': the element at this category's position in 'pred_jenia_M3'
    data[cat] = data['pred_jenia_M3'].apply(lambda row_preds: row_preds[index])

    # '<CAT>_confidence': the element at the same position in 'confidence_jenia_M3_ft'
    data[f'{cat}_confidence'] = data['confidence_jenia_M3_ft'].apply(lambda row_conf: row_conf[index])

# Per-category views holding only the rows where that category is predicted (== 1).
# Kept for manual inspection.
only_adm = data.loc[data['ADM'].eq(1)]
only_att = data.loc[data['ATT'].eq(1)]
only_ber = data.loc[data['BER'].eq(1)]
only_enr = data.loc[data['ENR'].eq(1)]
only_etn = data.loc[data['ETN'].eq(1)]
only_fac = data.loc[data['FAC'].eq(1)]
only_ins = data.loc[data['INS'].eq(1)]
only_mbw = data.loc[data['MBW'].eq(1)]
only_stm = data.loc[data['STM'].eq(1)]
only_o = data.loc[data['O'].eq(1)]


In [None]:
# Result grid: one row per statistic, one column per category (cells start as NaN)
stats_df = pd.DataFrame(index=['Mean','Max','Min','Median'], columns=categories)

# Fill the grid category by category
for cat in categories:
    # Name of the column holding this category's confidence scores
    conf_col = f"{cat}_confidence"

    # Skip categories for which either column is missing; their cells stay NaN
    if cat not in data.columns or conf_col not in data.columns:
        continue

    # Only rows where the category is predicted (== 1) contribute to the statistics
    filtered_df = data[data[cat] == 1]

    # Each pair is (row label in stats_df, Series method that computes the value)
    for stat_label, method_name in (
        ('Mean', 'mean'),
        ('Max', 'max'),
        ('Min', 'min'),
        ('Median', 'median'),
    ):
        stats_df.loc[stat_label, cat] = getattr(filtered_df[conf_col], method_name)()

# Flip the grid so categories become rows and statistics become columns
stats_df = stats_df.transpose()

# Show the table as the cell output
stats_df


In [None]:
# Optional: show every column when a DataFrame is displayed (currently commented out)
# pd.set_option('display.max_columns', None)

# Confidence window and category to inspect
conf_max = 0.23  # inclusive upper bound of the window
conf_min = 0.19  # inclusive lower bound of the window
cat = 'ATT'      # category whose confidence column is inspected

# Rows whose confidence for `cat` lies inside [conf_min, conf_max]
filtered_conf_df = data[data[f'{cat}_confidence'].between(conf_min, conf_max)]

# Column-wise sum of the category columns within the window,
# i.e. how many positive predictions each category has there
conf_categories = filtered_conf_df[categories].sum()

# Of those rows, keep the ones where `cat` itself is predicted (== 1)
only_positive_filtered_conf_df = filtered_conf_df.loc[filtered_conf_df[cat].eq(1)]

# Show the positively predicted rows inside the window as the cell output
only_positive_filtered_conf_df


In [None]:
# Functionalized version of the operation above to apply in bulk

def get_instances(data, category, max_conf, min_conf):
    """
    Select the unique, positively predicted texts of one category whose
    confidence score lies inside a closed interval.

    Parameters:
    data (DataFrame): Input frame. Must contain the columns `category`,
        f'{category}_confidence', 'NoteID', 'text' and 'pred_jenia_M3'.
    category (str): The category of interest (e.g. 'ATT').
    max_conf (float): Upper confidence bound (inclusive).
    min_conf (float): Lower confidence bound (inclusive).

    Returns:
    DataFrame: A new frame with the columns ['NoteID', 'text', 'pred_jenia_M3'],
        holding one row per distinct 'text' (the first occurrence is kept).
        Rows keep their original index labels. The frame is empty when no row
        matches.

    Raises:
    KeyError: If one of the required columns is missing from `data`.
    """
    # Rows whose confidence for this category is inside [min_conf, max_conf];
    # `between` is inclusive on both ends, and NaN scores never match
    in_range = data[f'{category}_confidence'].between(min_conf, max_conf)

    # Rows where the category is actually predicted
    is_positive = data[category] == 1

    # Apply both conditions at once, keep only the columns needed downstream,
    # then collapse rows that share the same text
    selected = data.loc[in_range & is_positive, ['NoteID', 'text', 'pred_jenia_M3']]
    return selected.drop_duplicates(subset=['text'])

# Confidence window shared by all category queries below
conf_max = 0.23  # inclusive upper bound
conf_min = 0.19  # inclusive lower bound

# Run get_instances once per category (in the order listed) and bind each
# result to its own instances_<category> name
(
    instances_adm,
    instances_att,
    instances_ber,
    instances_enr,
    instances_etn,
    instances_fac,
    instances_ins,
    instances_mbw,
    instances_stm,
) = (
    get_instances(data, label, conf_max, conf_min)
    for label in ('ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM')
)


In [None]:
# Optional: stack the per-category slices from above into one frame.
# Rows are appended in the order listed and the index is renumbered from 0.
combined_high = pd.concat(
    [
        instances_adm,
        instances_att,
        instances_ber,
        instances_enr,
        instances_etn,
        instances_fac,
        instances_ins,
        instances_mbw,
        instances_stm,
    ],
    axis=0,
    ignore_index=True,
)

# To use the merged set as training data, expose the predictions as 'labels_10'.
# The slices carry the predictions in 'pred_jenia_M3'; the previous rename key
# 'predictions' matched no column, so the rename silently did nothing.
combined_high = combined_high.rename(columns={'pred_jenia_M3': 'labels_10'})

### Redundancy Elimination Process

- Cosine Similarity Based Clustering via DBSCAN

In [None]:
# Build the sentence-embedding model: a RoBERTa-type RepresentationModel loaded
# from a local path. use_cuda=True requests GPU execution.
model = RepresentationModel(model_type="roberta", model_name="./models/jenia_M1", use_cuda=True)  # model_name: desired model path

def remove_short_words(text):
    """
    Drop every whitespace-separated token that has fewer than 4 characters.

    Parameters:
    text (str): The input text.

    Returns:
    str: The remaining tokens in their original order, joined by single spaces
        (an empty string when no token is long enough).
    """
    long_enough = (token for token in text.split() if not len(token) < 4)
    return ' '.join(long_enough)

# Frame to de-duplicate and cluster.
# NOTE(review): `instances` was never assigned at notebook level (it only exists
# as a local inside get_instances), so this cell raised NameError on a fresh
# kernel. The CSV written further down is named for ATT, so the ATT slice is
# assumed here - confirm, or swap in another instances_* frame.
# Working on a copy keeps instances_att itself untouched.
instances = instances_att.copy()

# Normalize the text: strip surrounding whitespace, lowercase, and collapse
# every whitespace run into a single space
instances['normalized'] = instances['text'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True)

# Drop words shorter than 4 characters from the normalized text
instances['reduced'] = instances['normalized'].apply(remove_short_words)

# Remove rows whose *reduced* text is identical (first occurrence is kept).
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a flagged slice (SettingWithCopyWarning).
instances = instances.drop_duplicates(subset=['reduced']).copy()

# Encode the reduced text with the model to generate one embedding per row
embeddings = model.encode_sentences(instances['reduced'].tolist(), combine_strategy="mean")

# Pairwise cosine distance matrix of the embeddings.
# Kept for inspection; DBSCAN below computes its own distances from `embeddings`.
distance_matrix = cosine_distances(embeddings)

# Cluster near-identical embeddings: two rows are neighbours when their cosine
# distance is at most eps, and a cluster needs at least min_samples rows
clustering = DBSCAN(eps=0.02, min_samples=2, metric='cosine').fit(embeddings)

# One label per row, in the same order as `instances`;
# DBSCAN labels rows that belong to no cluster as -1 (noise)
labels = clustering.labels_
#print(list(labels)) # Optionally print the labels

# Attach the cluster labels to the rows
instances['cluster'] = labels

# Distinct cluster labels (for inspection)
unique_clusters = instances['cluster'].unique()

# Number of rows per cluster label (for inspection)
cluster_counts = instances['cluster'].value_counts()

# Keep at most the first 10 rows of every cluster label (the -1 noise group
# included); reset_index moves the original index into an 'index' column
per_cluster = instances.groupby('cluster').head(10).reset_index()

# Only 'NoteID' and 'text' of the per-cluster selection
clustered = per_cluster[['NoteID', 'text']]

# Reduce `instances` itself to 'NoteID' and 'text' (helper columns are dropped)
instances = instances[['NoteID', 'text']]

# Optionally reset the pandas display option for maximum rows (currently commented out)
# pd.reset_option('display.max_rows', None)


In [None]:
# Persist the merged 'combined_high' frame as a pickle file
combined_high.to_pickle(path='./data/queried_data/high_conf_pseudo_m3-17k.pkl')

# Export 'instances' as CSV so it can be prepared further and sent for annotation.
# The DataFrame index is written as the first column (pandas default).
instances.to_csv(path_or_buf='./data/queried_data/2023_Notes_3rd_750k/notes_5th750k_att_0146-012-509-M2-ATT1.csv')

In [None]:
# Histogram of the pairwise cosine distances between distinct embeddings.
# Useful for judging the DBSCAN eps threshold against the actual distance scale.

# Full symmetric distance matrix (the diagonal holds each row's distance to itself, 0)
distances = pairwise_distances(embeddings, metric='cosine')

# Keep every unordered pair exactly once: the strict upper triangle skips the
# zero diagonal and the mirrored lower half. Flattening the whole matrix would
# pin the minimum at 0 and count every pair twice.
pair_mask = np.triu(np.ones(distances.shape, dtype=bool), k=1)
pair_distances = distances[pair_mask]

if pair_distances.size == 0:
    # Fewer than two embeddings: there is no pair to measure
    print('Max dist:', None, 'Min dist', None)
else:
    print('Max dist:', pair_distances.max(), 'Min dist', pair_distances.min())

    fig, ax = plt.subplots()
    ax.hist(pair_distances, bins=50)
    # The plotted quantity is a distance, not a similarity
    ax.set_title('histogram of pairwise cosine distances')
    ax.set_xlabel('distance')
    ax.set_ylabel('freq')
    plt.show()

