Dataset Prep

In [239]:
import os
import pandas as pd
import json

# Source folders to ingest; every *.json file directly inside each one is loaded.
directories = ['social_posts', 'documents', 'people']

# Gather one frame per file and concatenate once after the loops. Calling
# pd.concat inside the loop re-copies all previously loaded rows for every file.
frames = []

for directory in directories:
    folder_path = directory

    # Sort the names so row order does not depend on the arbitrary order that
    # os.listdir returns.
    json_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.json'))

    for js in json_files:
        # Decode explicitly as UTF-8 instead of the platform default encoding.
        # NOTE(review): the recorded output contains mojibake such as "paã±o",
        # which is what a non-UTF-8 default produces — confirm the files are UTF-8.
        with open(os.path.join(folder_path, js), encoding='utf-8') as json_file:
            json_text = json.load(json_file)

        # Remove a top-level 'embeds' entry, if present, before flattening.
        if 'embeds' in json_text:
            del json_text['embeds']

        # Flatten the (possibly nested) JSON into tabular form.
        json_df = pd.json_normalize(json_text)
        frames.append(json_df)

# Fall back to an empty frame when no JSON file was found at all.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Drop the identifier / metadata columns that are not part of the text to embed.
df = df.drop(columns=['uuid', 'text_id', 'link', 'media', 'date'])

# Function to combine columns
def combine_columns(row):
    """Join a row's non-null field values into one comma-separated string.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row. The 'partition_name' and 'embeds' entries are
        excluded from the text; rows that lack either label are accepted.

    Returns
    -------
    str
        The remaining non-null values, converted to ``str`` and joined with
        ``', '``. Empty when nothing remains.
    """
    # errors='ignore' keeps this usable on frames where 'embeds' (or
    # 'partition_name') never materialised as a column; a plain drop would
    # raise KeyError for every row in that case.
    fields = row.drop(['partition_name', 'embeds'], errors='ignore')
    return ', '.join(fields.dropna().astype(str).values)

# Build the combined text for every row and normalise it to lowercase.
df['values'] = df.apply(combine_columns, axis=1).str.lower()

# Keep only the three working columns and discard rows whose text is empty.
has_text = df['values'] != ""
df_new = df.loc[has_text, ['values', 'partition_name', 'embeds']]
df = df_new

# Row count per partition after filtering.
partition_counts = df['partition_name'].value_counts()

def split_into_parts(row):
    """Split a row's 'values' text into roughly equal word chunks.

    The number of chunks depends on the whitespace-separated word count:
    more than 300 words -> 6 chunks, 200 to 300 -> 4 chunks, 90 to 199 -> 3
    chunks. Shorter texts are returned unchanged as a single string (not a
    list), which ``DataFrame.explode`` leaves as one row.

    Parameters
    ----------
    row : mapping with a 'values' key (e.g. a DataFrame row)

    Returns
    -------
    list[str] | str
        The chunks, each re-joined with single spaces, or the original text.
    """
    text = row['values']
    words = text.split()
    count = len(words)

    if count > 300:
        pieces = 6
    elif count >= 200:
        pieces = 4
    elif count >= 90:
        pieces = 3
    else:
        return text

    # Chunk k covers words[(k * count) // pieces : ((k + 1) * count) // pieces];
    # the final cut equals count, so the last chunk runs to the end.
    cuts = [(k * count) // pieces for k in range(pieces + 1)]
    return [' '.join(words[lo:hi]) for lo, hi in zip(cuts, cuts[1:])]

# Only social posts and people records are chunked; everything else passes through.
chunked_partitions = ['social_posts_partition', 'people_partition']
needs_chunking = df['partition_name'].isin(chunked_partitions)

# Work on a copy so the assignment below does not touch the original frame.
temp_df = df[needs_chunking].copy()

# Replace each text with its chunks, then give every chunk its own row.
temp_df['values'] = temp_df.apply(split_into_parts, axis=1)
temp_df = temp_df.explode('values')

# Untouched rows first, chunked rows after them, under a fresh 0..n-1 index.
df = pd.concat([df[~needs_chunking], temp_df], ignore_index=True)

# NOTE(review): ceil is not used in this section; the import is kept in case
# another cell relies on it.
from math import ceil

# Rows belonging to the documents partition, in their current order.
documents_partition_df = df[df['partition_name'] == 'documents_partition'].copy()

# Target number of merged document segments.
# NOTE(review): 189 equals the social_posts_partition row count shown in the
# recorded output — presumably chosen to balance the classes; confirm.
requested_segments = 189

# Never ask for more segments than there are rows: otherwise rows_per_segment
# is 0 and every segment except the last is an empty string that would still
# be embedded and used for training.
num_segments = min(requested_segments, len(documents_partition_df))

# Rows per segment, plus the remainder that is appended to the last segment.
if num_segments:
    rows_per_segment, extra_rows = divmod(len(documents_partition_df), num_segments)
else:
    rows_per_segment, extra_rows = 0, 0

document_texts = documents_partition_df['values']
segments = []
for i in range(num_segments):
    start_idx = i * rows_per_segment
    end_idx = start_idx + rows_per_segment
    if i == num_segments - 1:
        # The last segment absorbs the leftover rows.
        end_idx += extra_rows
    segment = ' '.join(document_texts.iloc[start_idx:end_idx])
    segments.append(segment)

# One row per merged segment; embeddings are left empty here and filled in
# when the 'embeds' column is recomputed.
new_documents_partition_df = pd.DataFrame({
    'values': segments,
    'partition_name': ['documents_partition'] * num_segments,
    'embeds': [None] * num_segments
})

# Swap the original document rows for the merged segments.
df = df[df['partition_name'] != 'documents_partition']
df = pd.concat([df, new_documents_partition_df], ignore_index=True)

# Report how many rows each partition now holds.
partition_counts = df['partition_name'].value_counts()
print(partition_counts)

# Prefix every social-post text with a fixed cue phrase. The right-hand side is
# computed for all rows and aligned by index; only the masked rows are written.
# Re-running this statement prefixes the same rows again.
is_social_post = df['partition_name'] == 'social_posts_partition'
df.loc[is_social_post, 'values'] = 'announcement is ' + df['values'].astype(str)

display(df)

partition_name
social_posts_partition    189
documents_partition       189
people_partition           43
Name: count, dtype: int64


Unnamed: 0,values,partition_name,embeds
0,"announcement is read | rev. fr. hernando coja,...",social_posts_partition,"[0.014242676086723804, -0.05774782598018646, 0..."
1,announcement is join the recollect community a...,social_posts_partition,"[0.009649019688367844, -0.0820484608411789, 0...."
2,announcement is #usjradvisory | all classes ar...,social_posts_partition,"[0.021807659417390823, -0.07729943096637726, 0..."
3,announcement is read | this is the second time...,social_posts_partition,"[0.005872523412108421, -0.09072086960077286, 0..."
4,announcement is ?????????????????????? ???? ??...,social_posts_partition,"[0.03550581634044647, -0.06325256079435349, 0...."
...,...,...,...
416,"jennifer d. paã±o, jeralden r. jumao-as, march...",documents_partition,
417,randy e. pederi mary gretchen f. chaves,documents_partition,
418,"teovy erdel bongcales, ariel balunan, loriemar...",documents_partition,
419,"steven elizalde, romeo patan mary gretchen chaves",documents_partition,


Embedding

In [240]:
import numpy as np
from scipy.spatial.distance import euclidean
import torch.nn.functional as F
from torch import Tensor
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states over dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are zeroed before summing, and
    the sum is divided by the number of non-zero mask entries per row.

    Assumes ``last_hidden_states`` is (batch, seq, hidden) and
    ``attention_mask`` is (batch, seq) — TODO confirm against callers.
    """
    padding = ~attention_mask.unsqueeze(-1).bool()
    token_sum = last_hidden_states.masked_fill(padding, 0.0).sum(dim=1)
    token_count = attention_mask.sum(dim=1).unsqueeze(-1)
    return token_sum / token_count

# Load the tokenizer and encoder once at module level; string_to_embedding
# reads both as globals on every call.
e5_checkpoint = 'intfloat/e5-large-v2'
tokenizer = AutoTokenizer.from_pretrained(e5_checkpoint)
transformer_model = AutoModel.from_pretrained(e5_checkpoint)

def string_to_embedding(text: str, prefix: str = 'query: '):
    """Embed one string with the module-level tokenizer and transformer model.

    Parameters
    ----------
    text : str
        Text to embed. Input longer than 512 tokens is truncated by the
        tokenizer.
    prefix : str, optional
        Marker prepended to ``text`` before tokenisation. Defaults to
        ``'query: '``, the value this function has always used.

    Returns
    -------
    list[float]
        The mean-pooled, L2-normalised embedding of the single input.
    """
    # Local import: the surrounding cell only imports torch.nn.functional and
    # Tensor, not the top-level torch package.
    import torch

    inputs = tokenizer(prefix + text, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip autograd bookkeeping so no graph is retained per call.
    with torch.no_grad():
        outputs = transformer_model(**inputs)

    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # A single string yields a batch of one; return that one vector.
    return embeddings[0].tolist()




In [241]:
# Register tqdm's progress_apply on pandas objects, then embed each row's text
# and overwrite the 'embeds' column with the result.
tqdm.pandas(desc="Embedding values")
value_embeddings = df['values'].progress_apply(string_to_embedding)
df['embeds'] = value_embeddings


Embedding values: 100%|███████████████████████| 421/421 [01:35<00:00,  4.39it/s]


In [242]:
# Display the frame so the recomputed 'embeds' column can be inspected.
df

Unnamed: 0,values,partition_name,embeds
0,"announcement is read | rev. fr. hernando coja,...",social_posts_partition,"[0.018143652006983757, -0.04928550496697426, 0..."
1,announcement is join the recollect community a...,social_posts_partition,"[0.01835092157125473, -0.0741904154419899, 0.0..."
2,announcement is #usjradvisory | all classes ar...,social_posts_partition,"[0.03154794126749039, -0.06222093477845192, 0...."
3,announcement is read | this is the second time...,social_posts_partition,"[0.015062374994158745, -0.07406798750162125, 0..."
4,announcement is ?????????????????????? ???? ??...,social_posts_partition,"[0.034660838544368744, -0.058060452342033386, ..."
...,...,...,...
416,"jennifer d. paã±o, jeralden r. jumao-as, march...",documents_partition,"[-0.007246498018503189, -0.04587141051888466, ..."
417,randy e. pederi mary gretchen f. chaves,documents_partition,"[-0.004172301385551691, -0.047382697463035583,..."
418,"teovy erdel bongcales, ariel balunan, loriemar...",documents_partition,"[-0.01086932048201561, -0.04749814048409462, 0..."
419,"steven elizalde, romeo patan mary gretchen chaves",documents_partition,"[0.00848416518419981, -0.025945544242858887, 0..."


Training

In [243]:
# LabelEncoder is used below but was not imported anywhere in the visible
# cells, so this cell failed with NameError on a fresh kernel.
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Transform the partition names into numerical labels
label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(df['partition_name'])

# Split the data into training and testing sets (80 / 20, fixed seed).
# NOTE(review): X_test / y_test are not used by any visible cell.
X_train, X_test, y_train, y_test = train_test_split(
    df['embeds'].tolist(), y_labels, test_size=0.2, random_state=42
)

# Initialize and train the model. probability=True enables predict_proba; its
# internal calibration shuffles the data, so seed it for repeatable rankings.
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

def rank_partitions(prompt_embedding):
    """Rank partition names from most to least probable for one embedded prompt.

    Parameters
    ----------
    prompt_embedding : sequence of float
        Embedding vector of the prompt (not the raw prompt text); it is passed
        to ``svm_model.predict_proba`` as a single-sample batch.

    Returns
    -------
    list
        Partition names ordered by descending predicted probability.
    """
    # Single-sample batch -> take the only row of probabilities.
    probabilities = svm_model.predict_proba([prompt_embedding])[0]

    # predict_proba columns follow svm_model.classes_ (the encoded labels the
    # model actually saw during fit), so decode exactly those labels rather
    # than assuming they line up with every class known to the encoder.
    class_names = label_encoder.inverse_transform(svm_model.classes_)

    # Stable sort: classes with equal probability keep their class order.
    ranked = sorted(zip(class_names, probabilities), key=lambda pair: pair[1], reverse=True)

    return [name for name, _probability in ranked]

# Example usage

Testing

In [248]:
# Example usage: rank_partitions expects an embedding vector, not raw text, so
# embed the prompt first. The prompt is lowercased because the training text
# was lowercased before embedding.
prompt = "Jovelyn Cuizon"
result = rank_partitions(string_to_embedding(prompt.lower()))
print(result)

['documents_partition', 'people_partition', 'social_posts_partition']


In [252]:
from joblib import load

# Load the model and the label encoder from the current working directory,
# rebinding the svm_model / label_encoder globals used by rank_partitions.
# NOTE(review): joblib files are pickle-based — only load artifacts you trust.
# NOTE(review): no visible cell writes these two files; confirm where they are
# produced so a fresh-kernel run can reach this point.
svm_model = load('svm_model.joblib')
label_encoder = load('label_encoder.joblib')

# Define the function
def rank_partitions(prompt_embedding):
    """Order the partition names by predicted probability, highest first.

    ``prompt_embedding`` is the already-computed embedding vector of a prompt;
    it is scored as a one-sample batch with the module-level ``svm_model`` and
    the probabilities are paired positionally with ``label_encoder.classes_``.
    """
    # One sample in, one row of class probabilities out.
    class_probabilities = svm_model.predict_proba([prompt_embedding])[0]

    # Pair each class name with its probability and sort in place, descending.
    scored = list(zip(label_encoder.classes_, class_probabilities))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Only the names are returned; the probabilities are discarded.
    return [name for name, _score in scored]


In [256]:
# Embed the lowercased prompt, then rank the partitions for it.
prompt = "What was announced based on the uniform measurement?"
prompt_embedding = string_to_embedding(prompt.lower())
result = rank_partitions(prompt_embedding)
print(result)


['social_posts_partition', 'documents_partition', 'people_partition']
