In [None]:
from mlxtend.preprocessing import TransactionEncoder 
from mlxtend.frequent_patterns import apriori, association_rules 

import cv2 
import dlib 
import face_recognition 
from mtcnn.mtcnn import MTCNN 

import numpy as np
import pandas as pd 
import glob 
from collections import defaultdict, Counter 
import os 
from PIL import Image 
from numpy import savez_compressed 
from numpy import asarray 
from os import listdir 
import shutil 
from sklearn.model_selection import train_test_split 

from IPython.display import display_html 
from IPython.display import display, HTML 
from matplotlib import pyplot as plt 
from matplotlib.patches import Rectangle 
from matplotlib.patches import Circle 

import re 
from itertools import combinations 
from ast import literal_eval 

This code utilizes various libraries for data processing, machine learning, and computer vision tasks. Libraries like mlxtend are used for pattern mining and association rule learning, while cv2 (OpenCV), dlib, face_recognition, and MTCNN provide robust tools for face detection and recognition. NumPy and pandas handle numerical computations and data manipulation, respectively. Modules like glob, os, and shutil facilitate file and directory management, and Pillow (PIL) is used for image processing. Scikit-learn supports machine learning model evaluation, while matplotlib aids in data visualization. The code also uses modules such as re for regular expressions, itertools for combinations, and ast for safe evaluation of Python literals.

## EXTRACT DATA USING FACE RECOGNITION

In [None]:
# Directory that stores one ``.npy`` face embedding per known person for the
# current gallery.
# NOTE: this depends on ``pengguna_user``, which is assigned in a later cell of
# this notebook; run that cell first or this line raises NameError.
embedding_dir = f'./known_embeddings_{pengguna_user}'
os.makedirs(embedding_dir, exist_ok=True)

def save_embedding(person_id, face_encoding):
    """Write *face_encoding* to ``person_<n>.npy`` inside ``embedding_dir``.

    ``<n>`` is whatever follows the last underscore of *person_id*, so both
    ``"7"`` and ``"person_7"`` are stored as ``person_7.npy``.
    """
    *_, suffix = person_id.split('_')
    target = os.path.join(embedding_dir, f'person_{suffix}.npy')
    np.save(target, face_encoding)

def load_embeddings():
    """Return ``{file_stem: array}`` for every ``*.npy`` file in ``embedding_dir``.

    The key is the file name without its extension (e.g. ``person_3``).
    """
    pattern = os.path.join(embedding_dir, '*.npy')
    return {
        os.path.splitext(os.path.basename(path))[0]: np.load(path)
        for path in glob.glob(pattern)
    }

def find_best_match(face_encoding, known_embeddings):
    """Return the number of the closest known person, or ``None``.

    The Euclidean distance between *face_encoding* and every stored encoding
    is computed; the nearest entry wins, but only if its distance is below
    0.6. The returned value is the part of the winning key after its last
    underscore (``"person_4"`` -> ``"4"``).
    """
    closest_key = None
    closest_distance = float('inf')

    for key, reference in known_embeddings.items():
        candidate = np.linalg.norm(face_encoding - reference)
        if not candidate < closest_distance:
            continue
        closest_distance = candidate
        closest_key = key

    # Adjust threshold as needed
    if closest_key is None or not closest_distance < 0.6:
        return None
    return closest_key.split('_')[-1]

def _next_person_number(known_embeddings):
    """Return one more than the highest numeric person id among the known keys."""
    numbers = []
    for key in known_embeddings:
        suffix = str(key).split('_')[-1]
        if suffix.isdigit():
            numbers.append(int(suffix))
    return max(numbers, default=0) + 1


def process_image(image_path, output_base_dir, known_embeddings, photo_id):
    """Detect, identify and crop every face found in one image.

    Each detected face is matched against *known_embeddings*; an unmatched
    face gets a new person number, its embedding is saved and it is added to
    *known_embeddings* (the dict is mutated in place). The cropped face is
    written to ``<output_base_dir>/person_<n>/person_<n>_face_<k>.jpg``.

    Args:
        image_path: Path of the photo to analyse.
        output_base_dir: Root directory for the per-person face folders.
        known_embeddings: Mapping of person key -> face encoding.
        photo_id: Identifier stored in the returned record.

    Returns:
        A dict with ``photo_id``, ``photo_name``, ``person_id`` (comma-joined
        ``<n>_<k>`` ids) and ``person`` (list of ``person_<n>_face_<k>``
        names). An unreadable image yields a record with no faces.
    """
    person_ids = []
    person_names = []

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable or corrupt file by returning None.
        print(f"Skipping unreadable image: {image_path}")
    else:
        # OpenCV loads pixels as BGR while face_recognition works on RGB, so
        # detect/encode on a converted copy and keep the BGR original for
        # cv2.imwrite. NOTE(review): embeddings saved by earlier runs were
        # computed from BGR pixels; consider regenerating them.
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        face_locations = face_recognition.face_locations(rgb_image)
        face_encodings = face_recognition.face_encodings(rgb_image, face_locations)

        for face_location, face_encoding in zip(face_locations, face_encodings):
            top, right, bottom, left = face_location
            face_image = image[top:bottom, left:right]

            person_id = find_best_match(face_encoding, known_embeddings)

            if not person_id:
                # Use max+1 rather than len+1 so a gap in the numbering can
                # never make a new person overwrite an existing embedding.
                person_id = f'{_next_person_number(known_embeddings)}'
                save_embedding(person_id, face_encoding)
                known_embeddings[person_id] = face_encoding

            person_dir = os.path.join(output_base_dir, f'person_{person_id}')
            os.makedirs(person_dir, exist_ok=True)

            face_index = len(glob.glob(os.path.join(person_dir, "*.jpg"))) + 1
            face_filename = os.path.join(person_dir, f'person_{person_id}_face_{face_index}.jpg')
            cv2.imwrite(face_filename, face_image)

            person_ids.append(f"{person_id}_{face_index}")
            person_names.append(f'person_{person_id}_face_{face_index}')

    return {
        'photo_id': photo_id,
        'photo_name': os.path.basename(image_path),
        'person_id': ','.join(person_ids),
        'person': person_names
    }

def process_images(image_directory, output_base_dir, existing_dataset_path):
    """Process every not-yet-seen ``*.jpg`` in a directory and update the dataset CSV.

    Photos whose file name already appears in the CSV at
    *existing_dataset_path* are skipped. New photos get consecutive
    ``photo_id`` values continuing after the highest stored id.

    Args:
        image_directory: Folder containing the input ``*.jpg`` photos.
        output_base_dir: Root directory for the cropped per-person faces.
        existing_dataset_path: CSV holding previously processed photos; it is
            rewritten only when at least one new photo was processed.

    Returns:
        DataFrame with the previously stored rows followed by the new rows.
    """
    # Sort so that photo ids do not depend on the file-system listing order.
    image_paths = sorted(glob.glob(os.path.join(image_directory, '*.jpg')))

    existing_df = pd.DataFrame()
    if os.path.exists(existing_dataset_path):
        try:
            existing_df = pd.read_csv(existing_dataset_path)
        except pd.errors.EmptyDataError:
            # A CSV without even a header row is treated as "no data yet".
            existing_df = pd.DataFrame()

    if existing_df.empty:
        processed_photos = set()
        start_photo_id = 1
    else:
        processed_photos = set(existing_df['photo_name'])
        start_photo_id = int(existing_df['photo_id'].max()) + 1

    dataset = []
    known_embeddings = load_embeddings()
    current_photo_id = start_photo_id

    for image_path in image_paths:
        if os.path.basename(image_path) in processed_photos:
            continue

        dataset.append(process_image(image_path, output_base_dir, known_embeddings, current_photo_id))
        current_photo_id += 1

    if not dataset:
        # Nothing changed, so leave the CSV on disk untouched.
        print("tidak ada tambahan foto baru")
        return existing_df

    new_df = pd.DataFrame(dataset)
    # ignore_index avoids duplicate row labels in the combined frame.
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    combined_df.to_csv(existing_dataset_path, index=False)
    print(f"ada tambahan foto baru sebanyak {len(new_df)} foto, data foto menjadi {len(combined_df)}")
    return combined_df

This code handles face detection, recognition, and dataset management for images. It uses pre-trained face encodings to identify known faces or assign new IDs to unknown ones by comparing face encodings and saving new embeddings when needed. Detected faces are cropped, saved in corresponding directories, and named with unique identifiers. The code processes all images in a specified directory, skipping already-processed ones, and updates a dataset CSV file with information about the new images and their detected faces, ensuring that all data is up-to-date and consistent with the current state of the image repository.

In [13]:
# Gallery/user identifier. It names the input image folder and is embedded in
# the output paths built elsewhere in this notebook (dataset CSVs, embeddings
# directory, rules file). Replace the placeholder with your actual gallery name.
pengguna_user = '( GALLERY_NAME )' #PLEASE CHANGE STRING NAME TO YOUR ACTUAL GALLERY NAME IN THIS CELL

In [3]:
# Input photos are read from a folder named after the gallery.
image_dir = f'./{pengguna_user}'
# Cropped faces are written below this folder, one sub-folder per person.
output_base_dir = f'./clusterExtract/{pengguna_user}'
# CSV that records which photos were processed and which faces they contain
# (state before the manual/system correction step).
existing_dataset_path = f'./face_dataset_{pengguna_user}_before_koreksi_sistem.csv'

In [None]:
# Run face extraction on all new photos; returns old + new dataset rows.
df_process = process_images(image_dir, output_base_dir, existing_dataset_path)

In [None]:
# Keep only photos in which at least one face was detected.
df_correct = df_process.dropna()
# Rows loaded from the CSV hold the text "[]" while freshly processed rows hold
# a real empty list; comparing the string form catches both representations.
df_correct = df_correct[df_correct['person'].astype(str) != '[]']
df_correct

In [None]:
# Overwrite the dataset CSV with the filtered rows.
# NOTE(review): photos dropped here no longer appear in the CSV, so
# process_images() will not recognise them as processed on the next run.
df_correct.to_csv(existing_dataset_path, index=False)

To use this code, follow these steps:

1. **Set User Parameters**: Define the user identifier (`pengguna_user`) to specify which dataset and directories to use.
2. **Define Directories and Paths**: Set the `image_dir` for input images, `output_base_dir` for saving processed images, and `existing_dataset_path` for the CSV file that tracks the face data.
3. **Run the Processing Function**: Call `process_images(image_dir, output_base_dir, existing_dataset_path)` to detect and recognize faces in the specified directory, save them in organized folders, and update the dataset CSV.
4. **Clean the Dataset**: Filter the processed DataFrame (`df_process`) to remove empty or invalid entries, then save the cleaned dataset back to the CSV file.

By following these steps, you will maintain an updated dataset of recognized faces and newly detected ones.

## CORRECTION TOOLS

In [6]:
def get_highest_face_index(directory):
    """Return the largest ``<k>`` among ``person_<n>_face_<k>.jpg`` files in *directory*.

    ``<n>`` is taken from the directory name (text after its last underscore).
    Only files that belong to exactly that person are counted, so a file
    ``person_13_face_9.jpg`` lying in ``person_1`` is ignored.

    Args:
        directory: Path of a ``person_<n>`` folder.

    Returns:
        The highest face index found, or 0 when the folder is missing or has
        no matching files.
    """
    if not os.path.exists(directory):
        return 0

    # normpath drops a trailing separator that would make basename() empty.
    person_number = os.path.basename(os.path.normpath(directory)).split('_')[-1]
    # The underscore after the number keeps "person_1" from matching "person_13".
    name_pattern = re.compile(rf'person_{re.escape(person_number)}_face_(\d+)')

    indices = []
    for face in glob.glob(os.path.join(directory, "*.jpg")):
        match = name_pattern.match(os.path.basename(face))
        if match:
            indices.append(int(match.group(1)))

    return max(indices, default=0)


def rename_face_images(output_base_dir, old_name, new_name):
    """Rename a face image inside the folder of the person named in *new_name*.

    The file is expected to already sit in ``person_<new id>`` (it was moved
    there by hand) while still carrying its old file name.

    Args:
        output_base_dir: Root directory that contains the ``person_<n>`` folders.
        old_name: Current file name without extension.
        new_name: Desired file name without extension (``person_<n>_face_<k>``).

    Raises:
        FileExistsError: If the target file already exists; overwriting it
            would silently destroy another face image.
    """
    new_person_id = new_name.split('_')[1]
    person_dir = os.path.join(output_base_dir, f'person_{new_person_id}')

    old_path = os.path.join(person_dir, f"{old_name}.jpg")
    new_path = os.path.join(person_dir, f"{new_name}.jpg")

    if os.path.exists(new_path):
        raise FileExistsError(f"Refusing to overwrite existing face image: {new_path}")
    os.rename(old_path, new_path)


def check_folder_consistency(output_base_dir, df):
    """Rename face files that sit in the wrong person's folder and fix *df* to match.

    For every ``person_<n>`` folder, each ``person_<m>_face_<k>.jpg`` with
    ``m != n`` is treated as a face that was moved there by hand. If *df*
    references that face, the file is renamed to the next free
    ``person_<n>_face_<k'>.jpg`` and the ``person`` / ``person_id`` cells that
    mention it are rewritten. *df* is modified in place.

    Args:
        output_base_dir: Root directory containing the ``person_<n>`` folders.
        df: Dataset with a comma-separated ``person_id`` column and a
            ``person`` column holding stringified lists of face names.

    Returns:
        The same DataFrame object, after the updates.
    """
    print("Checking folder consistency...")
    face_file_pattern = re.compile(r'^person_(\d+)_face_(\d+)\.jpg$')

    for person_folder in glob.glob(os.path.join(output_base_dir, 'person_*')):
        person_id = os.path.basename(person_folder).split('_')[-1]
        expected_person_id = f'person_{person_id}'
        print(f"Checking folder: {person_folder}")

        for face_image in glob.glob(os.path.join(person_folder, '*.jpg')):
            # Parse the file name only: splitting the full path on "_" breaks
            # as soon as any directory name contains an underscore.
            parsed = face_file_pattern.match(os.path.basename(face_image))
            if parsed is None:
                print(f"Skipping file with unexpected name: {face_image}")
                continue

            old_person_id, old_face_index = parsed.groups()
            if old_person_id == person_id:
                continue

            old_img_id = f'{old_person_id}_{old_face_index}'
            print(f"File person_{old_person_id} does not match expected person_id {expected_person_id}")
            new_face_index = get_highest_face_index(person_folder) + 1
            new_img_id = f'{person_id}_{new_face_index}'

            # Non-string cells (e.g. NaN) cannot reference any face.
            mask = pd.Series(
                [isinstance(value, str) and old_img_id in value.split(',')
                 for value in df['person_id']],
                index=df.index,
                dtype=bool,
            )
            matching_rows = df[mask]
            if matching_rows.empty:
                print(f"No dataset entry references {old_img_id}; file left unchanged: {face_image}")
                continue

            old_person_name = f'person_{old_person_id}_face_{old_face_index}'
            new_person_name = f'{expected_person_id}_face_{new_face_index}'
            rename_face_images(output_base_dir, old_person_name, new_person_name)

            for index, row in matching_rows.iterrows():
                # literal_eval parses the stored list text without executing code.
                person_list = row['person'] if isinstance(row['person'], list) else literal_eval(row['person'])
                person_id_list = row['person_id'].split(',')
                updated_person_list = [new_person_name if p == old_person_name else p for p in person_list]
                updated_person_id_list = [new_img_id if p == old_img_id else p for p in person_id_list]
                df.at[index, 'person'] = str(updated_person_list)
                df.at[index, 'person_id'] = ','.join(updated_person_id_list)
                print(f"Updated entry: {old_person_name} -> {new_person_name}")
                print(f"From photo_id: {matching_rows['photo_id'].to_string(index=False)}")

    return df

This code ensures consistency between folder names and image file names in a face recognition dataset. It checks each person's folder to verify that image filenames match their corresponding directory and renames any mismatched files to maintain proper formatting. It updates the associated DataFrame by replacing old IDs and names with corrected ones, ensuring all data entries accurately reflect the current state of the images and folders. The process helps maintain a clean and organized dataset for reliable face recognition and identification tasks.

In [None]:
# Reload the dataset written by the extraction step. A copy is used for the
# correction so the original frame stays available for the before/after check
# (check_folder_consistency modifies the frame it receives in place).
df_correct = pd.read_csv(f'./face_dataset_{pengguna_user}_before_koreksi_sistem.csv')
df_correct_temp = df_correct.copy()

In [None]:
# Rename misplaced face files on disk and update the dataset copy accordingly.
df_correct_after = check_folder_consistency(output_base_dir, df_correct_temp)

In [8]:
# Persist the corrected dataset under the "after_koreksi_sistem" file name.
df_correct_after.to_csv(f'./face_dataset_{pengguna_user}_after_koreksi_sistem.csv', index=False)

To use this code:

1. **Prepare Face Clusters**: Before executing the code, ensure that face clusters are correctly assigned to the corresponding person folders. For instance, if you find faces in the `person_1` folder that actually belong to `person_13`, move those images to the `person_13` folder. Perform this check across all folders. This process is time-consuming and requires consistent effort but is crucial for accurate results due to the current limitations of face detection methods.

2. **Load the Initial Dataset**: Load the dataset (`face_dataset_{pengguna_user}_before_koreksi_sistem.csv`) into a DataFrame.

3. **Check and Correct Folder Consistency**: Run the `check_folder_consistency` function to verify that face image filenames match their corresponding person folders. This function will rename mismatched files and update the DataFrame accordingly.

4. **Save the Corrected Dataset**: Save the updated DataFrame to a new CSV file (`face_dataset_{pengguna_user}_after_koreksi_sistem.csv`) to ensure that the dataset reflects all corrections.

By following these steps, the dataset will remain consistent and accurate for face recognition tasks.

In [None]:
import ast

def count_person_2_face(person_list_str):
    """Count the ``person_2_face_<n>`` names in a stringified list of face names."""
    wanted = re.compile(r'person_2_face_\d+')
    names = ast.literal_eval(person_list_str)
    return len([name for name in names if wanted.match(name)])

# Total number of person_2 faces referenced by the dataset before and after
# the folder-consistency correction.
total_count_before = df_correct['person'].apply(count_person_2_face).sum()
total_count_after = df_correct_after['person'].apply(count_person_2_face).sum()

# "JUMLAH" is Indonesian for "total".
print(f'JUMLAH Before: {total_count_before}')
print(f'JUMLAH After: {total_count_after}')

This code compares the number of occurrences of `person_2_face` in the dataset before and after running the consistency check and correction.

1. **Define a Counting Function**: `count_person_2_face` parses the `person` column entries to count how many times a pattern like `person_2_face_x` appears.
2. **Calculate Total Counts**: It calculates the total count of `person_2_face` entries before (`df_correct`) and after (`df_correct_after`) the consistency check.
3. **Display Results**: The results are printed to show the differences, helping to confirm whether the data corrections were applied properly.

**Example Output**:
```
JUMLAH Before: 15
JUMLAH After: 13
```

This output indicates that there were 15 instances of `person_2_face` before corrections and 13 after, suggesting some inconsistencies were resolved.

## DATA CLEANING

In [11]:
def remove_after_second_underscore(person_list):
    """Trim every name to its first two underscore-separated parts.

    ``person_3_face_7`` becomes ``person_3``; names with at most one
    underscore are returned unchanged.
    """
    return ['_'.join(name.split('_')[:2]) for name in person_list]

def remove_after_first_underscore(person_list):
    """Keep only the text before the first underscore of every entry.

    ``40_3`` becomes ``40``; entries without an underscore are unchanged.
    """
    return [entry.partition('_')[0] for entry in person_list]

def modify_person_column(current_df):
    """Reduce face-level labels to person-level labels, in place.

    ``person`` cells (stringified lists such as ``"['person_1_face_2']"``)
    become stringified lists of ``person_<n>`` names; ``person_id`` cells
    (``"1_2,3_1"``) become ``", "``-joined person numbers (``"1, 3"``).
    Returns the same DataFrame object.
    """
    def _to_person_labels(raw):
        # Undo the list formatting by hand: drop brackets and quotes first.
        flattened = raw.strip("[]").replace("'", "")
        shortened = remove_after_second_underscore(flattened.split(','))
        return str([label.strip() for label in shortened])

    def _to_person_numbers(raw):
        return ', '.join(remove_after_first_underscore(raw.split(',')))

    current_df['person'] = current_df['person'].apply(_to_person_labels)
    current_df['person_id'] = current_df['person_id'].apply(_to_person_numbers)
    return current_df


def remove_values_from_list(lst, values):
    """Return a new list holding the items of *lst* that are not in *values*."""
    kept = []
    for item in lst:
        if item in values:
            continue
        kept.append(item)
    return kept


def remove_values_from_string(s, values):
    """Drop the entries of a ``", "``-separated string that appear in *values*."""
    tokens = s.split(', ')
    survivors = filter(lambda token: token not in values, tokens)
    return ', '.join(survivors)


def list_to_string(lst):
    """Return ``str(lst)`` with every double-quote character removed."""
    rendered = str(lst)
    return ''.join(rendered.split('"'))

def clean_data(df, values_to_remove, ids_to_remove):
    """Strip unwanted persons from the ``person`` and ``person_id`` columns, in place.

    ``person`` must hold real lists on entry and holds their string form on
    exit; ``person_id`` holds ``", "``-separated person numbers.
    Returns the same DataFrame object.
    """
    def _filter_and_render(names):
        return list_to_string(remove_values_from_list(names, values_to_remove))

    def _filter_ids(id_text):
        return remove_values_from_string(id_text, ids_to_remove)

    df['person'] = df['person'].apply(_filter_and_render)
    df['person_id'] = df['person_id'].apply(_filter_ids)
    return df


# Person to discard during cleaning: person 40 for gallery '1', otherwise the
# placeholder person 999.
values_to_remove, ids_to_remove = (
    ({'person_40'}, {'40'}) if pengguna_user == '1' else ({'person_999'}, {'999'})
)


This code is designed to clean and standardize the person and person_id columns in a dataset. It modifies these columns by removing unwanted parts after the first or second underscore in each entry, depending on the context. The functions remove_after_second_underscore and remove_after_first_underscore handle these modifications, while modify_person_column applies them to the DataFrame. Additionally, the clean_data function removes specific unwanted values (e.g., person_40 or person_999) from the columns based on the user context (pengguna_user). This ensures the dataset is properly formatted and cleaned of irrelevant or redundant entries.

In [None]:
# Load the corrected dataset from disk and clean a copy of it. Copying the
# frame that was just read (rather than the in-memory ``df_correct_after``)
# lets this step run on its own, e.g. after a kernel restart.
df = pd.read_csv(f'./face_dataset_{pengguna_user}_after_koreksi_sistem.csv')
df_temp = df.copy()

In [12]:
# 1) Collapse face-level labels to person-level labels (cells stay strings).
df_cleaned = modify_person_column(df_temp)
# 2) clean_data filters real lists, so parse the stringified lists first.
df_cleaned['person'] = df_cleaned['person'].apply(literal_eval)
# 3) Remove the unwanted person(s); 'person' ends up as a string again.
df_cleaned = clean_data(df_cleaned, values_to_remove, ids_to_remove)

In [None]:
# Persist the cleaned dataset and display it.
df_cleaned.to_csv(f'./face_dataset_{pengguna_user}_after_koreksi_sistem_clean.csv', index=False)
df_cleaned

To use this code:

1. **Load the Dataset**: Read the dataset from `face_dataset_{pengguna_user}_after_koreksi_sistem.csv`.
2. **Copy and Modify Data**: Create a temporary copy of the DataFrame, then apply `modify_person_column` to standardize and clean the `person` column.
3. **Apply Additional Cleaning**: Convert the `person` column entries back from string to list format using `literal_eval`, then apply `clean_data` to remove specific unwanted values.
4. **Save Cleaned Data**: Save the cleaned DataFrame to a new CSV file, `face_dataset_{pengguna_user}_after_koreksi_sistem_clean.csv`.

This process ensures the dataset is standardized and cleaned for accuracy and consistency.

In [None]:
import ast

def count_person_2_face(person_list_str):
    """Count entries that refer to person 40 in a stringified list of names.

    Both the face-level form (``person_40_face_<n>``, used before cleaning)
    and the person-level form (``person_40``, used after cleaning) are
    counted; without the latter the "after" count would always be 0 because
    cleaning strips the ``_face_<n>`` suffix. The function name is kept from
    the earlier person_2 check for compatibility with this cell.
    """
    person_list = ast.literal_eval(person_list_str)
    # The trailing "$" stops "person_40" from matching e.g. "person_401".
    pattern = re.compile(r'person_40(?:_face_\d+)?$')
    return sum(bool(pattern.match(person)) for person in person_list)

# Compare how often the counted person appears before cleaning
# (df_correct_after) and after cleaning (df_cleaned).
total_count_before = df_correct_after['person'].apply(count_person_2_face).sum()
total_count_after = df_cleaned['person'].apply(count_person_2_face).sum()

print(f'Before: {total_count_before}')
print(f'After: {total_count_after}')

This code calculates and compares the number of occurrences of `person_40_face_x` in the dataset before and after data cleaning.

1. **Define Counting Function**: `count_person_2_face` parses the `person` column to count instances of `person_40_face_x` where `x` is a number.
2. **Calculate Totals**: It computes the total counts of `person_40_face_x` entries in the dataset before (`df_correct_after`) and after (`df_cleaned`) cleaning.
3. **Display Results**: The results are printed to show how many such instances were present before and after cleaning.

**Example Output**:
```
Before: 25
After: 10
```

This output shows that there were 25 occurrences of `person_40_face_x` before cleaning and 10 after, indicating the data has been successfully cleaned of unwanted entries.

#### ASSOCIATION RULES

In [16]:
def safe_eval(val):
    """Parse a stringified Python literal without executing arbitrary code.

    Non-string values are returned unchanged. Besides plain literals (lists,
    sets, numbers, ...) the ``frozenset({...})`` / ``set({...})`` reprs that
    pandas writes for rule antecedents and consequents are supported.

    Raises:
        ValueError or SyntaxError: If the text is not a supported literal.
    """
    if not isinstance(val, str):
        return val

    text = val.strip()
    # SECURITY: the previous implementation called eval() on text read from
    # CSV files, which executes any expression stored there. Only literals
    # and the two set wrappers below are accepted now.
    for wrapper in (frozenset, set):
        prefix = f'{wrapper.__name__}('
        if text.startswith(prefix) and text.endswith(')'):
            inner = text[len(prefix):-1].strip()
            return wrapper(literal_eval(inner)) if inner else wrapper()
    return literal_eval(text)

In [8]:
import inspect

def get_variable_names(df_list):
    """Return one variable name per object in *df_list*, in the same order.

    The caller's local variables are searched first, then those of each outer
    frame, for names bound to the very same objects. Comparison is by identity
    because ``==`` / ``in`` on DataFrames is element-wise and raises when used
    as a truth value. If several names in one frame refer to the same object,
    the one listed last in that frame's ``f_locals`` wins. Objects that
    cannot be resolved get the placeholder ``df_<position>``.
    """
    names = [None] * len(df_list)
    frame = inspect.currentframe().f_back
    try:
        while frame is not None and any(name is None for name in names):
            frame_matches = {}
            for var_name, value in list(frame.f_locals.items()):
                for position, candidate in enumerate(df_list):
                    if value is candidate:
                        frame_matches[position] = var_name
            # Nearer frames take precedence: only fill positions still open.
            for position, var_name in frame_matches.items():
                if names[position] is None:
                    names[position] = var_name
            frame = frame.f_back
    finally:
        # Drop the frame reference to avoid keeping a reference cycle alive.
        del frame

    return [
        name if name is not None else f'df_{position}'
        for position, name in enumerate(names)
    ]


In [17]:
_RULE_COLUMNS = [
    'antecedents', 'consequents', 'antecedent support',
    'consequent support', 'support', 'confidence', 'lift',
]


def _mine_association_rules(df, min_support, min_threshold):
    """Run Apriori on the ``person`` column of *df* and return its association rules."""
    transactions = df['person'].apply(safe_eval).tolist()
    encoder = TransactionEncoder()
    encoded = encoder.fit(transactions).transform(transactions)
    onehot = pd.DataFrame(encoded, columns=encoder.columns_)
    frequent_itemsets = apriori(onehot, min_support=min_support, use_colnames=True)
    if frequent_itemsets.empty:
        # Nothing is frequent enough, so there are no rules to derive.
        return pd.DataFrame(columns=_RULE_COLUMNS)
    return association_rules(frequent_itemsets, metric="confidence", min_threshold=min_threshold)


def find_common_rules(df_list, min_support=0.03, min_threshold=0.9):
    """Mine association rules from one dataset, or rules shared by two datasets.

    With a single DataFrame its rules are returned directly. With several,
    rules are mined per DataFrame and the first pair (in list order) that
    shares at least one ``antecedents``/``consequents`` combination decides
    the result; the metrics are taken from the first frame of that pair.

    Args:
        df_list: DataFrames with a ``person`` column (lists or their string
            form) and a ``photo_id`` column.
        min_support: Minimum support passed to Apriori.
        min_threshold: Minimum confidence of a rule.

    Returns:
        DataFrame with the rule columns plus ``sources`` and ``photo_id``, or
        ``None`` when *df_list* is empty or no pair shares a rule.
    """
    if not df_list:
        return None

    # Name resolution is cosmetic, so it must not break rule mining: fall back
    # to positional names if the helper fails or returns a mismatching count.
    try:
        data_names = get_variable_names(df_list)
    except (ValueError, TypeError):
        data_names = []
    if len(data_names) != len(df_list):
        data_names = [f'df_{i}' for i in range(len(df_list))]

    if len(df_list) == 1:
        rules = _mine_association_rules(df_list[0], min_support, min_threshold)
        result = rules[_RULE_COLUMNS].copy()
        result['sources'] = data_names[0]
        result['photo_id'] = df_list[0]['photo_id'].max()
        return result

    rules_per_df = [
        _mine_association_rules(df, min_support, min_threshold) for df in df_list
    ]

    for i, j in combinations(range(len(rules_per_df)), 2):
        common_rules = pd.merge(
            rules_per_df[i], rules_per_df[j],
            on=['antecedents', 'consequents'], suffixes=('_x', '_y'),
        )
        if common_rules.empty:
            continue

        result = pd.DataFrame({
            'antecedents': common_rules['antecedents'],
            'consequents': common_rules['consequents'],
            'antecedent support': common_rules['antecedent support_x'],
            'consequent support': common_rules['consequent support_x'],
            'support': common_rules['support_x'],
            'confidence': common_rules['confidence_x'],
            'lift': common_rules['lift_x'],
        })
        result['sources'] = data_names[i] + ', ' + data_names[j]
        # Same type as in the single-dataset branch (no string conversion).
        result['photo_id'] = df_list[j]['photo_id'].max()
        return result

    return None


In [11]:
def load_existing_rules(rules_path):
    """Read the rules CSV at *rules_path*; return an empty DataFrame if it is missing."""
    if not os.path.exists(rules_path):
        return pd.DataFrame()
    return pd.read_csv(rules_path)

def save_new_rules(rules_df, rules_path):
    """Write *rules_df* to *rules_path*, creating the parent directory if needed.

    When *rules_df* is ``None`` or empty, a header-only CSV with the standard
    rule columns is written so the file can be read back later.
    """
    columns = [
        'antecedents', 'consequents', 'antecedent support',
        'consequent support', 'support', 'confidence', 'lift',
        'sources', 'photo_id'
    ]

    if rules_df is None or rules_df.empty:
        rules_save = pd.DataFrame(columns=columns)
    else:
        rules_save = rules_df

    # to_csv does not create missing directories; a bare file name has no
    # parent to create.
    parent_dir = os.path.dirname(rules_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    rules_save.to_csv(rules_path, index=False)

def check_rules_changes(rules_fix, existing_rules_df):
    """Report how freshly mined rules relate to the stored ones.

    Returns ``False`` only when *rules_fix* is ``None``; otherwise ``True``,
    whether or not the antecedent/consequent pairs equal the stored ones.
    A message describing the outcome is printed in every case.
    """
    if rules_fix is None:
        print("Ada foto tambahan, tetapi rules baru sama dengan rules lama.")
        return False

    rule_keys = ['antecedents', 'consequents']
    unchanged = rules_fix[rule_keys].equals(existing_rules_df[rule_keys])
    if unchanged:
        print("Ada foto tambahan, tetapi rules baru sama dengan rules lama.")
    else:
        print(f"Rules baru ditemukan dan disimpan. Menggunakan dataset dari {rules_fix['sources'].iloc[0]}.")
    return True

def check_and_update_rules(df, rules_path, min_support, min_threshold):
    """Re-mine association rules when the dataset has grown, else reuse stored rules.

    The highest ``photo_id`` in *df* is compared with the ``photo_id`` stored
    alongside the existing rules (which is the highest photo id at the time
    those rules were mined). If the relative growth exceeds the threshold,
    new rules are mined and saved to *rules_path*.

    Args:
        df: Cleaned photo dataset with ``photo_id`` and ``person`` columns.
        rules_path: CSV file holding the stored rules.
        min_support: Minimum support for Apriori.
        min_threshold: Minimum rule confidence.

    Returns:
        Tuple ``(rules_df, df)`` where ``rules_df`` holds the rules to use.
    """
    modified_df_clean = df

    existing_rules_df = load_existing_rules(rules_path)

    # Compare like with like: the stored value is a maximum photo id, so use
    # the current maximum too. A count of distinct photos is smaller than the
    # max id once face-less photos have been dropped, which hid real growth.
    current_photo_count = modified_df_clean['photo_id'].max()

    if existing_rules_df.empty:
        previous_photo_count = 0
    else:
        previous_photo_count = existing_rules_df['photo_id'].max()

    photo_count_increase = current_photo_count - previous_photo_count

    percentage_increase = (photo_count_increase / previous_photo_count) * 100 if previous_photo_count > 0 else 100

    # NOTE(review): percentage_increase is expressed in percent, so this
    # threshold means 0.3 %; confirm that 30 % was not intended.
    if not percentage_increase > 0.3:
        print("Menggunakan rules lama. Tidak ada perubahan signifikan dalam jumlah foto.")
        return existing_rules_df, modified_df_clean

    data_1_4, _ = train_test_split(modified_df_clean, train_size=0.25, random_state=42)
    data_3_4, _ = train_test_split(modified_df_clean, train_size=0.75, random_state=42)
    data_1_2, _ = train_test_split(modified_df_clean, train_size=0.50, random_state=42)
    data_full = modified_df_clean

    # Choose how much data is used to create rules: fewer subsets reduce the
    # computational workload but yield less rule variation; more subsets
    # increase the workload and the variation. Keep exactly one of the two
    # ``df_list`` assignments below active and comment out the other.

    # df_list = [data_1_4, data_3_4, data_1_2, data_full]
    df_list = [data_full]

    rules_fix = find_common_rules(df_list, min_support=min_support, min_threshold=min_threshold)
    no_new_rules = rules_fix is None or rules_fix.empty

    if rules_fix is not None:
        rules_fix.to_csv('./rules_temp.csv', index=False)

    if existing_rules_df.empty:
        save_new_rules(rules_fix, rules_path)
        if no_new_rules:
            # Return the header-only frame just written so callers always
            # receive a DataFrame with the standard rule columns.
            return load_existing_rules(rules_path), modified_df_clean
        print("Rules baru telah dibuat")
        return rules_fix, modified_df_clean

    if no_new_rules:
        # Mining produced nothing usable: keep the stored rules and only
        # refresh the in-memory photo marker.
        check_rules_changes(None, existing_rules_df)
        existing_rules_df["photo_id"] = current_photo_count
        return existing_rules_df, modified_df_clean

    # Round-trip through CSV so set-valued columns have the same string form
    # as the stored rules before they are compared.
    rules_fix = pd.read_csv('./rules_temp.csv')
    if check_rules_changes(rules_fix, existing_rules_df):
        save_new_rules(rules_fix, rules_path)
        return rules_fix, modified_df_clean

    existing_rules_df["photo_id"] = rules_fix["photo_id"].iloc[0]
    return existing_rules_df, modified_df_clean


This script is designed to manage and update rules based on data changes. It starts by loading existing rules and checking for new data. If the number of photos increases significantly, it splits the dataset into various parts and generates new rules using the Apriori algorithm. The function `find_common_rules` creates these rules and compares them with the existing ones to determine if they have changed. The updated rules are then saved, and appropriate messages are printed to indicate whether new rules were added or if the old rules remain valid. This process ensures that the rules stay current with the data while managing computational workload efficiently.

In [None]:
# Reload the cleaned dataset from disk as input for rule mining.
df_cleaned = pd.read_csv(f'./face_dataset_{pengguna_user}_after_koreksi_sistem_clean.csv')

In [None]:
# Example usage
# Example usage
support_rules = 0.03
confidence_rules = 0.7
path_rules = f'./rules_file/{pengguna_user}/filtered_rules_{pengguna_user}_{support_rules}_{confidence_rules}_test.csv'
rules_df, data_photo = check_and_update_rules(df_cleaned, rules_path=path_rules, min_support=support_rules, min_threshold=confidence_rules)
filtered_rules = rules_df
# Reusing double quotes inside a double-quoted f-string is a SyntaxError
# before Python 3.12, so the lookup is done outside the f-string. An empty
# rules frame has no first row to read the source from.
rules_source = rules_df['sources'].iloc[0] if not rules_df.empty else '-'
print(f"Rules yang dipilih berasal dari {rules_source}  membuat sebanyak {len(filtered_rules)} rules")
filtered_rules

To use this script:

1. **Load Cleaned Data**: Read the cleaned face dataset from the specified CSV file.
2. **Set Parameters**: Define `support_rules` and `confidence_rules` for filtering rules, and specify the `path_rules` for saving the updated rules.
3. **Update Rules**: Call `check_and_update_rules()` to check for significant changes in the dataset, generate or update rules, and save them to the specified path.
4. **View Results**: Print the source of the rules and the number of rules created.

Example: This script will update rules based on the provided dataset and parameters, saving the results and displaying how many new rules were generated.

### CLASSIFICATION OF INDIVIDUAL FACIAL CORRELATION BETWEEN PHOTOS

In [26]:
def normalize_rule(antecedents, consequents):
    """Return the rule as a hashable ``(frozenset, frozenset)`` pair."""
    left, right = (frozenset(side) for side in (antecedents, consequents))
    return left, right

def group_correlated_rules(rules_df):
    """Merge overlapping rules and drop rules whose inverse was already kept.

    Walking the rows in order, each not-yet-merged rule absorbs every later
    unmerged rule that shares at least one antecedent AND at least one
    consequent with the group built so far. Afterwards a grouped rule is
    discarded when its inverse (sides swapped) has already been kept.

    Returns:
        List of ``(frozenset antecedents, frozenset consequents)`` tuples.
    """
    merged_rules = []
    consumed = set()

    for outer_idx, outer in rules_df.iterrows():
        if outer_idx in consumed:
            continue

        lhs = set(outer['antecedents'])
        rhs = set(outer['consequents'])

        for inner_idx, inner in rules_df.iterrows():
            if inner_idx == outer_idx or inner_idx in consumed:
                continue
            # Both sides must overlap with the group for the rule to join it.
            if lhs.isdisjoint(inner['antecedents']) or rhs.isdisjoint(inner['consequents']):
                continue
            lhs.update(inner['antecedents'])
            rhs.update(inner['consequents'])
            consumed.add(inner_idx)

        merged_rules.append(normalize_rule(lhs, rhs))
        consumed.add(outer_idx)

    kept_rules = []
    kept_lookup = set()

    for rule in merged_rules:
        rule_lhs, rule_rhs = rule
        if normalize_rule(rule_rhs, rule_lhs) in kept_lookup:
            continue
        kept_rules.append(rule)
        kept_lookup.add(rule)

    return kept_rules


In [29]:
def apply_rules(row, rules=None):
    """Return the indices of all grouped rules that a photo satisfies.

    A rule matches when the photo contains at least one of its antecedents
    and at least one of its consequents, and the two are different persons.

    Args:
        row: Mapping/Series whose ``person`` entry is an iterable of person names.
        rules: Sequence of ``(antecedents, consequents)`` pairs. Defaults to
            the module-level ``grouped_rules``, which keeps the function
            usable as ``df.apply(apply_rules, axis=1)``.

    Returns:
        Sorted list of matching rule indices ("albums").
    """
    if rules is None:
        rules = grouped_rules

    persons = set(row['person'])
    albums = set()
    for idx, (antecedents, consequents) in enumerate(rules):
        present_antecedents = persons & set(antecedents)
        present_consequents = persons & set(consequents)
        if any(a != c for a in present_antecedents for c in present_consequents):
            albums.add(idx)

    return sorted(albums)

The provided code performs the following tasks:

1. **Normalize Rules**: `normalize_rule` converts rules into a canonical form by using `frozenset` for both antecedents and consequents.
2. **Group Correlated Rules**: `group_correlated_rules` groups similar rules by merging antecedents and consequents of correlated rules and removes duplicates and inverse rules.
3. **Apply Rules**: `apply_rules` checks each row of the dataset to see if it matches any grouped rules, assigning relevant albums based on rule correlations.

In summary, the script normalizes and groups correlated rules, removes redundant and inverse rules, and applies these rules to classify data entries, thereby facilitating more effective rule-based analysis.

In [None]:
data_photo = pd.read_csv(f'./face_dataset_{pengguna_user}_after_koreksi_sistem_clean.csv')
# These were two statements fused onto one line, which referenced the
# undefined name ``data_photodf_photos``.
df_photos = data_photo
df_photos['person'] = df_photos['person'].apply(safe_eval)
# Rule sides loaded from CSV are strings; turn them back into collections.
filtered_rules['antecedents'] = filtered_rules['antecedents'].apply(safe_eval)
filtered_rules['consequents'] = filtered_rules['consequents'].apply(safe_eval)
grouped_rules = group_correlated_rules(filtered_rules)
grup_rule = pd.DataFrame(grouped_rules, columns=['IF', 'THEN'])
grup_rule

In [30]:
# Apply the function to create albums
df_photos['albums'] = df_photos.apply(apply_rules, axis=1)
df_photos.albums

In [None]:
df_photos

Here’s a concise explanation for using the provided code:

1. **Load and Prepare Data**: Read the cleaned photo dataset and apply `safe_eval` to process the 'person' column, and similarly process the antecedents and consequents of the filtered rules.

2. **Group Correlated Rules**: Use `group_correlated_rules` to normalize and group the rules, removing duplicates and inverse rules.

3. **Create Albums**: Apply the grouped rules to classify the photos into albums using the `apply_rules` function, which assigns relevant album indices based on rule matches.

In summary, this script loads and processes photo data, groups correlated rules, and classifies photos into albums according to these rules.

### GROUPING PHOTOS BASED ON CLASSIFICATION

In [34]:
def calculate_similarity(list1, list2):
    """Return the overlap ratio of two lists, between 0.0 and 1.0.

    The ratio is the number of elements the lists share (counted with
    multiplicity, i.e. a multiset intersection) divided by the length of the
    longer list.

    Args:
        list1: First list of hashable items.
        list2: Second list of hashable items.

    Returns:
        The overlap ratio as a float. Two empty lists have nothing in common,
        so the result is 0.0 instead of a ZeroDivisionError.
    """
    total_elements = max(len(list1), len(list2))
    if total_elements == 0:
        # Both lists are empty: avoid dividing by zero.
        return 0.0

    # Counter & Counter keeps, per element, the smaller of the two counts.
    common_elements = sum((Counter(list1) & Counter(list2)).values())
    return common_elements / total_elements

def cleanse_album_df(album_df, threshold=0.3):
    """Merge albums whose 'album' lists overlap and drop the absorbed rows.

    Every pair of rows is scored with ``calculate_similarity`` on the 'album'
    column. When the score is strictly greater than ``threshold``, the row
    holding fewer photo ids is folded into the other one (on a tie the second
    row of the pair is kept) and marked for removal.

    Args:
        album_df: DataFrame with list-valued 'album' and 'photo_id' columns.
            It is left untouched; merging is done on a copy.
        threshold: Exclusive lower bound on the 'album' similarity that
            triggers a merge. The default 0.3 was found by trial and error
            and can be tuned.

    Returns:
        A new DataFrame with the merged rows, the absorbed rows removed and
        the index reset.
    """
    merged_df = album_df.copy()
    rows_to_drop = set()

    for idx1, idx2 in combinations(merged_df.index, 2):
        # A row that was already absorbed is going to be deleted: merging
        # anything into it would silently lose those photos.
        if idx1 in rows_to_drop or idx2 in rows_to_drop:
            continue

        # Read the current cell values (not a snapshot taken before the loop)
        # so that earlier merges are taken into account.
        album_similarity = calculate_similarity(
            merged_df.at[idx1, 'album'], merged_df.at[idx2, 'album']
        )
        if album_similarity <= threshold:
            continue

        if len(merged_df.at[idx1, 'photo_id']) > len(merged_df.at[idx2, 'photo_id']):
            larger_idx, smaller_idx = idx1, idx2
        else:
            larger_idx, smaller_idx = idx2, idx1

        for column in ('photo_id', 'album'):
            combined = [
                *merged_df.at[larger_idx, column],
                *merged_df.at[smaller_idx, column],
            ]
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # which makes the merged lists deterministic.
            merged_df.at[larger_idx, column] = list(dict.fromkeys(combined))

        rows_to_drop.add(smaller_idx)

    return merged_df.drop(index=list(rows_to_drop)).reset_index(drop=True)


The provided code defines two functions: `calculate_similarity` and `cleanse_album_df`. The `calculate_similarity` function measures the similarity between two lists by counting their common elements (with multiplicity) and dividing by the length of the longer list. The `cleanse_album_df` function uses this similarity measure to clean an album DataFrame. It iterates through pairs of rows, and if the similarity of their 'album' values exceeds 0.3, it merges the two albums; only the 'album' similarity drives this decision, not the 'photo_id' overlap. The merge consolidates photo IDs and album entries into the larger album (the one with more photo IDs) and marks the smaller album for deletion. The final output is a cleaned DataFrame with redundant albums removed and their contents consolidated.

In [None]:
# Map each album id to the persons and photo ids of the photos assigned to it.
album_dict = {}
for _, row in df_photos.iterrows():
    for album_id in row['albums']:
        # setdefault creates the empty entry the first time an album id is seen.
        entry = album_dict.setdefault(album_id, {'persons': set(), 'photo_ids': []})
        entry['photo_ids'].append(row['photo_id'])
        entry['persons'].update(row['person'])


In [35]:
# Column-oriented container for the album table; every column starts with its
# own empty list.
data_album = {column: [] for column in ('album_id', 'album', 'photo_id')}

In [36]:
# Append one entry per album to each column, in the iteration order of album_dict.
data_album['album_id'].extend(album_dict)
data_album['album'].extend(list(info['persons']) for info in album_dict.values())
data_album['photo_id'].extend(info['photo_ids'] for info in album_dict.values())


In [None]:
# Turn the collected columns into a table, then merge overlapping albums.
album_df = pd.DataFrame.from_dict(data_album)
cleansed_album_df = cleanse_album_df(album_df)
cleansed_album_df

The provided code performs the following steps:

1. **Create Album Dictionary**: It iterates over each row in `df_photos` to build a dictionary (`album_dict`). Each key is an album ID, and the value contains a set of persons and a list of photo IDs associated with that album.

2. **Prepare Data for DataFrame**: It prepares data for a new DataFrame (`album_df`) by extracting album IDs, associated persons, and photo IDs from the dictionary.

3. **Create and Cleanse DataFrame**: It creates `album_df` from the prepared data and then applies the `cleanse_album_df` function to remove redundant albums and consolidate photo IDs.

The final output is `cleansed_album_df`, which contains the cleaned and consolidated album data.

### ALBUM FORMATION

In [39]:
def create_album(album_id, photos, df, pengguna_user):
    """Copy the photos of one album into its own folder of the result gallery.

    Args:
        album_id: Identifier of the album; used as the folder name.
        photos: Iterable of photo ids that belong to the album.
        df: DataFrame with 'photo_id' and 'photo_name' columns.
        pengguna_user: User name; source photos are read from
            ``./<pengguna_user>/`` and albums are written below
            ``./result gallery <pengguna_user>/``.

    Returns:
        The path of the album folder.

    Raises:
        OSError: Propagated from ``shutil.copy`` when a source file cannot
            be copied (for example because it does not exist).
    """
    hasil_path = f'./result gallery {pengguna_user}'
    album_path = f'{hasil_path}/{album_id}'
    os.makedirs(album_path, exist_ok=True)

    # Build the id -> file name lookup once instead of scanning the whole
    # DataFrame for every photo. setdefault keeps the first row of an id.
    name_by_id = {}
    for known_id, known_name in zip(df['photo_id'], df['photo_name']):
        name_by_id.setdefault(known_id, known_name)

    for photo_id in photos:
        if photo_id not in name_by_id:
            # Ids without a matching row are skipped.
            continue
        photo_name = name_by_id[photo_id]
        photo_path = f'./{pengguna_user}/{photo_name}'
        shutil.copy(photo_path, f'{album_path}/{photo_name}')

    return album_path

def process_photos(df, pengguna_user, album_df=None):
    """Copy every photo that belongs to no album into the result gallery root.

    Args:
        df: DataFrame with 'photo_id' and 'photo_name' columns.
        pengguna_user: User name; source photos are read from
            ``./<pengguna_user>/`` and copied to
            ``./result gallery <pengguna_user>/``.
        album_df: DataFrame whose 'photo_id' column holds, per album, the
            list of photo ids in that album. Defaults to the notebook-level
            ``cleansed_album_df`` so existing two-argument calls keep working.

    Raises:
        OSError: Propagated from ``shutil.copy`` when a source file cannot
            be copied.
    """
    if album_df is None:
        # Backward-compatible fallback to the notebook global.
        album_df = cleansed_album_df

    hasil_path = f'./result gallery {pengguna_user}'
    os.makedirs(hasil_path, exist_ok=True)

    # Ids that must not be copied: start with everything already in an album,
    # then add each id as it is handled so only its first row is used.
    handled_ids = {photo_id for ids in album_df['photo_id'] for photo_id in ids}

    for photo_id, photo_name in zip(df['photo_id'], df['photo_name']):
        if photo_id in handled_ids:
            continue
        handled_ids.add(photo_id)
        photo_path = f'./{pengguna_user}/{photo_name}'
        shutil.copy(photo_path, f'{hasil_path}/{photo_name}')


The code provides functions to organize and copy photos into albums. The `create_album` function creates a directory for each album and copies photos into it based on their IDs from a DataFrame. The `process_photos` function handles photos not included in any album by copying them directly into the root of the user's result gallery directory (`./result gallery <user>`), next to the album folders. Both functions use paths constructed with the user's identifier and ensure that the necessary directories exist before copying files.

In [None]:
import time

# Paths of the album folders, in creation order.
albums = []

start_time = time.time()

# Photos that belong to no album are copied first.
process_photos(df_photos, pengguna_user)

# Then one folder per remaining album.
for _, album_row in cleansed_album_df.iterrows():
    albums.append(
        create_album(album_row['album_id'], album_row['photo_id'], df_photos, pengguna_user)
    )

end_time = time.time()

print("Album successfully created in:", albums)
print(f"Computation Time: {end_time - start_time:.2f} second")


The code snippet processes photos by first executing `process_photos` to handle images not included in any album. Then, it iterates through the `cleansed_album_df` DataFrame to create individual albums using `create_album`, which organizes photos into respective directories. The paths of these created albums are collected in the `albums` list. Finally, the script prints the locations of the created albums and the total computation time.