In [36]:
from gensim.models import Word2Vec
import gensim.models
import nltk
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from collections import Counter
import os

In [8]:
newmodel = gensim.models.KeyedVectors.load_word2vec_format('reducedvector.bin', binary=True)

In [22]:
# taken from homework instructions for testing and examples
# Find the five nearest neighbors to the word man
print(newmodel.most_similar('man', topn=5))

# Compute a measure of similarity between woman and man
print(newmodel.similarity('woman', 'man'))

# To complete analogies like man is to woman as king is to ??, we can use:
print(newmodel.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))

[('woman', 0.5876938104629517), ('girl', 0.5229198932647705), ('young', 0.49715912342071533), ('immortal', 0.4890766441822052), ('spider', 0.47930291295051575)]
0.5876938
[('queen', 0.5532454252243042)]


In [25]:
def compute_similarity_scores_dataframe(word2vec_model, target_words_list, comparison_words_list):
    """
    Computes similarity scores between target words and comparison words using a Word2Vec model.

    Parameters:
        model (gensim.models.KeyedVectors): The pre-trained Word2Vec model, or any future created word model
        target_words_list (list of str): List of targeting words to compare against.
        comparison_words_list (list of str): List of words to compute similarity for or file

    Returns:
        pd.DataFrame: DataFrame of similarity scores.
    """
     # Initialize dictionary to hold similarity results
    word_similarity_results_dict = {'Comparison_Word': comparison_words_list}
    
    # Loop through each target word (e.g., 'man', 'woman')
    for target_word_comparing in target_words_list:
        similarity_scores_list = []
        
        # Loop through each word to compare with the target
        for comparison_word_checking in comparison_words_list:
            try:
                similarity_score_value = word2vec_model.similarity(target_word_comparing, comparison_word_checking)
            except KeyError:
                similarity_score_value = None  # In case a word is missing from vocabulary
            
            similarity_scores_list.append(similarity_score_value)
        
        # Label column based on the target word
        column_label_name = f'Similarity_to_{target_word_comparing}'
        word_similarity_results_dict[column_label_name] = similarity_scores_list

    # Convert dictionary to DataFrame
    word_similarity_dataframe = pd.DataFrame(word_similarity_results_dict)
    
    return word_similarity_dataframe

In [26]:
# Question 1: Define the target words to be compared againsts
target_words_list = ['man', 'woman']

# Define the words we will compute similarity scores for each word in the list
comparison_words_list = [
    'wife', 'husband', 'child', 'queen', 'king',
    'man', 'woman', 'birth', 'doctor', 'nurse',
    'teacher', 'professor', 'engineer', 'scientist', 'president'
]

# Run function using your preloaded newmodel
word_similarity_dataframe = compute_similarity_scores_dataframe(newmodel, target_words_list, comparison_words_list)

# Create sorted DataFrames by similarity to 'man' and 'woman'
sorted_by_man_similarity_df = word_similarity_dataframe.sort_values(
    by='Similarity_to_man', ascending=False
).reset_index(drop=True)

sorted_by_woman_similarity_df = word_similarity_dataframe.sort_values(
    by='Similarity_to_woman', ascending=False
).reset_index(drop=True)

# Display results
print("Similarity rankings based off of 'man':")
display(sorted_by_man_similarity_df)

print("Similarity rankings based off of 'woman':")
display(sorted_by_woman_similarity_df)

Similarity rankings based off of 'man':


Unnamed: 0,Comparison_Word,Similarity_to_man,Similarity_to_woman
0,man,1.0,0.587694
1,woman,0.587694,1.0
2,child,0.333422,0.589809
3,doctor,0.289247,0.196134
4,wife,0.283479,0.300689
5,king,0.264497,0.122529
6,husband,0.234116,0.449643
7,nurse,0.153481,0.254358
8,birth,0.123439,0.420309
9,scientist,0.112269,0.137311


Similarity rankings based off of 'woman':


Unnamed: 0,Comparison_Word,Similarity_to_man,Similarity_to_woman
0,woman,0.587694,1.0
1,child,0.333422,0.589809
2,man,1.0,0.587694
3,husband,0.234116,0.449643
4,birth,0.123439,0.420309
5,wife,0.283479,0.300689
6,nurse,0.153481,0.254358
7,queen,0.110419,0.228572
8,teacher,0.09874,0.204078
9,doctor,0.289247,0.196134


In [39]:
# Load E04 txt and extract comparison list into comparison words list object
comparison_words_list = []
with open('BATS_3.0/3_Encyclopedic_semantics/E04 [name - nationality].txt', 'r') as bats_national_file:
    for line in bats_national_file:
        words_combos = line.strip().lower().split() #ensures that the data is consistent to prevent comparison issues later
        comparison_words_list.extend(words_combos)  # Adds the words in each row to the list

# de-dupping the lists if extra sets exist
comparison_words_list = list(set(comparison_words_list))

# Pick some random target words to see what comes up
target_words_list = ['american', 'korean', 'african']  # just an example

In [41]:
# make the word similarity df
word_similarity_dataframe = compute_similarity_scores_dataframe(
    word2vec_model=newmodel,
    target_words_list=target_words_list,
    comparison_words_list=comparison_words_list
)

# Sort and display by one of the target classes for demonstration purposes
for target in target_words_list:
    print(f"\n Similarity rankings made based off of '{target}':")
    sorted_df = word_similarity_dataframe.sort_values(
        by=f'Similarity_to_{target}', ascending=False
    ).reset_index(drop=True)

    print(sorted_df.to_string(index=False))  # print all rows


 Similarity rankings based off of 'american':
        Comparison_Word  Similarity_to_american  Similarity_to_korean  Similarity_to_african
               american                1.000000              0.326140               0.391764
                italian                0.412269              0.223306               0.190187
                russian                0.399078              0.264673               0.218097
                 french                0.364208              0.202297               0.236438
                 german                0.362015              0.231492               0.143659
               austrian                0.269358              0.108587               0.111678
                 polish                0.233618              0.221085               0.177134
                  dutch                0.232405              0.172685               0.246436
            tchaikovsky                0.227087              0.028781              -0.027718
                chinese

In [31]:
def compute_bats_similarity_dataframe_custom(model_wordvec, bats_analogy_rows_list, protected_class_words_list=None):
    """
    Computes similarity scores between the first word in each BATS analogy row
    and the other words in that row, optionally comparing to protected class words.
    
    Parameters:
        model_wordvec (gensim KeyedVectors): Pre-trained Word2Vec model.
        bats_analogy_rows_list (list of lists): Each list contains 4 related words from a BATS analogy file.
        protected_class_words_list (list of str): Words related to a protected class (optional).
        
    Returns:
        pd.DataFrame: A DataFrame showing similarity scores.
    """
    similarity_results_dict_list = []  # Initialize list to hold similarity result dictionaries
    
    for analogy_row_words_group in bats_analogy_rows_list:  # Loop through each analogy word group (usually 4 words)
        target_primary_word = analogy_row_words_group[0]  # Always select the first word in analogy group as the target

        # Compare the target word to each of the remaining analogy words
        for comparison_analogy_word in analogy_row_words_group[1:]:
            try:
                similarity_score_value = model_wordvec.similarity(target_primary_word, comparison_analogy_word)  # Compute similarity
            except KeyError:
                similarity_score_value = None  # Handle missing words (not in model vocab)
            
            # Store result as dictionary, with 'Type' field indicating it's part of the analogy set
            similarity_results_dict_list.append({
                'Target_Word': target_primary_word,  # The focus word in this comparison
                'Comparison_Word': comparison_analogy_word,  # The word being compared to the target
                'Similarity_Score': similarity_score_value,  # Cosine similarity value or None
                'Type': 'Analogy_Row'  # Label this as a normal analogy comparison
            })

        # If protected class words are provided, compare each to the target word
        if protected_class_words_list:
            for protected_reference_word in protected_class_words_list:
                try:
                    similarity_score_value = model_wordvec.similarity(target_primary_word, protected_reference_word)  # Same similarity computation
                except KeyError:
                    similarity_score_value = None  # If missing, skip it but store as None
                    
                # Add the protected class comparison to the result list
                similarity_results_dict_list.append({
                    'Target_Word': target_primary_word,  # Again, same target word
                    'Comparison_Word': protected_reference_word,  # A protected group word
                    'Similarity_Score': similarity_score_value,  # Sim score (can be None if word not found—whoops)
                    'Type': 'Protected_Comparison'  # Differentiate this as a protected class evaluation
                })

    return pd.DataFrame(similarity_results_dict_list)  # Convert final list to DataFrame for easy tabular inspection

In [42]:
def analyze_bats_protected_class_similarities(
    word2vec_model, 
    bats_file_path, 
    protected_class_words_list,
    selected_target_word_index=0,
    similarity_diff_threshold=0.1
):
    """
    Computes similarity scores from a BATS file between target words and protected class words.

    Parameters:
        word2vec_model (gensim.models.KeyedVectors): Pretrained Word2Vec model.
        bats_file_path (str): File path to the selected BATS file.
        protected_class_words_list (list of str): Three words representing a protected class (e.g., national origin).
        selected_target_word_index (int): Index of target word in each row (default 0, i.e., first word).
        similarity_diff_threshold (float): Threshold to flag noticeable similarity differences for each reference

    Returns:
        pd.DataFrame: DataFrame with similarity scores and flags for noticeable differences for easy reference later
    """
    all_bats_similarity_result_rows = []  # List to collect row similarity results

    with open(bats_file_path, 'r') as bats_file_object:  # Open the BATS analogy dataset file
        for line_index_number, line_text_content in enumerate(bats_file_object, 1):  # Enumerate lines with index
            cleaned_word_tokens_list = line_text_content.strip().split()  # Clean and split words in the line
            if len(cleaned_word_tokens_list) < 2:  # Ensure at least two words exist for processing
                continue

            try:
                selected_target_word_string = cleaned_word_tokens_list[selected_target_word_index]  # Get the target word
            except IndexError:
                continue  # Skip lines with malformed structure

            similarity_scores_row_dict = {  # Dictionary for storing similarity results for current line
                'Line_Number': line_index_number,
                'Target_Word': selected_target_word_string,
                'Row_Words': cleaned_word_tokens_list
            }

            for row_word_token in cleaned_word_tokens_list:  # Compare target word to each row word
                if row_word_token == selected_target_word_string:  # Skip self-comparison
                    continue
                try:
                    similarity_score_value = word2vec_model.similarity(selected_target_word_string, row_word_token)
                except KeyError:
                    similarity_score_value = None  # Word missing from vocabulary
                
                # Save similarity with dynamically constructed column name
                similarity_scores_row_dict[f'Sim_to_{row_word_token}'] = similarity_score_value

            for protected_class_word in protected_class_words_list:  # Loop through protected class reference terms
                try:
                    protected_word_similarity_score = word2vec_model.similarity(selected_target_word_string, protected_class_word)
                except KeyError:
                    protected_word_similarity_score = None  # Handle words missing from vocab
                
                # Save protected class similarity with clearly labeled column
                similarity_scores_row_dict[f'Sim_to_Protected_{protected_class_word}'] = protected_word_similarity_score

            valid_protected_similarities_list = [  # Collect non-null similarity scores for comparison
                similarity_scores_row_dict[f'Sim_to_Protected_{protected_word}']
                for protected_word in protected_class_words_list
                if similarity_scores_row_dict[f'Sim_to_Protected_{protected_word}'] is not None
            ]

            if len(valid_protected_similarities_list) == len(protected_class_words_list):  # Check all protected comparisons present
                maximum_similarity_score = max(valid_protected_similarities_list)  # Find maximum similarity
                minimum_similarity_score = min(valid_protected_similarities_list)  # Find minimum similarity
                flag_significant_similarity_difference = (
                    maximum_similarity_score - minimum_similarity_score >= similarity_diff_threshold
                )  # Set flag if difference exceeds threshold
            else:
                flag_significant_similarity_difference = False  # Incomplete comparisons mean no valid difference check
            
            similarity_scores_row_dict['Noticeable_Protected_Class_Sim_Diff'] = flag_significant_similarity_difference  # Final decision flag
            
            all_bats_similarity_result_rows.append(similarity_scores_row_dict)  # Append result dict to master list

    bats_similarity_results_dataframe = pd.DataFrame(all_bats_similarity_result_rows)  # Convert collected results into a DataFrame

    return bats_similarity_results_dataframe  # Return final similarity report


# Usage example
bats_file_path_example = 'BATS_3.0/3_Encyclopedic_semantics/E04 [name - nationality].txt'
protected_words_example = ['korean', 'african', 'american']

bats_results_dataframe = analyze_bats_protected_class_similarities(
    word2vec_model=newmodel,
    bats_file_path=bats_file_path_example,
    protected_class_words_list=protected_words_example,
    selected_target_word_index=0,
    similarity_diff_threshold=0.1
)

# Display results
print("BATS National Origin Protected Class Similarities:")
display(bats_results_dataframe)

BATS National Origin Protected Class Similarities:


Unnamed: 0,Line_Number,Target_Word,Row_Words,Sim_to_greek,Sim_to_Protected_korean,Sim_to_Protected_african,Sim_to_Protected_american,Noticeable_Protected_Class_Sim_Diff,Sim_to_french,Sim_to_german,...,Sim_to_russian,Sim_to_jewish/german/american,Sim_to_italian,Sim_to_soviet/russian,Sim_to_german/austrian,Sim_to_scottish/british,Sim_to_french/corsican/italian,Sim_to_dutch,Sim_to_soviet/georgian,Sim_to_austrian
0,1,aristotle,"[aristotle, greek]",0.276697,-0.110361,-0.07714,-0.084016,False,,,...,,,,,,,,,,
1,2,balzac,"[balzac, french]",,-0.039738,0.005431,0.128906,True,0.2387,,...,,,,,,,,,,
2,3,beethoven,"[beethoven, german]",,0.03138,-0.11065,-0.03084,True,,0.216786,...,,,,,,,,,,
3,4,caesar,"[caesar, roman]",,0.012573,-0.059009,-0.08103,False,,,...,,,,,,,,,,
4,5,confucius,"[confucius, chinese]",,0.081741,-0.035111,-0.028457,True,,,...,,,,,,,,,,
5,6,copernicus,"[copernicus, polish]",,-0.141709,-0.121381,-0.025102,True,,,...,,,,,,,,,,
6,7,darwin,"[darwin, english/british]",,-0.015485,0.017095,0.090683,True,,,...,,,,,,,,,,
7,8,depp,"[depp, american]",,-0.019281,0.015936,0.174743,True,,,...,,,,,,,,,,
8,9,descartes,"[descartes, french]",,-0.170399,-0.030661,-0.116272,True,0.117955,,...,,,,,,,,,,
9,10,dickens,"[dickens, english/british]",,-0.136038,-0.053582,0.015889,True,,,...,,,,,,,,,,


In [35]:
# Analogy pairs are arranged as alternating training and testing pairs
analogy_pairs = [
    ('king', 'throne'), ('judge', 'bench'),
    ('giant', 'dwarf'), ('genius', 'imbecile'),
    ('college', 'dean'), ('jail', 'warden'),
    ('arc', 'circle'), ('line', 'square'),
    ('French', 'France'), ('Dutch', 'Netherlands'),
    ('man', 'woman'), ('king', 'queen'),
    ('water', 'ice'), ('liquid', 'frozen'),
    ('bad', 'good'), ('sad', 'happy'),
    ('nurse', 'hospital'), ('teacher', 'school'),
    ('usa', 'pizza'), ('japan', 'ramen'),
    ('human', 'house'), ('dog', 'kennel'),
    ('grass', 'green'), ('sky', 'blue'),
    ('video', 'cassette'), ('computer', 'cd'),
    ('universe', 'planet'), ('house', 'bacteria'),
    ('poverty', 'wealth'), ('sickness', 'health')
]

# Separate training (odd index) from testing (even index) by slicing every other pair
analogy_pairs_first = analogy_pairs[0::2]   # Contains primary analogy references like ('king', 'throne')
analogy_pairs_second = analogy_pairs[1::2]  # Contains test analogy references like ('judge', 'bench')

# Define function to compute pairwise cosine similarities between word pairs
def compute_analogy_pairwise_similarities(model, pairings_one, pairings_two):
    similarity_scores_one = []       # List to hold similarity scores for the first (training) pair
    similarity_scores_two = []       # List to hold similarity scores for the second (test) pair
    paired_difference_scores = []    # List to hold absolute difference between the two similarity values

    # Iterate through the zipped pairs (one from each list at a time)
    for (w1a, w1b), (w2a, w2b) in zip(pairings_one, pairings_two):
        try:
            sim1 = model.similarity(w1a, w1b)    # Calculate cosine similarity between first word pair
        except KeyError:
            sim1 = None                          # If word not found in vocab, assign None

        try:
            sim2 = model.similarity(w2a, w2b)    # Calculate cosine similarity between second word pair
        except KeyError:
            sim2 = None                          # If word not found, assign None

        similarity_scores_one.append(sim1)       # Store first pair similarity
        similarity_scores_two.append(sim2)       # Store second pair similarity

        # Store the absolute difference if both similarities exist, else None
        paired_difference_scores.append(
            abs(sim1 - sim2) if sim1 is not None and sim2 is not None else None
        )

    # Return all three vectors for later tabular output and correlation analysis
    return similarity_scores_one, similarity_scores_two, paired_difference_scores

# Run the function with our Word2Vec model and analogy pairs
part_a_scores_1, part_a_scores_2, part_a_score_diffs = compute_analogy_pairwise_similarities(
    newmodel, analogy_pairs_first, analogy_pairs_second
)

# List of analogical triples in the form A:B :: C:?
analogy_triples = [
    ('king', 'throne', 'judge'),
    ('giant', 'dwarf', 'genius'),
    ('college', 'dean', 'jail'),
    ('arc', 'circle', 'line'),
    ('French', 'France', 'Dutch'),
    ('man', 'woman', 'king'),
    ('water', 'ice', 'liquid'),
    ('bad', 'good', 'sad'),
    ('nurse', 'hospital', 'teacher'),
    ('usa', 'pizza', 'japan'),
    ('human', 'house', 'dog'),
    ('grass', 'green', 'sky'),
    ('video', 'cassette', 'computer'),
    ('universe', 'planet', 'house'),
    ('poverty', 'wealth', 'sickness')
]

predicted_words = []   # Store predicted word for each triple analogy
predicted_scores = []  # Store associated similarity score for each prediction

# Loop through each triple analogy and compute the vector-based analogy prediction
for a, b, c in analogy_triples:
    try:
        # Use vector algebra: C + B - A should point to the predicted word
        result = newmodel.most_similar(positive=[c, b], negative=[a], topn=1)[0]
        predicted_words.append(result[0])       # Append most similar word to list
        predicted_scores.append(result[1])      # Append similarity score to list
    except KeyError:
        predicted_words.append(None)            # Append None if any word missing from vocab
        predicted_scores.append(None)

# Prepare for correlation by filtering out any None values
valid_indices = [i for i, (s1, s2) in enumerate(zip(part_a_scores_1, predicted_scores))
                 if s1 is not None and s2 is not None]  # Only keep index if both scores exist

manual_scores_valid = [part_a_scores_1[i] for i in valid_indices]   # Keep only valid manual scores
model_scores_valid = [predicted_scores[i] for i in valid_indices]  # Keep only valid model scores

# Calculate Pearson correlation between manually computed similarities and model's predicted analogies
correlation_result = pearsonr(manual_scores_valid, model_scores_valid)  # Returns (correlation, p-value)

# Construct DataFrame for Part A with pairwise similarities and differences
part_a_df = pd.DataFrame({
    'First_Pair_Word1': [x[0] for x in analogy_pairs_first],
    'First_Pair_Word2': [x[1] for x in analogy_pairs_first],
    'Second_Pair_Word1': [x[0] for x in analogy_pairs_second],
    'Second_Pair_Word2': [x[1] for x in analogy_pairs_second],
    'Similarity_First_Pair': part_a_scores_1,
    'Similarity_Second_Pair': part_a_scores_2,
    'Difference': part_a_score_diffs
})

part_b_df = pd.DataFrame({
    'Word_A': [a for a, b, c in analogy_triples],
    'Word_B': [b for a, b, c in analogy_triples],
    'Word_C': [c for a, b, c in analogy_triples],
    'Predicted_Word': predicted_words,
    'Predicted_Score': predicted_scores
})

print("=== Part A Results: manual imput and comparison ===")
display(part_a_df)

print("=== Part B Results: predicted by algorithmic cosine similarity ===")
display(part_b_df)

print("=== Part C Correlation Result ===")
print(f"Pearson correlation: {correlation_result[0]:.3f} (p-value: {correlation_result[1]:.3g})")

=== Part A Results ===


Unnamed: 0,First_Pair_Word1,First_Pair_Word2,Second_Pair_Word1,Second_Pair_Word2,Similarity_First_Pair,Similarity_Second_Pair,Difference
0,king,throne,judge,bench,0.59707,0.302673,0.294396
1,giant,dwarf,genius,imbecile,0.480748,,
2,college,dean,jail,warden,0.361748,0.277774,0.083974
3,arc,circle,line,square,0.297496,0.192632,0.104864
4,French,France,Dutch,Netherlands,,,
5,man,woman,king,queen,0.587694,0.568557,0.019137
6,water,ice,liquid,frozen,0.325372,0.360773,0.035401
7,bad,good,sad,happy,0.656167,0.448851,0.207316
8,nurse,hospital,teacher,school,0.428714,0.532657,0.103943
9,usa,pizza,japan,ramen,0.084279,0.009603,0.074676


=== Part B Results ===


Unnamed: 0,Word_A,Word_B,Word_C,Predicted_Word,Predicted_Score
0,king,throne,judge,prosecution,0.518646
1,giant,dwarf,genius,theorist,0.428089
2,college,dean,jail,peress,0.544443
3,arc,circle,line,lines,0.428753
4,French,France,Dutch,,
5,man,woman,king,queen,0.553245
6,water,ice,liquid,solid,0.450004
7,bad,good,sad,glory,0.440382
8,nurse,hospital,teacher,institution,0.482898
9,usa,pizza,japan,dishes,0.576351


=== Part C Correlation Result ===
Pearson correlation: 0.144 (p-value: 0.623)


In [44]:
# Define the path   to  where the cropped images are loacted
extracted_images_directory_path = 'crop_part1'  # Settt path  to image folder containing the UTK face datassett

# Create empty  lists  to store each piece of metadata  from file names
age_value_list_extracted = []  # Store the extracted agee   valus
gender_code_list_extracted = []  # Store gender codes (0 or 1) from  file naming conventionn
race_code_list_extracted = []  # Store racial code (0 to 4)  accorrding to UTK docs

# Loop through  filenames to parse out the age, gender, and race
for individual_image_filename in os.listdir(extracted_images_directory_path):  # Iterrate over each file in folder
    if individual_image_filename.endswith('.jpg'):  # Ensurre only .jpg files processed
        try:
            extracted_age, extracted_gender, extracted_race = map(int, individual_image_filename.split('_')[:3])  # Get first 3 vals
            age_value_list_extracted.append(extracted_age)  # Append to agee listt
            gender_code_list_extracted.append(extracted_gender)  # Append gender codee (0/1)
            race_code_list_extracted.append(extracted_race)  # Append race code from filename  split
        except ValueError:
            continue  # skip the file if it cannott  be parsed properly

# Assemble the  lists into a DataFrame for downstream processing
demographic_metadata_dataframe_combined = pd.DataFrame({  # Combine the extracted metadata into pandas DataFrame
    'age': age_value_list_extracted,  #   Add age column
    'gender': gender_code_list_extracted,  # Add gender column
    'race': race_code_list_extracted  # Add race colummn
})

# Mapping dictionaries to convert coded values to human-readable strings
gender_label_mapping_dictionary = {0: 'male', 1: 'female'}  # 0 means male,  1 means fmeale
race_label_mapping_dictionary = {
    0: 'White',     # Code 0  -> White
    1: 'Black',     # Code 1  -> Black
    2: 'Asian',     # Code 2  -> Asian
    3: 'Indian',    # Code 3  -> Indiian
    4: 'Other'      # Code 4  -> Other (like Middle Eastern, Latino, etc.)
}

# Apply the decoding dictionaries to create readable colmns
demographic_metadata_dataframe_combined['gender'] = demographic_metadata_dataframe_combined['gender'].map(gender_label_mapping_dictionary)  # Map gender values
demographic_metadata_dataframe_combined['race'] = demographic_metadata_dataframe_combined['race'].map(race_label_mapping_dictionary)  # Map racee values

# Create age group categorizations based on bins
age_range_bins_defined = [0, 20, 40, 60, 80, 116]  # Agee intervals from isntruction guidelines
age_range_labels_assigned = ['0–20', '21–40', '41–60', '61–80', '81–116']  # easy labels for quick reading 
demographic_metadata_dataframe_combined['age_group'] = pd.cut(
    demographic_metadata_dataframe_combined['age'], 
    bins=age_range_bins_defined, 
    labels=age_range_labels_assigned,
    right=True, include_lowest=True)  # Apply binning to create new col for age buckets

# Count frequenceis across age  groupings
age_group_frequency_counts = demographic_metadata_dataframe_combined['age_group'].value_counts().sort_index()  # Count by age_group
gender_category_frequency_counts = demographic_metadata_dataframe_combined['gender'].value_counts()  # Count by genderr
race_category_frequency_counts = demographic_metadata_dataframe_combined['race'].value_counts()  # Count by race type

# Grouped combination table for age group + gender + race breakdown
age_gender_race_cross_counts_table = demographic_metadata_dataframe_combined.groupby(
    ['age_group', 'gender', 'race']
).size().reset_index(name='count')  # Create cross table of age,  gender, and race

# Print or  display final output tables
print("Age  Group   Frequency Counts:")
display(age_group_frequency_counts)  # Shows grouped freq  for age  groups

print("\nGender   Distribution Counts:")
display(gender_category_frequency_counts)  # Output gender  tallys

print("\nRace   Category Counts:")
display(race_category_frequency_counts)  # Show the  race label freqs

print("\nCombined  Distribution Table (age  group x gender x race):")
display(age_gender_race_cross_counts_table)  # Final combined  breakdown

Age  Group   Frequency Counts:


  age_gender_race_cross_counts_table = demographic_metadata_dataframe_combined.groupby(


age_group
0–20      4267
21–40     2533
41–60     1665
61–80      967
81–116     346
Name: count, dtype: int64


Gender   Distribution Counts:


gender
female    5406
male      4372
Name: count, dtype: int64


Race   Category Counts:


race
White     5265
Asian     1553
Indian    1452
Other     1103
Black      405
Name: count, dtype: int64


Combined  Distribution Table (age  group x gender x race):


Unnamed: 0,age_group,gender,race,count
0,0–20,female,Asian,496
1,0–20,female,Black,92
2,0–20,female,Indian,341
3,0–20,female,Other,361
4,0–20,female,White,1036
5,0–20,male,Asian,521
6,0–20,male,Black,68
7,0–20,male,Indian,266
8,0–20,male,Other,191
9,0–20,male,White,895
