# In [1]:
import random
import os
import pandas as pd
import re

# Input configuration
# Folders whose simulation output files will be analysed
folder_paths = [
    '/Users/gb4818/OneDrive - Imperial College London/Rev-res-in/100k_iter/results_w50_15dp',
]
# One label per entry of folder_paths, in the same order
folder_labels = [15]
# The first folder label, kept as a standalone value
label = folder_labels[0]

#########################################################################################
### GENETIC DIVERSITY FUNCTIONS DEFINITION
#Creating a function to calculate genetic diversity among pairs
# I will use this one to calculate the average diversity of the sample
def average_diversity_simulation(genomes, num_pairs):
    """Estimate the mean number of differing positions between random genome pairs.

    Indices are sampled without replacement, so the pairs are disjoint: no
    genome takes part in more than one comparison.

    Args:
        genomes: A pandas Series (or any object supporting ``len()`` and
            positional ``.iloc`` access) whose items are sequences such as
            strings.
        num_pairs: Requested number of pairs; capped at ``len(genomes) // 2``.

    Returns:
        The average number of differing positions per pair, or ``nan`` when
        no pair can be formed (fewer than two genomes, or ``num_pairs`` is 0).

    Raises:
        ValueError: If ``num_pairs`` is negative (raised by ``random.sample``).
    """
    # Never ask for more disjoint pairs than the sample can provide
    num_pairs = min(num_pairs, len(genomes) // 2)

    # Without any pair the average is undefined; avoid dividing by zero
    if num_pairs == 0:
        return float('nan')

    # Consecutive entries of the sampled indices form the pairs
    sampled = random.sample(range(len(genomes)), 2 * num_pairs)

    total_differences = 0
    for first, second in zip(sampled[::2], sampled[1::2]):
        genome_a, genome_b = genomes.iloc[first], genomes.iloc[second]
        # zip stops at the shorter genome, so any extra trailing positions are ignored
        total_differences += sum(c1 != c2 for c1, c2 in zip(genome_a, genome_b))

    return total_differences / num_pairs

# Create a function that computes the diversity value of each random pair and returns them as a list;
# from this list we will calculate stats for each sample
def diversity_table(genomes, num_pairs):
    """Return the number of differing positions for each random genome pair.

    Indices are sampled without replacement, so the pairs are disjoint: no
    genome takes part in more than one comparison.

    Args:
        genomes: A pandas Series (or any object supporting ``len()`` and
            positional ``.iloc`` access) whose items are sequences such as
            strings.
        num_pairs: Requested number of pairs; capped at ``len(genomes) // 2``,
            as in ``average_diversity_simulation``.

    Returns:
        A list with one difference count per compared pair (empty when no
        pair can be formed).

    Raises:
        ValueError: If ``num_pairs`` is negative (raised by ``random.sample``).
    """
    # Never ask for more disjoint pairs than the sample can provide
    num_pairs = min(num_pairs, len(genomes) // 2)

    # Consecutive entries of the sampled indices form the pairs
    sampled = random.sample(range(len(genomes)), 2 * num_pairs)

    # zip stops at the shorter genome, so any extra trailing positions are ignored
    return [
        sum(c1 != c2 for c1, c2 in zip(genomes.iloc[first], genomes.iloc[second]))
        for first, second in zip(sampled[::2], sampled[1::2])
    ]

##############################
### LOGS GENETIC DIVERSITY ###
##############################

# Pattern for the per-simulation individuals files; applied with .match(),
# so it is anchored at the start of the filename only
individuals_file_pattern = re.compile(r'REvoSim_individuals_data_.*\.txt')

# One combined DataFrame per folder that contained at least one matching file
dataframes = []

for i, folder_path in enumerate(folder_paths):
    # Per-file diversity DataFrames for the current folder
    dfs = []

    # Sorted so that files are processed in a reproducible order
    for filename in sorted(os.listdir(folder_path)):
        if not individuals_file_pattern.match(filename):
            continue
        file_path = os.path.join(folder_path, filename)

        # Skip the first 12 lines, treat the next line as the header and keep
        # only the first column, which is then accessed as 'Genome'
        df = pd.read_csv(file_path, sep=',', header=0, skiprows=12, usecols=[0])
        genomes = df['Genome']
        # keep the first 32 bits from each genome
        #genomes = df['Genome'].str[:32]
        # remove the first 32 and keep the last
        #genomes = df['Genome'].str[32:]

        # Pair up as many genomes as possible and record each pair's difference count
        diversity_list = diversity_table(genomes, len(genomes) // 2)

        # Tag every value with its folder label and the file it came from
        div_df = pd.DataFrame({'Diversity': diversity_list, 'dp': folder_labels[i], 'Simulation': filename})
        dfs.append(div_df)

    # pd.concat raises ValueError on an empty list, so skip folders without matching files
    if not dfs:
        print(f'No REvoSim_individuals_data_*.txt files found in {folder_path}; skipping this folder.')
        continue

    # Concatenate all DataFrames of this folder
    combined_df_folder = pd.concat(dfs, ignore_index=True)
    dataframes.append(combined_df_folder)

# Fail with an explicit message instead of pd.concat's generic error
if not dataframes:
    raise FileNotFoundError('No REvoSim_individuals_data_*.txt files found in any of the configured folders.')

# Concatenate all DataFrames from different folders into a single DataFrame
diversity = pd.concat(dataframes, ignore_index=True)
# Average diversity for each simulation (one row per 'dp' / 'Simulation' combination)
average_div_df = diversity.groupby(['dp', 'Simulation'])['Diversity'].mean().reset_index()
average_div_df.to_csv(f'w50_{label}dp_genetic_diversity.csv')
