## Calculating intra-*breed* distances and performing outlier analysis

### Detailed Procedure

1. **Intra-*breed* Distance Calculation**:
   - For each *breed*, we extract the distances between individuals belonging to that same group.
   - We then calculate the mean ($\mu$) and the standard deviation ($\sigma$) of the distances between individuals of the same *breed*.
   
2. **Identification of Potential *Crossbreeds***:
   - For each individual, we calculate the average distance between that individual and the other members of its *breed*.
   - If an individual’s average distance to the other members of its *breed* exceeds the intra-*breed* mean distance by more than a chosen margin (for example, if it is above $\mu + 2\sigma$), then that individual is considered a potential *crossbreed*.

3. **Step-by-Step Algorithm**:
   - **Input**: Distance matrix $D$, individuals’ breed assignments.
   - **Output**: List of individuals that are potential *crossbreeds* or may be assigned to the wrong *breed*.

In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from src.features.utils import get_interim_dir

In [2]:
# Read the matrix stored in the .mibs.gz file and the accompanying ID table
# (two unnamed columns, labelled here as 'breed' and 'sample').
interim_dir = get_interim_dir()
ibs_data = pd.read_table(
    interim_dir / 'SMARTER-OA-OAR3-top-0.4.10/SMARTER-OA-OAR3-top-0.4.10.mibs.gz',
    header=None,
)
sample_names = pd.read_table(
    interim_dir / 'SMARTER-OA-OAR3-top-0.4.10/SMARTER-OA-OAR3-top-0.4.10.mibs.id',
    header=None,
    names=['breed', "sample"],
)

# The distance is defined as the complement of the values that were read.
distance_matrix = 1 - ibs_data

# Label both axes of the distance matrix with the sample IDs, in file order.
individuals = sample_names["sample"].tolist()
distance_matrix.index = individuals
distance_matrix.columns = individuals

In [None]:
# Sanity check: report the dimensions of the distance matrix and how many
# sample IDs were loaded alongside it.
for summary_line in (
    f"Distance matrix shape: {distance_matrix.shape}",
    f"Number of samples: {len(individuals)}",
):
    print(summary_line)

In [4]:
def calculate_intra_breed_stats_with_assignments(distance_df, assignment_df):
    """Compute the mean and standard deviation of pairwise distances per breed.

    Args:
        distance_df: Square DataFrame of pairwise distances whose index and
            columns are both labelled with sample IDs.
        assignment_df: DataFrame with a 'breed' column and a 'sample' column
            holding the sample IDs used as labels in ``distance_df``.

    Returns:
        dict mapping each breed found in ``assignment_df['breed']`` to
        ``{'mean_distance': ..., 'std_distance': ...}``. The standard
        deviation is the population one (``ddof=0``). Missing (NaN) distances
        are ignored; both values are NaN when a breed has no usable pair
        (for example a breed with a single sample).
    """
    breed_stats = {}

    for breed in assignment_df['breed'].unique():
        # Sub-matrix of distances restricted to the members of this breed
        in_breed = assignment_df['breed'] == breed
        breed_samples = assignment_df.loc[in_breed, 'sample'].to_numpy()
        breed_distances = distance_df.loc[breed_samples, breed_samples].to_numpy(dtype=float)

        # Keep only values above the diagonal so each pair of distinct
        # individuals is counted exactly once
        pairwise = breed_distances[np.triu_indices_from(breed_distances, k=1)]

        # A single missing distance would otherwise turn the statistics of
        # the whole breed into NaN, so drop missing pairs first
        pairwise = pairwise[~np.isnan(pairwise)]

        if pairwise.size:
            mean_distance = pairwise.mean()
            std_distance = pairwise.std()
        else:
            # No usable pair: report NaN explicitly instead of letting numpy
            # warn about the mean of an empty slice
            mean_distance = std_distance = np.nan

        breed_stats[breed] = {
            'mean_distance': mean_distance,
            'std_distance': std_distance
        }

    return breed_stats

def identify_outliers_with_assignments(distance_df, assignment_df, breed_stats, threshold=2):
    """Flag individuals that are unusually distant from their own breed.

    An individual is flagged when its mean distance to the other members of
    its breed is greater than
    ``mean_distance + threshold * std_distance`` of that breed.

    Args:
        distance_df: Square DataFrame of pairwise distances whose index and
            columns are both labelled with sample IDs.
        assignment_df: DataFrame with a 'breed' column and a 'sample' column.
        breed_stats: dict mapping each breed to a dict with the keys
            'mean_distance' and 'std_distance'.
        threshold: Number of standard deviations above the breed mean beyond
            which an individual is flagged.

    Returns:
        list of sample IDs of the flagged individuals, in the row order of
        ``assignment_df``. Missing (NaN) distances are ignored; individuals
        without any usable distance to a breed-mate, or whose breed
        statistics are NaN, are never flagged.
    """
    # Group the sample IDs by breed once instead of re-filtering the whole
    # assignment table for every individual
    grouped = {}
    for breed, sample in zip(assignment_df['breed'], assignment_df['sample']):
        grouped.setdefault(breed, []).append(sample)
    samples_by_breed = {
        breed: np.asarray(members, dtype=object)
        for breed, members in grouped.items()
    }

    outliers = []
    for breed, ind in zip(assignment_df['breed'], assignment_df['sample']):
        # Other samples of the same breed (excluding the current individual)
        members = samples_by_breed[breed]
        others = members[members != ind]
        if others.size == 0:
            continue

        # Ignore missing distances: one NaN would make the mean NaN, and a
        # NaN mean can never exceed the limit
        distances_to_breed = distance_df.loc[ind, others].to_numpy(dtype=float)
        distances_to_breed = distances_to_breed[~np.isnan(distances_to_breed)]
        if distances_to_breed.size == 0:
            continue

        # Compare with the intra-breed mean and standard deviation
        stats = breed_stats[breed]
        limit = stats['mean_distance'] + threshold * stats['std_distance']
        if distances_to_breed.mean() > limit:
            outliers.append(ind)

    return outliers

In [None]:
# Per-breed mean and standard deviation of the pairwise distances
breed_stats_with_assignments = calculate_intra_breed_stats_with_assignments(
    distance_df=distance_matrix,
    assignment_df=sample_names,
)

# Samples flagged as potential crossbreeds, using the default threshold
outliers_with_assignments = identify_outliers_with_assignments(
    distance_df=distance_matrix,
    assignment_df=sample_names,
    breed_stats=breed_stats_with_assignments,
)

# Display both results as the output of the cell
(outliers_with_assignments, breed_stats_with_assignments)


In [6]:
# Fill every missing entry with the mean of its column: DataFrame.mean()
# reduces column-wise by default and fillna() aligns that Series on the
# column labels. (This equals the row mean only if the matrix is symmetric.)
column_means = distance_matrix.mean()
distance_df_imputed = distance_matrix.fillna(column_means)

In [None]:
# Perform t-SNE with PCA initialization. Because metric='euclidean', each row
# of the imputed distance matrix is used as the feature vector of a sample.
tsne = TSNE(
    n_components=2,          # Reduce data to 2 dimensions
    perplexity=40,           # Adjust perplexity for large datasets (try values between 30-50)
    max_iter=1000,             # Number of iterations, increase for better convergence
    learning_rate=200,       # Higher learning rate for large datasets
    init='pca',              # Initialize using PCA for faster convergence
    metric='euclidean',      # Use Euclidean distance metric
    random_state=42,         # For reproducibility
    verbose=True             # Enable verbose output to track progress
)
tsne_results = tsne.fit_transform(distance_df_imputed)

# Rows of tsne_results follow the row order of sample_names, so a positional
# boolean mask selects the outliers without relying on the index labels of
# sample_names and without rescanning the table once per outlier.
is_outlier = sample_names['sample'].isin(outliers_with_assignments).to_numpy()
outliers_tsne = tsne_results[is_outlier]

# Create a scatter plot with t-SNE results
plt.figure(figsize=(10, 6))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], label='Individuals')

# Highlight the outliers in red
plt.scatter(outliers_tsne[:, 0], outliers_tsne[:, 1], color='red', label='Outliers', marker='x')

plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE of Distance Matrix with Outliers Highlighted')
plt.legend()
plt.show()
