#  Clustering and Identifying Crossbreeds

## Data Preparation:

Calculate the pairwise identity-by-state (IBS) similarity matrix for the entire dataset using PLINK:

```bash
plink --chr-set 26 no-xy no-mt --allow-no-sex \
   --bfile SMARTER-OA-OAR3-top-0.4.10/SMARTER-OA-OAR3-top-0.4.10 \
   --geno 0.1 --distance square gz ibs \
   --out SMARTER-OA-OAR3-top-0.4.10/SMARTER-OA-OAR3-top-0.4.10
```

In [None]:
import umap
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.cluster import KMeans, DBSCAN
from sklearn.impute import SimpleImputer

from src.features.utils import get_interim_dir

In [None]:
# Locate the two PLINK outputs: the square IBS matrix and its accompanying ID file
interim_dir = get_interim_dir()
ibs_matrix_path = interim_dir / 'SMARTER-OA-OAR3-top-0.4.10/SMARTER-OA-OAR3-top-0.4.10.mibs.gz'
ibs_ids_path = interim_dir / 'SMARTER-OA-OAR3-top-0.4.10/SMARTER-OA-OAR3-top-0.4.10.mibs.id'

# Neither file has a header row; the ID file is read as (breed, sample) columns
ibs_data = pd.read_table(ibs_matrix_path, header=None)
assignment_df = pd.read_table(ibs_ids_path, header=None, names=['breed', "sample"])

# Convert similarity to dissimilarity and label both axes with the sample IDs
individuals = assignment_df["sample"].tolist()
distance_df = (
    (1 - ibs_data)
    .set_axis(individuals, axis=0)
    .set_axis(individuals, axis=1)
)

In [None]:
# Sanity check: report the matrix dimensions alongside the number of sample IDs
matrix_shape = distance_df.shape
sample_count = len(individuals)
print(f"Distance matrix shape: {matrix_shape}")
print(f"Number of samples: {sample_count}")

Impute missing values in the distance matrix with the column mean:

In [None]:
# Replace every missing distance with the mean of its column
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(distance_df)

# fit_transform returns a bare NumPy array, so re-attach the sample labels
distance_df_imputed = pd.DataFrame(
    imputed_values,
    index=distance_df.index,
    columns=distance_df.columns,
)

# Report the resulting shape and preview the first rows
print(f"Distance matrix shape after imputation: {distance_df_imputed.shape}")
distance_df_imputed.head()

## K-means clustering

In [None]:
# Use one cluster per declared breed; change this value to experiment with other settings
n_clusters = assignment_df['breed'].nunique(dropna=False)

# Step 1: Run K-means, feeding each row of the imputed distance matrix as a feature vector
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(distance_df_imputed)
assignment_df['cluster'] = kmeans_labels

# Step 2: Relabel clusters based on the most frequent breed in each cluster
def relabel_clusters(df):
    """Label every cluster with its most frequent declared breed.

    Reads the 'cluster' and 'breed' columns of ``df`` and writes the majority
    breed of each row's cluster into a new 'predicted_kmeans' column. When
    several breeds are tied, the first value returned by ``Series.mode()`` is used.

    The frame is modified in place and also returned.
    """
    # Majority breed per cluster id
    breed_by_cluster = {
        cluster_id: df.loc[df['cluster'] == cluster_id, 'breed'].mode()[0]
        for cluster_id in df['cluster'].unique()
    }

    # Translate each row's cluster id into that cluster's majority breed
    df['predicted_kmeans'] = df['cluster'].map(breed_by_cluster)
    return df

# Attach the majority-breed prediction of each K-means cluster
assignment_df = relabel_clusters(assignment_df)

# Step 3: Keep the samples whose declared breed disagrees with the predicted breed
is_mismatch = assignment_df['breed'].ne(assignment_df['predicted_kmeans'])
mismatch = assignment_df.loc[is_mismatch]
print(f"Number of mismatches: {len(mismatch)}")
mismatch

## UMAP for Dimensionality Reduction and Clustering

Use UMAP to embed the samples in a low-dimensional space, then cluster the embedding with DBSCAN:

In [None]:
# Step 1: Apply UMAP for dimensionality reduction. n_components is left at its
# default, and only the first two embedding columns are used below.
# UMAP is stochastic: without a fixed random_state the embedding, and therefore the
# DBSCAN clusters and the mismatch list, change on every run. The seed matches the
# one used for K-means.
# NOTE: metric='euclidean' treats each row of the distance matrix as a feature
# vector; metric='precomputed' would interpret the matrix as pairwise distances.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
umap_results = umap_model.fit_transform(distance_df_imputed)

# Step 2: Cluster individuals in the UMAP-reduced space with DBSCAN.
# DBSCAN labels samples that belong to no cluster as -1 (noise).
dbscan = DBSCAN(eps=0.1, min_samples=5)
dbscan_labels = dbscan.fit_predict(umap_results)

# Add UMAP coordinates and DBSCAN labels to the DataFrame
# (this overwrites the 'cluster' column written by the K-means section)
assignment_df['umap-1'] = umap_results[:, 0]
assignment_df['umap-2'] = umap_results[:, 1]
assignment_df['cluster'] = dbscan_labels

# Step 3: Relabel clusters based on the most frequent breed in each cluster
def relabel_clusters(df, noise_label=-1):
    """Label every DBSCAN cluster with its most frequent declared breed.

    Reads the 'cluster' and 'breed' columns of ``df`` and writes the majority
    breed of each row's cluster into a new 'predicted_dbscan' column. When
    several breeds are tied, the first value returned by ``Series.mode()`` is used.

    DBSCAN marks samples that belong to no cluster with the label -1. Those
    samples are not a real cluster, so they receive no predicted breed (NaN)
    instead of the majority breed among the outliers. A comparison such as
    ``df['breed'] != df['predicted_dbscan']`` therefore flags them as mismatches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'cluster' and 'breed' columns. Modified in place.
    noise_label : int, default -1
        Cluster label that denotes noise and must not be mapped to a breed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the 'predicted_dbscan' column added.
    """
    # Majority breed per real cluster; the noise label is deliberately left out
    cluster_to_breed = {
        cluster: df.loc[df['cluster'] == cluster, 'breed'].mode()[0]
        for cluster in df['cluster'].unique()
        if cluster != noise_label
    }

    # Labels missing from the mapping (i.e. noise) become NaN
    df['predicted_dbscan'] = df['cluster'].map(cluster_to_breed)
    return df

# Attach the majority-breed prediction of each DBSCAN cluster
assignment_df = relabel_clusters(assignment_df)

# Samples whose declared breed disagrees with the predicted breed are potential crossbreeds
is_mismatch = assignment_df['breed'].ne(assignment_df['predicted_dbscan'])
mismatch = assignment_df.loc[is_mismatch]
print(f"Number of mismatches: {len(mismatch)}")
mismatch

Plot the samples whose declared breed differs from the breed predicted for their DBSCAN cluster:

In [None]:
# Draw the UMAP embedding on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))

# Background layer: every sample as a faint gray dot, kept out of the legend
ax.scatter(
    assignment_df['umap-1'],
    assignment_df['umap-2'],
    color='gray',
    alpha=0.05,
    s=10,
    label='_nolegend_',
)

# Foreground layer: samples whose declared breed does not match the prediction
ax.scatter(
    mismatch['umap-1'],
    mismatch['umap-2'],
    color='red',
    marker='x',
    label='Potential Crossbreeds',
)

ax.set_title('UMAP Clustering with Mismatched Individuals Highlighted')
ax.set_xlabel('UMAP-1')
ax.set_ylabel('UMAP-2')
ax.legend()
plt.show()

In [None]:
# Qualitative palette and marker symbols that are combined to tell breeds apart
colors = px.colors.qualitative.Set3  # You can choose a palette from plotly.colors.qualitative
num_colors = len(colors)
markers = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up', 'triangle-down', 'star', 'pentagon']

# Build a deterministic breed -> style lookup. The built-in hash() of a str is salted
# per interpreter process, so hash-based styling changed on every kernel restart.
# Cycling through the colors first and advancing the marker only after a full color
# cycle gives len(colors) * len(markers) distinct combinations before any repeat.
breeds = sorted(assignment_df['breed'].dropna().unique(), key=str)
breed_to_color = {
    breed: colors[i % num_colors]
    for i, breed in enumerate(breeds)
}
breed_to_marker = {
    breed: markers[(i // num_colors) % len(markers)]
    for i, breed in enumerate(breeds)
}

# Keep the per-sample style columns available on the DataFrame
assignment_df['color'] = assignment_df['breed'].map(breed_to_color)
assignment_df['marker'] = assignment_df['breed'].map(breed_to_marker)

# Create a plotly scatter plot with one trace per breed
fig = go.Figure()

for breed in breeds:
    breed_data = assignment_df[assignment_df['breed'] == breed]
    fig.add_trace(go.Scatter(
        x=breed_data['umap-1'],
        y=breed_data['umap-2'],
        mode='markers',
        marker=dict(
            color=breed_to_color[breed],
            symbol=breed_to_marker[breed],
            size=10,
            opacity=0.7
        ),
        name=breed,
        text=breed_data['breed'],
        hoverinfo='text'
    ))

# Set the figure size and display the plot
fig.update_layout(
    title=f"UMAP Clustering with {len(breeds)} Breeds",
    width=1000,
    height=800,
    showlegend=False  # Hide the legend to avoid clutter
)

fig.show()
