In [9]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer


In [3]:
# Cell 2: Load and normalize input k-mer count data
raw_counts = pd.read_csv('modern_counts.tsv', sep='\t').set_index('id')
norm_counts = raw_counts.div(raw_counts['cen10h1'] + raw_counts['cen10h2'], axis=0)

# Remove specific non-informative k-mers
norm_counts = norm_counts.drop(['cen10h1', 'cen10h2', 'ch1-6-7', 'ch4/11-3', 'ch4/11-2', 'ch1-6-7'], axis=1)

# Load haplotype metadata and convert to string labels
metadata = pd.read_csv('metadata.tsv', sep='\t').set_index('id')
flank_cenhaps = metadata[['mat_cenhap', 'pat_cenhap']].dropna().map(lambda x: str(int(x)))

# Filter count data to include only samples with haplotype labels
norm_counts = norm_counts.loc[flank_cenhaps.index]


In [29]:
# Cell 3: Encode haplotype targets and split the dataset using iterative stratification
from skmultilearn.model_selection import iterative_train_test_split

hap_classes = [str(i) for i in range(1, 13)]
mlb = MultiLabelBinarizer(classes=hap_classes)

# Convert each row to list of two haplotypes
Y = mlb.fit_transform(flank_cenhaps.values)
X = norm_counts.values

# Perform stratified multilabel split (80% train, 20% test)
X_train, y_train, X_test, y_test = iterative_train_test_split(X, Y, test_size=0.2)

# Get the indices back
idx_array = np.array(norm_counts.index)
idx_train = idx_array[:len(X_train)]
idx_test = idx_array[len(X_train):]




In [31]:
# Cell 4: Initialize and train the classifier
logreg = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='multinomial')
multi_clf = MultiOutputClassifier(logreg)
multi_clf.fit(X_train, y_train)




In [32]:
# Cell 5: Predict and evaluate
y_pred = multi_clf.predict(X_test)

# Decode predicted and true labels
true_labels = mlb.inverse_transform(y_test)
pred_labels = mlb.inverse_transform(y_pred)

# Convert tuples of labels to sorted comma-separated strings
flank_str = [",".join(sorted(lbl)) for lbl in true_labels]
core_str = [",".join(sorted(lbl)) for lbl in pred_labels]

# Create DataFrame with comparison
results_df = pd.DataFrame({
    "flank_cenhap": flank_str,
    "core_cenhap": core_str,
    "correct": [f == p for f, p in zip(flank_str, core_str)]
}, index=idx_test)

# Display the dataframe if desired
results_df.head()

Unnamed: 0,flank_cenhap,core_cenhap,correct
NA12386,2,2,True
HG01190,2,2,True
HG01242,2,2,True
HG01680,2,2,True
HG01695,2,2,True


In [34]:
# Cell 7: Compute and print prediction accuracy
accuracy = results_df["correct"].mean()
print(f"Prediction accuracy: {accuracy:.3f} ({results_df['correct'].sum()} out of {len(results_df)} correct)")


Prediction accuracy: 0.731 (263 out of 360 correct)


In [35]:
results_df.to_csv('linear.tsv', sep='\t')