In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Read both input tables; the first column of the genotype file and the
# 'ID' column of the phenotype file become the row index of each frame.
# NOTE(review): the genotype file is parsed with the default comma separator
# while the phenotype file is tab-separated — confirm the genotype delimiter.
genotypes = pd.read_csv(
    'tomatoes/Genotypic_data_maf10_min10_291acc.txt',
    index_col=0,
)
phenotype = pd.read_csv(
    'tomatoes/phenodata_BLUP_2012.txt',
    sep='\t',
    index_col='ID',
)

In [None]:
def calculate_maf(df):
    """Return the minor allele frequency of every column of ``df``.

    For each column the mean ``p`` is taken (missing values are skipped by
    ``DataFrame.mean``) and the smaller of ``p`` and ``1 - p`` is returned.

    NOTE(review): this treats the column mean as an allele frequency, which
    presumably means genotypes are coded 0/1; with 0/1/2 dosage coding the
    mean would have to be halved first — confirm against the input file.

    Parameters
    ----------
    df : pandas.DataFrame
        Genotype matrix with one column per marker.

    Returns
    -------
    pandas.Series
        One value per column of ``df``, indexed by column label.
    """
    # Compute each column mean once, vectorized, instead of twice per column
    # inside a Python-level lambda.
    allele_freq = df.mean(axis=0)
    return np.minimum(allele_freq, 1 - allele_freq)

def ld_pruning(df, threshold=0.5):
    """Drop columns that are strongly correlated with an earlier column.

    Pairwise Pearson correlations between all columns of ``df`` are computed.
    A column is removed when the absolute value of its correlation with any
    column to its left exceeds ``threshold``; the leftmost column of each
    correlated group is therefore always kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Genotype matrix with one column per SNP.
    threshold : float, default 0.5
        Absolute correlation above which the later column of a pair is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained columns of ``df``.
    """
    corr = df.corr()
    # Use |r|: two SNPs coded against opposite alleles correlate negatively
    # but are just as redundant as a positively correlated pair.
    strength = corr.abs().to_numpy()
    # Strict upper triangle visits each pair once and skips the diagonal.
    # NaN correlations (e.g. constant columns) compare False and are kept.
    linked = np.triu(strength > threshold, k=1)
    # Column j is pruned if any earlier column i < j is linked to it.
    to_remove = corr.columns[linked.any(axis=0)]
    return df.drop(columns=to_remove)

In [None]:
# MAF filter: keep only the markers whose minor allele frequency reaches
# the cut-off below.
maf_threshold = 0.01
maf = calculate_maf(genotypes)
genotypes_filtered = genotypes.loc[:, maf.ge(maf_threshold)]

# LD pruning: remove markers that are highly correlated with another marker.
genotypes_pruned = ld_pruning(
    genotypes_filtered,
    threshold=0.5,
)

In [None]:
# Align the two tables on their shared index labels so that row i of the
# genotype matrix and row i of the phenotype table describe the same sample.
# Without this step the frames are only paired by position further down.
shared_ids = genotypes_pruned.index.intersection(phenotype.index, sort=False)
if shared_ids.empty:
    raise ValueError("genotypes_pruned and phenotype share no index labels; cannot align samples")
# SimpleImputer silently drops columns with no observed value, which would
# break the column labelling below, so remove such columns explicitly first.
genotypes_aligned = genotypes_pruned.loc[shared_ids].dropna(axis=1, how='all')
phenotype_aligned = phenotype.loc[shared_ids].dropna(axis=1, how='all')
if not genotypes_aligned.index.equals(phenotype_aligned.index):
    # Happens when an index label is duplicated in one of the tables.
    raise ValueError("genotype and phenotype rows could not be aligned one-to-one by index")

# Fill missing values with the per-column median, keeping the sample index.
imputer = SimpleImputer(strategy='median')
genotypes_imputed = pd.DataFrame(
    imputer.fit_transform(genotypes_aligned),
    index=genotypes_aligned.index,
    columns=genotypes_aligned.columns,
)
phenotype_imputed = pd.DataFrame(
    imputer.fit_transform(phenotype_aligned),
    index=phenotype_aligned.index,
    columns=phenotype_aligned.columns,
)

# Scale the data (zero mean, unit variance per column)
scaler = StandardScaler()
genotypes_scaled = pd.DataFrame(
    scaler.fit_transform(genotypes_imputed),
    index=genotypes_imputed.index,
    columns=genotypes_imputed.columns,
)

# Drop any column that still contains NaN or infinite values after scaling
genotypes_scaled = genotypes_scaled.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
# The same scaler object is refit here, so afterwards `scaler` holds the
# phenotype statistics, not the genotype ones.
phenotype_scaled = pd.DataFrame(
    scaler.fit_transform(phenotype_imputed),
    index=phenotype_imputed.index,
    columns=phenotype_imputed.columns,
)

In [None]:
# Response: the scaled AVGROW97 trait; predictors: the scaled genotype matrix
y = phenotype_scaled['AVGROW97']
X = genotypes_scaled

# Rows of X are samples, columns are features
num_samples = len(X.index)
num_features = len(X.columns)

print(f"Number of samples: {num_samples}")
print(f"Number of features: {num_features}")

### 7. XGBoost Feature Selection with Hyperparameter Tuning

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values; the randomized search samples n_iter
# combinations from this grid instead of evaluating all of them.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1],
}

# y is a standardized, continuous trait column, so this is a regression
# problem. A classifier with a log-loss metric expects discrete class labels
# and either rejects such a target or treats every distinct value as a class.
# With a regressor, RandomizedSearchCV ranks candidates by the estimator's
# default score (R^2).
xgb_random_search = RandomizedSearchCV(
    xgb.XGBRegressor(random_state=0, objective='reg:squarederror', eval_metric='rmse'),
    xgb_params,
    n_iter=10,
    cv=5,
    random_state=0,
)
xgb_random_search.fit(X, y)

# Importances of the best model (refit on all of X), one value per column of X
xgb_importances = xgb_random_search.best_estimator_.feature_importances_
# Column positions of X ordered from most to least important. This is a full
# ranking of every feature, not a thresholded subset.
selected_features_xgb = np.argsort(xgb_importances)[::-1]
# Save the ranked feature positions
np.savetxt("selected_features_xgb.txt", selected_features_xgb, fmt='%d')
# Save the raw importances in the original column order of X
np.savetxt("xgboost_importances.txt", xgb_importances, fmt='%f')