In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the subject metadata and the OTU abundance table.
pan_meta = pd.read_csv("/content/pan_meta.csv")
pan_otutab = pd.read_csv("/content/pan_otutab.csv")

# Strip stray whitespace from header cells so column lookups are reliable.
pan_meta.columns = pan_meta.columns.str.strip()
pan_otutab.columns = pan_otutab.columns.str.strip()

print("Columns in pan_otutab:", pan_otutab.columns)
print("Columns in pan_meta:", pan_meta.columns)

# --- Metadata key -----------------------------------------------------------
if "Subject_ID" not in pan_meta.columns:
    print("Column name mismatch. Checking similar names...")
    subject_col_meta = [col for col in pan_meta.columns if "subject" in col.lower()]
    if not subject_col_meta:
        raise ValueError("Could not find matching subject ID columns in both datasets.")
    print(f"Renaming {subject_col_meta[0]} to 'Subject_ID'")
    pan_meta.rename(columns={subject_col_meta[0]: "Subject_ID"}, inplace=True)
pan_meta["Subject_ID"] = pan_meta["Subject_ID"].astype(str).str.strip()

# --- OTU table key ----------------------------------------------------------
if "Subject_ID" not in pan_otutab.columns:
    print("Column name mismatch. Checking similar names...")
    known_subjects = set(pan_meta["Subject_ID"])
    sample_cols = [col for col in pan_otutab.columns if col in known_subjects]

    if sample_cols:
        # Wide layout: one row per OTU and one column per subject (the column
        # headers ARE the subject IDs). Renaming a single sample column to
        # 'Subject_ID' would compare abundance values against IDs and match
        # nothing, so transpose to one row per subject instead.
        id_candidates = [col for col in pan_otutab.columns if col not in known_subjects]
        if "OTU_ID" in id_candidates:
            otu_id_col = "OTU_ID"
        elif id_candidates:
            otu_id_col = id_candidates[0]
        else:
            otu_id_col = None  # no label column: keep the existing row index

        print(f"OTU table has {len(sample_cols)} subject columns; transposing to one row per subject")
        otu_rows = pan_otutab if otu_id_col is None else pan_otutab.set_index(otu_id_col)
        pan_otutab = otu_rows[sample_cols].T
        pan_otutab.columns = pan_otutab.columns.astype(str)
        pan_otutab = pan_otutab.rename_axis(index="Subject_ID", columns=None).reset_index()
    else:
        # Long layout with a differently named key column: fall back to a
        # fuzzy match on the column name.
        subject_col_otutab = [col for col in pan_otutab.columns if "subject" in col.lower()]
        if not subject_col_otutab:
            raise ValueError("Could not find matching subject ID columns in both datasets.")
        print(f"Renaming {subject_col_otutab[0]} to 'Subject_ID'")
        pan_otutab.rename(columns={subject_col_otutab[0]: "Subject_ID"}, inplace=True)

# Compare keys as strings so that numeric-looking IDs match across files.
pan_otutab["Subject_ID"] = pan_otutab["Subject_ID"].astype(str).str.strip()

print("Unique Subject_IDs in OTU table:", pan_otutab["Subject_ID"].nunique())
print("Unique Subject_IDs in Metadata:", pan_meta["Subject_ID"].nunique())

common_subjects = set(pan_otutab["Subject_ID"]).intersection(set(pan_meta["Subject_ID"]))
print(f"Number of common Subject_IDs: {len(common_subjects)}")
if not common_subjects:
    # An empty merge only surfaces later as an opaque estimator error.
    raise ValueError(
        "No Subject_ID values are shared between the OTU table and the metadata; "
        "check the ID format in both files."
    )

# One row per subject: OTU abundances followed by the metadata columns.
merged_data = pan_otutab.merge(pan_meta, on="Subject_ID", how="inner")
print("Merged dataset shape:", merged_data.shape)

# Regression targets: every numeric metadata column except the join key.
numerical_targets = pan_meta.select_dtypes(include=[np.number]).columns.tolist()
if "Subject_ID" in numerical_targets:
    numerical_targets.remove("Subject_ID")

# merged_data also carries string columns (IDs, categorical metadata), so the
# correlation must be restricted to numeric columns or it cannot be computed.
corr_matrix = merged_data.corr(numeric_only=True)

if corr_matrix.isna().all().all():
    # Nothing to draw (e.g. the merge produced no rows); an all-NaN heatmap
    # only produces warnings and a blank figure.
    print("Correlation matrix is empty or all-NaN; skipping heatmap.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()

print("Columns in merged_data:", merged_data.columns.tolist())
print("Numerical targets:", numerical_targets)

# Everything that came from the metadata table (targets, categorical fields,
# the join key) is not an OTU feature and must not reach the scaler.
drop_cols = [col for col in pan_meta.columns if col in merged_data.columns]
if "Subject_ID" in merged_data.columns and "Subject_ID" not in drop_cols:
    drop_cols.append("Subject_ID")
print("Dropping columns:", drop_cols)

# Build the feature matrix once: coerce to numeric and treat missing or
# unparseable abundances as zero. (Do not rebuild otu_data from merged_data
# afterwards; that would discard this cleaning.)
otu_data = merged_data.drop(columns=drop_cols)
otu_data = otu_data.apply(pd.to_numeric, errors="coerce").fillna(0)

if otu_data.empty:
    raise ValueError(
        "No samples or OTU features available after merging; "
        "check that Subject_ID values match between the two files."
    )

# Standardise each OTU column and keep the result as a labelled DataFrame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(otu_data)
X_scaled_df = pd.DataFrame(X_scaled, index=otu_data.index, columns=otu_data.columns)
print("Scaling completed successfully.")

# Keep only subjects that have a value for every regression target.
common_indices = merged_data.dropna(subset=numerical_targets).index
X_filtered = X_scaled_df.loc[common_indices]
y_filtered = merged_data.loc[common_indices, numerical_targets]

# Train/test split first, so feature selection never sees the held-out rows.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Feature selection: SelectFromModel fits the wrapped Lasso, which is a
# supervised estimator and therefore needs the targets as well as X.
selector = SelectFromModel(Lasso(alpha=0.1, max_iter=10000))
selector.fit(X_train_full, y_train)
support = selector.get_support()
if not support.any():
    # A Lasso fit on zero columns would raise; fall back to all OTUs so the
    # per-target models below still run (they may end up with no predictors).
    print("Lasso feature selection kept no OTUs; using all OTUs instead.")
    support = np.ones(X_filtered.shape[1], dtype=bool)
selected_otus = X_filtered.columns[support]

X_reduced = X_filtered.loc[:, support].to_numpy()
X_train = X_train_full.loc[:, support].to_numpy()
X_test = X_test_full.loc[:, support].to_numpy()

# LASSO: one model per numeric target, evaluated on the held-out split.
lasso_results = {}
lasso = Lasso(alpha=0.1, max_iter=10000)

for target in numerical_targets:
    lasso.fit(X_train, y_train[target])

    y_pred = lasso.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)
    # Coefficients are in the same order as the selected OTU columns.
    important_features = {
        otu: coef for otu, coef in zip(selected_otus, lasso.coef_) if coef != 0
    }

    lasso_results[target] = {
        "MSE": mse,
        "Important OTUs": important_features
    }

# Results
for target, result in lasso_results.items():
    print(f"\nLasso Regression for {target}")
    print(f"MSE: {result['MSE']:.4f}")
    print("Important OTUs:")
    # Show the 10 OTUs with the largest absolute coefficient.
    for otu, coef in sorted(result['Important OTUs'].items(), key=lambda x: -abs(x[1]))[:10]:
        print(f"  {otu}: {coef:.4f}")

     
Columns in pan_otutab: Index(['OTU_ID', 'Subject-1067', 'Subject-1090', 'Subject-2032',
       'Subject-2065', 'Subject-3026', 'Subject-3061', 'Subject-4010',
       'Subject-4027', 'Subject-5003',
       ...
       'Subject-13029', 'Subject-1034', 'Subject-11023', 'Subject-13030',
       'Subject-1035', 'Subject-1006', 'Subject-1013', 'Subject-1014',
       'Subject-1007', 'Subject-1036'],
      dtype='object', length=1005)
Columns in pan_meta: Index(['Subject_ID', 'Geographical Location', 'Geographical zone in India',
       'Gender', 'AGE  in years', 'Life style pattern', 'BMI',
       'Obese-Non Obese'],
      dtype='object')
Column name mismatch. Checking similar names...
Renaming Subject-1067 and Subject_ID to 'Subject_ID'
Unique Subject_IDs in OTU table: 80
Unique Subject_IDs in Metadata: 1004
Number of common Subject_IDs: 0
Merged dataset shape: (0, 1012)
/usr/local/lib/python3.11/dist-packages/seaborn/matrix.py:202: RuntimeWarning: All-NaN slice encountered
  vmin = np.nanmin(calc_data)
/usr/local/lib/python3.11/dist-packages/seaborn/matrix.py:207: RuntimeWarning: All-NaN slice encountered
  vmax = np.nanmax(calc_data)

Columns in merged_data: ['OTU_ID', 'Subject_ID', 'Subject-1090', 'Subject-2032', 'Subject-2065', 'Subject-3026', 'Subject-3061', 'Subject-4010', 'Subject-4027', 'Subject-5003', 'Subject-5028', 'Subject-5053', 'Subject-6025', 'Subject-6048', 'Subject-6056', 'Subject-7088', 'Subject-7095', 'Subject-8015', 'Subject-8040', 'Subject-9012', 'Subject-9037', 'Subject-9062', 'Subject-10018', 'Subject-10024', 'Subject-10028', 'Subject-10065', 'Subject-11017', 'Subject-11022', 'Subject-11042', 'Subject-12014', 'Subject-12039', 'Subject-12059', 'Subject-12064', 'Subject-12070', 'Subject-13011', 'Subject-13037', 'Subject-14006', 'Subject-14031', 'Subject-14054', 'Subject-14079', 'Subject-7068', 'Subject-3015', 'Subject-1039', 'Subject-1047', 'Subject-1069', 'Subject-2010', 'Subject-2064', 'Subject-3042', 'Subject-3024', 'Subject-3062', 'Subject-4002', 'Subject-4048', 'Subject-4058', 'Subject-5007', 'Subject-5032', 'Subject-5055', 'Subject-6001', 'Subject-6050', 'Subject-6054', 'Subject-7024', 'Subject-7043', 'Subject-7089', 'Subject-8017', 'Subject-8042', 'Subject-8049', 'Subject-9014', 'Subject-9035', 'Subject-9060', 'Subject-1020', 'Subject-10022', 'Subject-10041', 'Subject-10043', 'Subject-10055', 'Subject-11021', 'Subject-11040', 'Subject-11067', 'Subject-12033', 'Subject-12056', 'Subject-12073', 'Subject-13007', 'Subject-13038', 'Subject-14012', 'Subject-14029', 'Subject-14056', 'Subject-14081', 'Subject-7069', 'Subject-3014', 'Subject-1015', 'Subject-1048', 'Subject-1065', 'Subject-1088', 'Subject-2030', 'Subject-2037', 'Subject-2039', 'Subject-3059', 'Subject-4023', 'Subject-4029', 'Subject-4056', 'Subject-5026', 'Subject-5051', 'Subject-5061', 'Subject-6003', 'Subject-6043', 'Subject-6046', 'Subject-7041', 'Subject-7086', 'Subject-8044', 'Subject-8059', 'Subject-9010', 'Subject-1021', 'Subject-10027', 'Subject-10062', 'Subject-11015', 'Subject-11028', 'Subject-11044', 'Subject-12012', 'Subject-12031', 'Subject-12034', 'Subject-12058', 'Subject-12072', 'Subject-13032', 
'Subject-13040', 'Subject-14011', 'Subject-14027', 'Subject-14058', 'Subject-14083', 'Subject-7066', 'Subject-3017', 'Subject-1073', 'Subject-1086', 'Subject-2018', 'Subject-2066', 'Subject-2067', 'Subject-3040', 'Subject-3041', 'Subject-3058', 'Subject-4004', 'Subject-4062', 'Subject-5005', 'Subject-5030', 'Subject-6017', 'Subject-6031', 'Subject-6040', 'Subject-6052', 'Subject-7026', 'Subject-7051', 'Subject-7087', 'Subject-8009', 'Subject-8048', 'Subject-9020', 'Subject-9041', 'Subject-9064', 'Subject-9083', 'Subject-1026', 'Subject-10002', 'Subject-10042', 'Subject-10067', 'Subject-11019', 'Subject-11038', 'Subject-11065', 'Subject-12037', 'Subject-12054', 'Subject-12071', 'Subject-13013', 'Subject-13036', 'Subject-14002', 'Subject-14021', 'Subject-14062', 'Subject-14077', 'Subject-14080', 'Subject-7067', 'Subject-1050', 'Subject-1071', 'Subject-2012', 'Subject-2031', 'Subject-2040', 'Subject-2058', 'Subject-3047', 'Subject-3064', 'Subject-3067', 'Subject-4006', 'Subject-4041', 'Subject-4064', 'Subject-5024', 'Subject-5045', 'Subject-5065', 'Subject-6019', 'Subject-6033', 'Subject-6065', 'Subject-7039', 'Subject-7084', 'Subject-8007', 'Subject-8023', 'Subject-9008', 'Subject-9024', 'Subject-9027', 'Subject-9066', 'Subject-9081', 'Subject-1027', 'Subject-10040', 'Subject-10006', 'Subject-10066', 'Subject-11001', 'Subject-11011', 'Subject-11046', 'Subject-11063', 'Subject-12035', 'Subject-12052', 'Subject-12066', 'Subject-12075', 'Subject-13015', 'Subject-13035', 'Subject-14001', 'Subject-14023', 'Subject-14048', 'Subject-7060', 'Subject-3011', 'Subject-1043', 'Subject-1063', 'Subject-2004', 'Subject-2033', 'Subject-2057', 'Subject-3004', 'Subject-3046', 'Subject-3039', 'Subject-3063', 'Subject-4025', 'Subject-4044', 'Subject-4060', 'Subject-5009', 'Subject-5047', 'Subject-5063', 'Subject-6009', 'Subject-6015', 'Subject-6038', 'Subject-7037', 'Subject-7085', 'Subject-8013', 'Subject-8025', 'Subject-8058', 'Subject-9022', 'Subject-9045', 'Subject-9051', 
'Subject-1028', 'Subject-10004', 'Subject-10049', 'Subject-10061', 'Subject-11013', 'Subject-11048', 'Subject-12010', 'Subject-12018', 'Subject-12051', 'Subject-12074', 'Subject-13005', 'Subject-13034', 'Subject-14003', 'Subject-14008', 'Subject-14025', 'Subject-14046', 'Subject-14064', 'Subject-7061', 'Subject-3010', 'Subject-1041', 'Subject-1057', 'Subject-1081', 'Subject-2014', 'Subject-2035', 'Subject-2059', 'Subject-3045', 'Subject-3016', 'Subject-3060', 'Subject-4008', 'Subject-4037', 'Subject-4046', 'Subject-5011', 'Subject-5039', 'Subject-5058', 'Subject-6010', 'Subject-6036', 'Subject-7006', 'Subject-7032', 'Subject-7082', 'Subject-8005', 'Subject-8021', 'Subject-8047', 'Subject-9019', 'Subject-9047', 'Subject-9055', 'Subject-1029', 'Subject-10063', 'Subject-10008', 'Subject-11032', 'Subject-11050', 'Subject-12016', 'Subject-12021', 'Subject-12042', 'Subject-12077', 'Subject-13003', 'Subject-13041', 'Subject-14009', 'Subject-14020', 'Subject-14030', 'Subject-14060', 'Subject-14067', 'Subject-7064', 'Subject-3013', 'Subject-1045', 'Subject-1060', 'Subject-1083', 'Subject-2023', 'Subject-2050', 'Subject-3044', 'Subject-3050', 'Subject-3054', 'Subject-4019', 'Subject-4038', 'Subject-4069', 'Subject-5016', 'Subject-6012', 'Subject-6063', 'Subject-7004', 'Subject-7102', 'Subject-7107', 'Subject-7034', 'Subject-7083', 'Subject-8028', 'Subject-8057', 'Subject-9017', 'Subject-9029', 'Subject-9053', 'Subject-1022', 'Subject-10010', 'Subject-10026', 'Subject-10064', 'Subject-11007', 'Subject-11051', 'Subject-11055', 'Subject-12002', 'Subject-12029', 'Subject-12065', 'Subject-12076', 'Subject-13039', 'Subject-13009', 'Subject-14010', 'Subject-14019', 'Subject-14041', 'Subject-14065', 'Subject-7065', 'Subject-1042', 'Subject-1059', 'Subject-1078', 'Subject-2021', 'Subject-2041', 'Subject-2055', 'Subject-3029', 'Subject-3055', 'Subject-4020', 'Subject-4030', 'Subject-4070', 'Subject-5018', 'Subject-5037', 'Subject-6013', 'Subject-6045', 'Subject-6060', 'Subject-7105', 
'Subject-7025', 'Subject-7030', 'Subject-7045', 'Subject-7080', 'Subject-8002', 'Subject-8026', 'Subject-8043', 'Subject-9026', 'Subject-9070', 'Subject-1023', 'Subject-10012', 'Subject-10059', 'Subject-11005', 'Subject-11034', 'Subject-12004', 'Subject-12023', 'Subject-12048', 'Subject-12078', 'Subject-13022', 'Subject-13046', 'Subject-14004', 'Subject-14026', 'Subject-14036', 'Subject-14044', 'Subject-14069', 'Subject-7062', 'Subject-3019', 'Subject-1055', 'Subject-2002', 'Subject-2053', 'Subject-2056', 'Subject-3028', 'Subject-3056', 'Subject-4017', 'Subject-4036', 'Subject-4067', 'Subject-5021', 'Subject-5042', 'Subject-5056', 'Subject-6028', 'Subject-6006', 'Subject-6062', 'Subject-7027', 'Subject-7040', 'Subject-7081', 'Subject-8014', 'Subject-8035', 'Subject-8046', 'Subject-9007', 'Subject-9002', 'Subject-9030', 'Subject-9059', 'Subject-1024', 'Subject-10016', 'Subject-10020', 'Subject-10048', 'Subject-10058', 'Subject-11030', 'Subject-11057', 'Subject-11060', 'Subject-12025', 'Subject-12046', 'Subject-12079', 'Subject-13044', 'Subject-14005', 'Subject-14013', 'Subject-14038', 'Subject-14072', 'Subject-7063', 'Subject-1017', 'Subject-1038', 'Subject-1044', 'Subject-1053', 'Subject-2006', 'Subject-2025', 'Subject-2052', 'Subject-3043', 'Subject-3057', 'Subject-4011', 'Subject-4034', 'Subject-4051', 'Subject-5012', 'Subject-5040', 'Subject-6004', 'Subject-6041', 'Subject-6064', 'Subject-8012', 'Subject-8033', 'Subject-8056', 'Subject-9032', 'Subject-9006', 'Subject-9072', 'Subject-9078', 'Subject-1025', 'Subject-10045', 'Subject-10060', 'Subject-11009', 'Subject-12008', 'Subject-12027', 'Subject-12044', 'Subject-12069', 'Subject-13020', 'Subject-13045', 'Subject-14015', 'Subject-14074', 'Subject-5044', 'Subject-7057', 'Subject-3001', 'Subject-1062', 'Subject-2008', 'Subject-2027', 'Subject-2042', 'Subject-3049', 'Subject-3025', 'Subject-3051', 'Subject-4013', 'Subject-4032', 'Subject-4053', 'Subject-5014', 'Subject-5035', 'Subject-6049', 'Subject-6057', 
'Subject-6008', 'Subject-7008', 'Subject-7101', 'Subject-7048', 'Subject-8010', 'Subject-8019', 'Subject-8031', 'Subject-9018', 'Subject-9034', 'Subject-9004', 'Subject-9074', 'Subject-9076', 'Subject-1008', 'Subject-10047', 'Subject-10054', 'Subject-11026', 'Subject-11053', 'Subject-11059', 'Subject-12038', 'Subject-12061', 'Subject-12082', 'Subject-13024', 'Subject-13043', 'Subject-14007', 'Subject-14034', 'Subject-14053', 'Subject-14070', 'Subject-7058', 'Subject-1074', 'Subject-1087', 'Subject-2015', 'Subject-2046', 'Subject-3053', 'Subject-4015', 'Subject-4026', 'Subject-4068', 'Subject-5004', 'Subject-5052', 'Subject-5054', 'Subject-6020', 'Subject-6047', 'Subject-6059', 'Subject-7100', 'Subject-7019', 'Subject-7046', 'Subject-8016', 'Subject-8022', 'Subject-8041', 'Subject-8055', 'Subject-9013', 'Subject-9038', 'Subject-9043', 'Subject-9057', 'Subject-1009', 'Subject-10021', 'Subject-10046', 'Subject-10053', 'Subject-11036', 'Subject-12006', 'Subject-12040', 'Subject-12063', 'Subject-12081', 'Subject-13042', 'Subject-14051', 'Subject-7077', 'Subject-3002', 'Subject-1046', 'Subject-1076', 'Subject-1085', 'Subject-2020', 'Subject-2043', 'Subject-2047', 'Subject-3048', 'Subject-3052', 'Subject-4003', 'Subject-4049', 'Subject-4055', 'Subject-5025', 'Subject-5027', 'Subject-5060', 'Subject-6022', 'Subject-6051', 'Subject-6066', 'Subject-7103', 'Subject-7017', 'Subject-7044', 'Subject-7079', 'Subject-8054', 'Subject-9015', 'Subject-9036', 'Subject-9061', 'Subject-10014', 'Subject-10052', 'Subject-11043', 'Subject-11066', 'Subject-12032', 'Subject-12057', 'Subject-12083', 'Subject-13018', 'Subject-14017', 'Subject-14033', 'Subject-14055', 'Subject-14082', 'Subject-7055', 'Subject-3003', 'Subject-1019', 'Subject-2011', 'Subject-2029', 'Subject-2062', 'Subject-3065', 'Subject-4001', 'Subject-4042', 'Subject-4057', 'Subject-5002', 'Subject-5029', 'Subject-5067', 'Subject-6024', 'Subject-6026', 'Subject-7001', 'Subject-7023', 'Subject-7033', 'Subject-7042', 
'Subject-7099', 'Subject-8018', 'Subject-8037', 'Subject-8038', 'Subject-8053', 'Subject-9011', 'Subject-9040', 'Subject-9067', 'Subject-1030', 'Subject-10001', 'Subject-10034', 'Subject-10056', 'Subject-11024', 'Subject-11039', 'Subject-12015', 'Subject-12030', 'Subject-12068', 'Subject-13016', 'Subject-14032', 'Subject-14057', 'Subject-14059', 'Subject-14076', 'Subject-7056', 'Subject-1040', 'Subject-1072', 'Subject-1089', 'Subject-2019', 'Subject-2049', 'Subject-3032', 'Subject-3022', 'Subject-3066', 'Subject-4005', 'Subject-4047', 'Subject-4059', 'Subject-5008', 'Subject-5050', 'Subject-6002', 'Subject-6034', 'Subject-6055', 'Subject-7038', 'Subject-7078', 'Subject-8008', 'Subject-8039', 'Subject-8052', 'Subject-9023', 'Subject-9033', 'Subject-9044', 'Subject-9069', 'Subject-9080', 'Subject-1031', 'Subject-10003', 'Subject-10033', 'Subject-10035', 'Subject-10051', 'Subject-11041', 'Subject-12013', 'Subject-12017', 'Subject-12055', 'Subject-12067', 'Subject-13028', 'Subject-14022', 'Subject-14061', 'Subject-14078', 'Subject-6030', 'Subject-7059', 'Subject-3005', 'Subject-1051', 'Subject-1064', 'Subject-1082', 'Subject-2016', 'Subject-2034', 'Subject-2038', 'Subject-2063', 'Subject-3031', 'Subject-4022', 'Subject-4028', 'Subject-4043', 'Subject-4063', 'Subject-5023', 'Subject-5033', 'Subject-5066', 'Subject-6018', 'Subject-6032', 'Subject-7050', 'Subject-7097', 'Subject-8020', 'Subject-8051', 'Subject-9042', 'Subject-9063', 'Subject-9009', 'Subject-1032', 'Subject-10030', 'Subject-10005', 'Subject-10050', 'Subject-10057', 'Subject-11003', 'Subject-11047', 'Subject-11064', 'Subject-12050', 'Subject-12080', 'Subject-13012', 'Subject-14028', 'Subject-14047', 'Subject-7071', 'Subject-1049', 'Subject-1068', 'Subject-1070', 'Subject-2005', 'Subject-2048', 'Subject-3030', 'Subject-4009', 'Subject-4018', 'Subject-4040', 'Subject-4065', 'Subject-5006', 'Subject-5031', 'Subject-5064', 'Subject-6016', 'Subject-6053', 'Subject-7016', 'Subject-7036', 'Subject-7098', 
'Subject-8006', 'Subject-8045', 'Subject-8050', 'Subject-9025', 'Subject-9046', 'Subject-9065', 'Subject-9082', 'Subject-1010', 'Subject-10031', 'Subject-10007', 'Subject-11018', 'Subject-11037', 'Subject-11062', 'Subject-12019', 'Subject-12053', 'Subject-13001', 'Subject-13010', 'Subject-14049', 'Subject-7072', 'Subject-3021', 'Subject-1056', 'Subject-1079', 'Subject-2013', 'Subject-2045', 'Subject-3036', 'Subject-3012', 'Subject-4024', 'Subject-4045', 'Subject-5010', 'Subject-5048', 'Subject-5062', 'Subject-6011', 'Subject-6035', 'Subject-7007', 'Subject-7052', 'Subject-8003', 'Subject-8024', 'Subject-9021', 'Subject-9028', 'Subject-9058', 'Subject-9068', 'Subject-10023', 'Subject-10025', 'Subject-10032', 'Subject-11020', 'Subject-11045', 'Subject-12011', 'Subject-12036', 'Subject-12049', 'Subject-13006', 'Subject-14024', 'Subject-14040', 'Subject-14084', 'Subject-7070', 'Subject-3006', 'Subject-1058', 'Subject-2003', 'Subject-2022', 'Subject-2036', 'Subject-2060', 'Subject-3035', 'Subject-3018', 'Subject-4007', 'Subject-4061', 'Subject-5019', 'Subject-5046', 'Subject-6014', 'Subject-6037', 'Subject-7009', 'Subject-7029', 'Subject-7096', 'Subject-8001', 'Subject-8029', 'Subject-9049', 'Subject-9005', 'Subject-9052', 'Subject-1037', 'Subject-10009', 'Subject-11016', 'Subject-11049', 'Subject-12001', 'Subject-12026', 'Subject-12045', 'Subject-13014', 'Subject-13026', 'Subject-14018', 'Subject-14045', 'Subject-14063', 'Subject-7053', 'Subject-1052', 'Subject-2001', 'Subject-2017', 'Subject-2028', 'Subject-2061', 'Subject-3034', 'Subject-3027', 'Subject-4021', 'Subject-4039', 'Subject-5020', 'Subject-5041', 'Subject-5059', 'Subject-6039', 'Subject-6007', 'Subject-7005', 'Subject-7035', 'Subject-7093', 'Subject-8004', 'Subject-8030', 'Subject-9048', 'Subject-9071', 'Subject-1001', 'Subject-10011', 'Subject-10036', 'Subject-11014', 'Subject-11052', 'Subject-12003', 'Subject-12022', 'Subject-13031', 'Subject-14037', 'Subject-14042', 'Subject-14068', 'Subject-7075', 
'Subject-3007', 'Subject-1054', 'Subject-1084', 'Subject-2044', 'Subject-3033', 'Subject-4014', 'Subject-4035', 'Subject-5013', 'Subject-5038', 'Subject-5057', 'Subject-6042', 'Subject-6005', 'Subject-7003', 'Subject-7031', 'Subject-7094', 'Subject-8011', 'Subject-8027', 'Subject-9016', 'Subject-9050', 'Subject-9054', 'Subject-1016', 'Subject-10013', 'Subject-10039', 'Subject-10044', 'Subject-11012', 'Subject-11058', 'Subject-11061', 'Subject-12020', 'Subject-12047', 'Subject-13033', 'Subject-14014', 'Subject-14043', 'Subject-14066', 'Subject-7076', 'Subject-3008', 'Subject-1061', 'Subject-1080', 'Subject-4012', 'Subject-4050', 'Subject-4066', 'Subject-5017', 'Subject-5043', 'Subject-6029', 'Subject-6061', 'Subject-7106', 'Subject-7028', 'Subject-7091', 'Subject-8036', 'Subject-9031', 'Subject-9075', 'Subject-9079', 'Subject-10017', 'Subject-10038', 'Subject-11010', 'Subject-11056', 'Subject-12007', 'Subject-12024', 'Subject-12062', 'Subject-13008', 'Subject-14035', 'Subject-14039', 'Subject-14075', 'Subject-7054', 'Subject-3023', 'Subject-1075', 'Subject-2009', 'Subject-2024', 'Subject-3038', 'Subject-4031', 'Subject-4052', 'Subject-5022', 'Subject-5034', 'Subject-6023', 'Subject-7047', 'Subject-7092', 'Subject-8034', 'Subject-9001', 'Subject-9056', 'Subject-1002', 'Subject-10015', 'Subject-11031', 'Subject-11035', 'Subject-12005', 'Subject-12041', 'Subject-12060', 'Subject-13004', 'Subject-14052', 'Subject-14073', 'Subject-7073', 'Subject-3020', 'Subject-1077', 'Subject-2007', 'Subject-2026', 'Subject-3037', 'Subject-4016', 'Subject-4033', 'Subject-4054', 'Subject-5015', 'Subject-5036', 'Subject-6021', 'Subject-6027', 'Subject-6058', 'Subject-7104', 'Subject-7049', 'Subject-7090', 'Subject-8032', 'Subject-9003', 'Subject-9039', 'Subject-9073', 'Subject-9077', 'Subject-10019', 'Subject-11006', 'Subject-11054', 'Subject-12009', 'Subject-12028', 'Subject-12043', 'Subject-13002', 'Subject-14016', 'Subject-14050', 'Subject-14071', 'Subject-7074', 'Subject-3009', 
'Subject-1003', 'Subject-11004', 'Subject-13023', 'Subject-1018', 'Subject-11027', 'Subject-13021', 'Subject-1033', 'Subject-11008', 'Subject-13019', 'Subject-1011', 'Subject-11033', 'Subject-13027', 'Subject-1004', 'Subject-11029', 'Subject-13025', 'Subject-1012', 'Subject-11025', 'Subject-13017', 'Subject-1005', 'Subject-11002', 'Subject-13029', 'Subject-1034', 'Subject-11023', 'Subject-13030', 'Subject-1035', 'Subject-1006', 'Subject-1013', 'Subject-1014', 'Subject-1007', 'Subject-1036', 'Geographical Location', 'Geographical zone in India', 'Gender', 'AGE  in years', 'Life style pattern', 'BMI', 'Obese-Non Obese']
Numerical targets: ['AGE  in years', 'BMI']
Dropping columns: ['AGE  in years', 'BMI', 'Subject_ID']
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-09b305a7c661> in <cell line: 0>()
     87 # Scale data
     88 scaler = StandardScaler()
---> 89 X_scaled = scaler.fit_transform(otu_data)
     90 X_scaled_df = pd.DataFrame(X_scaled, index=otu_data.index, columns=otu_data.columns)
     91 print("Scaling completed successfully.")

/usr/local/lib/python3.11/dist-packages/sklearn/utils/_set_output.py in wrapped(self, X, *args, **kwargs)
    317     @wraps(f)
    318     def wrapped(self, X, *args, **kwargs):
--> 319         data_to_wrap = f(self, X, *args, **kwargs)
    320         if isinstance(data_to_wrap, tuple):
    321             # only wrap the first output for cross decomposition

/usr/local/lib/python3.11/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    916         if y is None:
    917             # fit method of arity 1 (unsupervised transformation)
--> 918             return self.fit(X, **fit_params).transform(X)
    919         else:
    920             # fit method of arity 2 (supervised transformation)

/usr/local/lib/python3.11/dist-packages/sklearn/preprocessing/_data.py in fit(self, X, y, sample_weight)
    892         # Reset internal state before fitting
    893         self._reset()
--> 894         return self.partial_fit(X, y, sample_weight)
    895 
    896     @_fit_context(prefer_skip_nested_validation=True)

/usr/local/lib/python3.11/dist-packages/sklearn/base.py in wrapper(estimator, *args, **kwargs)
   1387                 )
   1388             ):
-> 1389                 return fit_method(estimator, *args, **kwargs)
   1390 
   1391         return wrapper

/usr/local/lib/python3.11/dist-packages/sklearn/preprocessing/_data.py in partial_fit(self, X, y, sample_weight)
    928         """
    929         first_call = not hasattr(self, "n_samples_seen_")
--> 930         X = validate_data(
    931             self,
    932             X,

/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py in validate_data(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
   2942             out = X, y
   2943     elif not no_val_X and no_val_y:
-> 2944         out = check_array(X, input_name="X", **check_params)
   2945     elif no_val_X and not no_val_y:
   2946         out = _check_y(y, **check_params)

/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1128         n_samples = _num_samples(array)
   1129         if n_samples < ensure_min_samples:
-> 1130             raise ValueError(
   1131                 "Found array with %d sample(s) (shape=%s) while a"
   1132                 " minimum of %d is required%s."

ValueError: Found array with 0 sample(s) (shape=(0, 1009)) while a minimum of 1 is required by StandardScaler.

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Load the latest OTU file (one row per OTU, one column per subject).
otu_filepath = "/mnt/data/pan_otutab.csv"
otu_df = pd.read_csv(otu_filepath)

# NOTE(review): meta_df is not created in this cell; it is presumably the
# metadata table loaded elsewhere in the notebook. Confirm before running.
meta_indexed = meta_df.set_index("Subject_ID")
# A repeated Subject_ID would make .loc return extra rows and misalign X and y.
meta_indexed = meta_indexed[~meta_indexed.index.duplicated(keep="first")]

# Sort the shared IDs: set iteration order is not stable between runs, and
# the row order determines which samples land in which CV fold.
common_subjects = sorted(set(meta_indexed.index) & set(otu_df.columns[1:]))
if not common_subjects:
    raise ValueError("No subjects are shared between the metadata and the OTU table.")

# Subjects become rows, OTUs become columns; metadata follows the same order.
otu_filtered = otu_df[["OTU_ID"] + common_subjects].set_index("OTU_ID").T
meta_filtered = meta_indexed.loc[common_subjects]

# Extract BMI as target variable and convert to numeric
y = pd.to_numeric(meta_filtered["BMI"], errors="coerce")

# Values that could not be parsed became NaN; the estimator rejects NaN
# targets, so drop those subjects from X, y and the metadata together.
has_bmi = y.notna().to_numpy()
if not has_bmi.all():
    print(f"Dropping {(~has_bmi).sum()} subject(s) with missing or non-numeric BMI.")
    otu_filtered = otu_filtered.loc[has_bmi]
    meta_filtered = meta_filtered.loc[has_bmi]
    y = y[has_bmi]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(otu_filtered.fillna(0))

# Cross Validation (the default alpha grid has 100 values)
lasso_cv = LassoCV(cv=5, random_state=42)
lasso_cv.fit(X_scaled, y)

# Extract coefficients and show the 10 largest (most positive) ones
lasso_coeffs = pd.Series(lasso_cv.coef_, index=otu_filtered.columns)
lasso_coeffs.sort_values(ascending=False).head(10)

     

from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Lasso

# Drop OTU columns whose variance falls below the threshold.
# NOTE(review): X_scaled is the StandardScaler output, so non-constant columns
# presumably all have unit variance and this filter only removes constant
# OTUs. Confirm whether filtering on the unscaled abundances was intended.
selector = VarianceThreshold(threshold=0.001).fit(X_scaled)
X_reduced = selector.transform(X_scaled)

# Fixed-alpha LASSO on the surviving columns, with a raised iteration limit.
lasso = Lasso(alpha=0.01, max_iter=5000, random_state=42).fit(X_reduced, y)

# Label each coefficient with the OTU column it belongs to.
kept_mask = selector.get_support()
selected_otus = otu_filtered.columns[kept_mask]
lasso_coeffs = pd.Series(lasso.coef_, index=selected_otus)

# Display the 10 largest non-zero coefficients.
nonzero_coeffs = lasso_coeffs[lasso_coeffs != 0]
nonzero_coeffs.sort_values(ascending=False).head(10)

     

import matplotlib.pyplot as plt
import numpy as np

# Refit a LASSO at each penalty strength and collect the coefficient vectors;
# after the transpose each row is the path of one feature across all alphas.
alphas = np.logspace(-4, 0, 100)
lasso_paths = np.array(
    [Lasso(alpha=a, max_iter=5000).fit(X_reduced, y).coef_ for a in alphas]
).T

# Plotting the regularization path
plt.figure(figsize=(10, 6))
for i, coef_path in enumerate(lasso_paths):
    # Only the first five curves get a legend entry to keep the legend readable.
    plt.plot(alphas, coef_path, label=f'OTU {i}' if i < 5 else "", alpha=0.6)

plt.xscale("log")
plt.xlabel("Alpha (L1 penalty)")
plt.ylabel("Coefficient Value")
plt.title("LASSO Regularization Path")

# A plain Lasso only has the constructor parameter `alpha`; the fitted
# attribute `alpha_` exists on the cross-validated estimator. Support both so
# the marker works whichever kind of model `lasso` currently is.
selected_alpha = lasso.alpha_ if hasattr(lasso, "alpha_") else lasso.alpha
plt.axvline(selected_alpha, color='red', linestyle="--", label="Selected Alpha")
plt.legend()
plt.show()

     


# Rank OTUs by the variance of their unscaled abundances. X_scaled is
# standardized, so ranking its column variances would not distinguish OTUs.
# otu_filtered is the frame X_scaled was computed from, so column positions
# are expected to line up; verify that before indexing.
raw_abundance = otu_filtered.fillna(0).to_numpy(dtype=float)
if raw_abundance.shape[1] != X_scaled.shape[1]:
    raise ValueError("otu_filtered and X_scaled have different numbers of OTU columns.")
variance = raw_abundance.var(axis=0)
top_100_indices = np.argsort(variance)[-100:]
X_top100 = X_scaled[:, top_100_indices]

# Alpha values
alphas_subset = np.logspace(-3, -1, 20)
lasso_paths = np.array(
    [Lasso(alpha=a, max_iter=5000).fit(X_top100, y).coef_ for a in alphas_subset]
).T

# A plain Lasso exposes `alpha`; only the cross-validated estimator has
# `alpha_`. Support both so the marker works for either kind of model.
selected_alpha = lasso.alpha_ if hasattr(lasso, "alpha_") else lasso.alpha

# Plot the regularization path
plt.figure(figsize=(10, 6))
for i, coef_path in enumerate(lasso_paths):
    # Only the first five curves get a legend entry to keep the legend readable.
    plt.plot(alphas_subset, coef_path, label=f'OTU {i}' if i < 5 else "", alpha=0.6)

plt.xscale("log")
plt.xlabel("Alpha (L1 penalty)")
plt.ylabel("Coefficient Value")
plt.title("LASSO Regularization Path (Top 100 OTUs)")
plt.axvline(selected_alpha, color='red', linestyle="--", label="Selected Alpha")
plt.legend()
plt.show()

# Fit the model whose predictions are plotted below. Predictions and R² are
# computed on the same samples used for fitting, so they describe in-sample
# fit, not generalisation.
lasso_top100 = Lasso(alpha=selected_alpha, max_iter=5000).fit(X_top100, y)
y_pred_top100 = lasso_top100.predict(X_top100)
r2_top100 = lasso_top100.score(X_top100, y)

plt.figure(figsize=(8, 6))
plt.scatter(y, y_pred_top100, alpha=0.6, label="Predicted vs. Actual", color="blue")

plt.xlabel("Actual BMI")
plt.ylabel("Predicted BMI")
plt.title(f"LASSO Regression (Top 100 OTUs): Actual vs Predicted BMI (R² = {r2_top100:.2f})")
plt.legend()
plt.show()

     