In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import joblib

In [None]:
# Load the tab-separated variant summary and keep only rows that carry a
# clinical-significance annotation (the labelling step below needs it).
# NOTE(review): if this is ClinVar's variant_summary.txt, each variant may appear
# once per genome assembly; duplicates would leak between train and test — confirm.
df = pd.read_csv(
    "/kaggle/input/genomic-data/variant_summary.txt",
    sep='\t',
    low_memory=False,
)
has_significance = df['ClinicalSignificance'].notna()
df = df.loc[has_significance].copy()

In [None]:
def classify_pathogenicity(value):
    """Map a clinical-significance string to a binary pathogenicity label.

    Args:
        value: Free-text clinical significance (e.g. "Likely pathogenic").

    Returns:
        1 if the text calls the variant pathogenic and never benign,
        0 if it calls the variant benign and never pathogenic,
        None for everything else (mixed, conflicting, uncertain, or
        non-string input such as NaN/None).
    """
    if not isinstance(value, str):
        return None

    # The noun "pathogenicity" (as in "Conflicting interpretations of
    # pathogenicity") contains the substring "pathogenic" but is not a
    # pathogenic call, so remove it before the substring checks.
    text = value.lower().replace("pathogenicity", "")
    says_pathogenic = "pathogenic" in text
    says_benign = "benign" in text

    if says_pathogenic and not says_benign:
        return 1
    if says_benign and not says_pathogenic:
        return 0
    return None

# Label every row, then keep only the unambiguous benign (0) / pathogenic (1) calls.
df['label'] = df['ClinicalSignificance'].apply(classify_pathogenicity)

# .copy() gives an independent frame so later column assignments do not trigger
# SettingWithCopyWarning on a filtered selection.
df = df[df['label'].isin([0, 1])].copy()

# The None placeholders for unlabelled rows keep the column from being an integer
# dtype; once those rows are gone, store clean integer class labels.
df['label'] = df['label'].astype(int)

In [None]:
def _text_length(values):
    """Return the length of each entry's string form, counting missing entries as 0."""
    lengths = values.astype(str).str.len()
    # astype(str) turns NaN into the literal "nan"; mask those back to 0 so a
    # missing value is never measured as a three-character string.
    return lengths.where(values.notna(), 0).astype(int)


# Length-based features. Missing gene symbols and alleles are all treated the
# same way (length 0).
# NOTE(review): placeholder strings such as "na" or "-" are not parsed as missing
# by pandas and would be measured as ordinary text — confirm against the data.
df['gene_name_length'] = _text_length(df['GeneSymbol'])
df['ref_len'] = _text_length(df['ReferenceAllele'])
df['alt_len'] = _text_length(df['AlternateAllele'])

# Positive when the alternate allele is longer than the reference, negative when shorter.
df['allele_length_diff'] = df['alt_len'] - df['ref_len']

# One-hot encode the variant type. This replaces the 'Type' column, so the cell
# cannot be re-run without first re-running the cells that build df.
df = pd.get_dummies(df, columns=['Type'], prefix='variant_type')

In [None]:
# Model inputs: the two length features plus every one-hot variant-type column.
feature_cols = ['gene_name_length', 'allele_length_diff'] + [col for col in df.columns if col.startswith('variant_type_')]
X = df[feature_cols]
y = df['label']

# Stratify on the label so the train and test splits keep the same class ratio;
# an unstratified split can skew the minority-class share of the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 'balanced' weights are inversely proportional to class frequency in y_train.
# Build the mapping from the classes actually passed in, rather than assuming
# that positions 0 and 1 correspond to labels 0 and 1.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes.tolist(), weights))

model = RandomForestClassifier(n_estimators=100, class_weight=class_weights, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Serialise the fitted forest into the working directory so it can be reloaded
# later with joblib.load.
joblib.dump(value=model, filename="rf_clinvar_model.pkl")

In [None]:
# Score the held-out split: print the confusion matrix first, then the
# per-class precision / recall / F1 report.
y_pred = model.predict(X_test)
for summary in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(summary)

In [None]:
# Bar chart of the forest's feature importances, most important first.
importances = model.feature_importances_
features = pd.Series(data=importances, index=feature_cols)

ranked = features.sort_values(ascending=False)
ax = ranked.plot.bar(title="Feature Importances", figsize=(12, 5))
ax.set_ylabel("Importance")
plt.tight_layout()
plt.show()

In [None]:
import shap

# Explain only the first 500 held-out rows (positional slice).
X_explain = X_test.iloc[:500]

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_explain)

# Select the SHAP values for the pathogenic class (label 1) by looking up its
# position in model.classes_ instead of assuming it is index 1.
positive_idx = list(model.classes_).index(1)

# The return format depends on the shap version: older releases give a list with
# one (n_samples, n_features) array per class, newer ones give a single
# (n_samples, n_features, n_classes) array. Plain [1] indexing on the latter
# would pick sample 1 instead of class 1.
if isinstance(shap_values, list):
    positive_shap = shap_values[positive_idx]
else:
    stacked = np.asarray(shap_values)
    positive_shap = stacked[..., positive_idx] if stacked.ndim == 3 else stacked

shap.summary_plot(positive_shap, X_explain, plot_type="dot")