In [None]:
import pandas as pd
import psycopg2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the preprocessed data from RDS (PostgreSQL)
def load_from_rds():
    """Read the whole ``preprocessed_data`` table into a DataFrame.

    The connection parameters below are placeholders and must be replaced
    with the values of the target RDS instance.

    Returns:
        The query result as a ``pandas.DataFrame``, or ``None`` when the
        connection or the query fails (the error is printed, not raised).
    """
    conn = None
    try:
        conn = psycopg2.connect(
            host="your-rds-endpoint",  # Replace with your RDS endpoint
            user="admin",  # Replace with your RDS user
            password="password",  # Replace with your RDS password
            dbname="your-database",  # Replace with your RDS database
            port="5432"  # Default PostgreSQL port
        )
        query = "SELECT * FROM preprocessed_data"  # Replace with your preprocessed-data table
        return pd.read_sql(query, conn)
    except Exception as e:
        print(f"Error al cargar los datos desde RDS: {e}")
        # Best-effort contract: callers must check for None.
        return None
    finally:
        # Close the connection on every path, including a failed query,
        # so a read error does not leak the connection.
        if conn is not None:
            conn.close()

# Load the preprocessed data from RDS
data_filtered = load_from_rds()

# load_from_rds() prints the error and returns None on failure; stop here with
# a clear message instead of failing below with an opaque TypeError.
if data_filtered is None:
    raise RuntimeError("No se pudieron cargar los datos preprocesados desde RDS.")

# Encode the target 'clinical_significance' as integers. `le` stays fitted on
# the target so predictions can be decoded with le.inverse_transform(...).
le = LabelEncoder()
data_filtered['clinical_significance'] = le.fit_transform(data_filtered['clinical_significance'])

# Encode the remaining text columns, one dedicated encoder per column so that
# each mapping is kept (re-fitting a single shared encoder discards the
# previous mapping on every call).
categorical_columns = ['consequence_type', 'alleles', 'feature_type']
feature_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    data_filtered[col] = encoder.fit_transform(data_filtered[col])
    feature_encoders[col] = encoder

# Split into independent variables (X) and the dependent variable (y)
X = data_filtered.drop(columns=['clinical_significance'])
y = data_filtered['clinical_significance']

# Split the dataset into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Exactitud del modelo: {accuracy:.4f}")
print("Reporte de Clasificación:")
print(classification_report(y_test, y_pred))

# Save the prediction results to RDS (PostgreSQL)
def save_predictions_to_rds(X_test, predictions):
    """Insert one row per prediction into the ``predictions`` table.

    Each row pairs the index label of a test sample with its prediction.
    Errors are printed rather than raised, and nothing is committed unless
    every row was inserted.

    Args:
        X_test: Object exposing an ``index`` aligned with ``predictions``
            (the test-set DataFrame); its index labels are stored as
            ``variation_id``.
        predictions: Iterable of predicted values, in the same order as
            ``X_test.index``.
    """
    def _to_native(value):
        # NumPy scalars (e.g. numpy.int64) cannot be adapted by psycopg2;
        # .item() converts them to the equivalent built-in Python value.
        return value.item() if hasattr(value, "item") else value

    conn = None
    try:
        rows = [
            (_to_native(row_id), _to_native(pred))
            for row_id, pred in zip(X_test.index, predictions)
        ]
        conn = psycopg2.connect(
            host="your-rds-endpoint",  # Replace with your RDS endpoint
            user="admin",  # Replace with your RDS user
            password="password",  # Replace with your RDS password
            dbname="your-database",  # Replace with your RDS database
            port="5432"  # Default PostgreSQL port
        )
        # The cursor context manager closes the cursor even if an insert fails.
        with conn.cursor() as cursor:
            cursor.executemany(
                "INSERT INTO predictions (variation_id, prediction) VALUES (%s, %s)",
                rows,
            )
        conn.commit()
        print("Predicciones guardadas exitosamente en RDS.")
    except Exception as e:
        print(f"Error al guardar las predicciones en RDS: {e}")
    finally:
        # Closing without a commit discards the pending transaction, so a
        # failed run leaves no partial inserts and no open connection.
        if conn is not None:
            conn.close()

# Persist the test-set predictions (encoded class labels) to the RDS `predictions` table
save_predictions_to_rds(X_test, y_pred)



In [None]:
-- Input table for the modeling script, which reads it with
-- SELECT * FROM preprocessed_data. Every column except clinical_significance
-- is used as a model feature.
CREATE TABLE preprocessed_data (
    -- Text feature; label-encoded by the modeling script.
    consequence_type VARCHAR(255),
    -- Classification target; label-encoded by the modeling script.
    clinical_significance VARCHAR(255),
    -- Text feature; label-encoded by the modeling script.
    feature_type VARCHAR(255),
    -- Numeric features, passed to the model unchanged.
    start INT,
    -- Quoted because END is a reserved SQL keyword.
    "end" INT,
    strand INT,
    -- Text feature; label-encoded by the modeling script.
    alleles VARCHAR(1000)
);

-- Output table written by save_predictions_to_rds(): one row per test sample.
CREATE TABLE predictions (
    -- Receives the DataFrame index label of the test row (X_test.index);
    -- NOTE(review): preprocessed_data has no id column, so this value is the
    -- row position in the loaded DataFrame, not a stored key. Confirm intent.
    variation_id INT,
    -- Label-encoded clinical_significance class predicted by the model.
    prediction INT
);


In [None]:
# Dockerfile for preprocessing data and saving to S3

# Base image: the official slim Python 3.9 runtime
FROM python:3.9-slim

# Set the working directory in the container; later relative paths resolve against it
WORKDIR /usr/src/app

# Copy the whole build context into /usr/src/app.
# The script run by CMD (preprocessing_to_s3.py) is expected to be part of it.
COPY . .

# Install the Python dependencies (pandas, boto3) without keeping pip's download cache
RUN pip install --no-cache-dir pandas boto3

# Default command: run the preprocessing script from the working directory
CMD ["python", "./preprocessing_to_s3.py"]


In [None]:
# Dockerfile for loading data from S3, training a model, and saving predictions to RDS

# Base image: the official slim Python 3.9 runtime
FROM python:3.9-slim

# Set the working directory in the container; later relative paths resolve against it
WORKDIR /usr/src/app

# Copy the whole build context into /usr/src/app.
# The script run by CMD (modeling_from_s3_to_rds.py) is expected to be part of it.
COPY . .

# Install the Python dependencies without keeping pip's download cache.
# psycopg2-binary is required because the modeling code connects to the
# PostgreSQL RDS instance through psycopg2; mysql-connector-python is kept
# so anything that still relies on it keeps working.
RUN pip install --no-cache-dir pandas scikit-learn mysql-connector-python psycopg2-binary boto3

# Default command: run the modeling script from the working directory
CMD ["python", "./modeling_from_s3_to_rds.py"]
