### Importando as bibliotecas

In [None]:
import os

import pandas as pd
import pymysql
from pyspark.sql import SparkSession
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




### Configurações do banco de dados MySQL

In [None]:
# MySQL settings are read from environment variables so that no credentials
# are written into the notebook itself.
mysql_host = os.getenv('MYSQL_HOST')
# MYSQL_PORT may be unset or empty; int(None) raises TypeError and int('')
# raises ValueError, so fall back to MySQL's standard port in both cases.
mysql_port = int(os.getenv('MYSQL_PORT') or 3306)
mysql_user = os.getenv('MYSQL_USER')
mysql_password = os.getenv('MYSQL_PASSWORD')
mysql_db = os.getenv('MYSQL_DB')
# NOTE(review): read here but not referenced by the code shown in this
# notebook -- the query below hard-codes its table name. Confirm whether
# this variable is still needed.
mysql_table = os.getenv('MYSQL_TABLE')

# Open the MySQL connection that the following cells use to run queries.
connection = pymysql.connect(
    host=mysql_host,
    port=mysql_port,
    user=mysql_user,
    password=mysql_password,
    database=mysql_db
)

### Machine Learning

In [None]:

# Load the raw feature columns and the label from MySQL.
#
# No min-max scaling is done in SQL: the StandardScaler below standardizes
# every feature anyway, and a z-score is unchanged by a prior positive affine
# rescale, so an SQL-side normalization would not alter the model inputs.
# It would, however, produce NULLs (NaN in pandas) for any column whose
# MAX equals its MIN, because MySQL returns NULL on division by zero.
#
# NOTE(review): hemoglobin_level is used as a feature. If anaemic_status was
# derived from the hemoglobin level, this leaks the label -- confirm.
query = """
SELECT
    red_pixel_percentage,
    green_pixel_percentage,
    blue_pixel_percentage,
    hemoglobin_level,
    sex_id,
    anaemic_status
FROM fact_anaemia;
"""

# Read the query result into a pandas DataFrame.
df = pd.read_sql(query, connection)

# Separate the feature columns from the label column.
X = df.drop('anaemic_status', axis=1)
y = df['anaemic_status']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test-set statistics reach training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression classifier with scikit-learn's defaults.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = model.predict(X_test)

# Evaluate the predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.9375
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        25
           1       1.00      0.71      0.83         7

    accuracy                           0.94        32
   macro avg       0.96      0.86      0.90        32
weighted avg       0.94      0.94      0.93        32



