In [1]:
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine, MetaData, Table
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# from airflow import DAG
# from airflow.operators.python_operator import PythonOperator
# from airflow.operators.dummy_operator import DummyOperator

In [2]:
# Default arguments for the DAG: owner, notification settings, retry policy
# and the first scheduled date.
default_args = dict(
    owner='Oscar C',
    depends_on_past=False,
    email_on_failure=False,
    email=['oecorrechag@gmail.com'],
    retries=1,
    start_date=datetime(2024, 5, 20),
    retry_delay=timedelta(minutes=1),
)

In [None]:
def drop_table(table_name):
    """Drop the table named ``table_name`` from the ``db`` MySQL database.

    The connection URL targets host ``mysql`` on port 3306 (the docker
    setup). The drop is issued without an explicit ``checkfirst`` argument,
    so presumably the database reports an error when the table does not
    exist -- confirm before relying on it.

    Args:
        table_name: Name of the table to drop.
    """
    # MySQL connection (inside docker)
    docker_engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')
    # Only the table name is declared; that is enough to emit the DROP.
    Table(table_name, MetaData()).drop(docker_engine)
    # Alternative way to drop it:
    # MetaData.drop_all(engine, tables=[table])

# Example resets; the tables written by this notebook are 'raw_data' and 'clean_data'.
# drop_table('raw_data')
# drop_table('clean_data')

In [6]:
def input_data(engine=None):
    """Return the first five rows of the ``raw_data`` table.

    The ``raw_data`` table must already exist; in this notebook it is
    written by ``raw_data()``.

    Args:
        engine: Optional connection handed to ``pandas.read_sql``. When
            omitted, a SQLAlchemy engine for the local MySQL instance
            (127.0.0.1:3306, database ``db``) is created, as before.
            Pass an engine built with the ``mysql`` host when running
            inside the docker network instead of editing this function.

    Returns:
        pandas.DataFrame: ``head()`` of the full table contents.
    """
    if engine is None:
        # Default: MySQL exposed on localhost.
        engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    # Load the whole table, then keep only the preview rows.
    raw_df = pd.read_sql("SELECT * FROM raw_data", con=engine)
    return raw_df.head()

# Preview the first rows currently stored in the raw_data table.
# NOTE(review): raw_data(), which writes that table, is defined in a later cell;
# on an empty database this call presumably fails until that cell has run.
input_data()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,54239.0,sold,275000.0,1.0,1.0,,1617038.0,Miami,Florida,33156.0,846.0,2022-02-28
1,90564.0,sold,399900.0,1.0,1.0,,1497499.0,San Diego,California,92108.0,667.0,2022-04-28
2,53271.0,for_sale,75000.0,,,2.25,1877529.0,Oceola Township,Michigan,48855.0,,
3,12926.0,sold,325000.0,3.0,2.0,0.09,892999.0,Worcester,Massachusetts,1603.0,1409.0,2021-11-29
4,79221.0,for_sale,169900.0,,,3.7,1998116.0,Holmen,Wisconsin,54636.0,,


In [5]:
def raw_data(engine=None, csv_path='data/realtor-data.csv'):
    """Load a 10% sample of the realtor CSV into the ``raw_data`` table.

    The CSV header row is replaced by the twelve column names used
    throughout the notebook. Rows are appended to ``raw_data``; because the
    sample uses a fixed ``random_state``, calling this again on the same
    file appends the same rows a second time.

    Args:
        engine: Optional connection handed to ``DataFrame.to_sql``. When
            omitted, a SQLAlchemy engine for the local MySQL instance
            (127.0.0.1:3306, database ``db``) is created, as before.
        csv_path: Path of the CSV file to read. Defaults to the original
            ``data/realtor-data.csv``, relative to the working directory.

    Returns:
        pandas.DataFrame: ``head()`` of the sampled rows (original CSV row
        positions are kept as the index).

    Raises:
        ValueError: If the CSV does not have exactly twelve columns
            (raised by pandas when the column names are assigned).
    """
    if engine is None:
        # Default: MySQL exposed on localhost.
        engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    # Load the data
    listings = pd.read_csv(csv_path, sep=',', decimal='.', header=0, encoding='utf-8')
    listings.columns = [
        'brokered_by', 'status', 'price', 'bed', 'bath', 'acre_lot',
        'street', 'city', 'state', 'zip_code', 'house_size', 'prev_sold_date',
    ]

    # Keep only 10% of the rows so the insert stays manageable.
    sample_df = listings.sample(frac=0.1, random_state=42)
    print(sample_df.shape)

    # Store the sample in the database (index is not written).
    sample_df.to_sql('raw_data', con=engine, if_exists='append', index=False)
    print("Datos raw_data guardados en MySQL")

    return sample_df.head()

# Load the 10% sample into the raw_data table and display its first rows.
raw_data()

(222638, 12)
Datos raw_data guardados en MySQL


Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
1696936,54239.0,sold,275000.0,1.0,1.0,,1617038.0,Miami,Florida,33156.0,846.0,2022-02-28
2092671,90564.0,sold,399900.0,1.0,1.0,,1497499.0,San Diego,California,92108.0,667.0,2022-04-28
742044,53271.0,for_sale,75000.0,,,2.25,1877529.0,Oceola Township,Michigan,48855.0,,
1424136,12926.0,sold,325000.0,3.0,2.0,0.09,892999.0,Worcester,Massachusetts,1603.0,1409.0,2021-11-29
812329,79221.0,for_sale,169900.0,,,3.7,1998116.0,Holmen,Wisconsin,54636.0,,


In [8]:
def clean_data(engine=None):
    """Build the cleaned numeric table from ``raw_data`` and append it to ``clean_data``.

    Reads every row of ``raw_data``, keeps only the numeric columns
    (price, bed, bath, acre_lot, street, house_size), drops rows with a
    missing value in any of them and appends the result to ``clean_data``.

    Args:
        engine: Optional connection handed to pandas for both the read and
            the write. When omitted, a SQLAlchemy engine for the local
            MySQL instance (127.0.0.1:3306, database ``db``) is created,
            as before.

    Returns:
        pandas.DataFrame: ``head()`` of the cleaned rows.
    """
    if engine is None:
        # Default: MySQL exposed on localhost.
        engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    # Read the raw rows from the database.
    raw_df = pd.read_sql("SELECT * FROM raw_data", con=engine)

    # As a first trial only the numeric variables are kept.
    numeric_cols = ['price', 'bed', 'bath', 'acre_lot', 'street', 'house_size']
    # Drop records with missing values in any selected column.
    clean_df = raw_df.loc[:, numeric_cols].dropna()
    # zip_code is intentionally not converted here: it is not one of the
    # selected columns, and referencing it after the selection raises KeyError.

    # Store the cleaned rows in the database (index is not written).
    clean_df.to_sql('clean_data', con=engine, if_exists='append', index=False)

    print("Datos limpios guardados en MySQL")

    return clean_df.head()

# Run the cleaning step and display the first rows of the result.
clean_data() 

(108064, 12)
Datos limpios guardados en MySQL


Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
3,12926.0,sold,325000.0,3.0,2.0,0.09,892999.0,Worcester,Massachusetts,1603.0,1409.0,2021-11-29
5,92677.0,sold,265000.0,2.0,1.0,0.12,819429.0,Merced,California,95340.0,901.0,2021-11-10
8,44366.0,for_sale,292000.0,4.0,3.0,0.4,1550810.0,Lincoln,Nebraska,68521.0,1949.0,2022-01-05
10,96476.0,sold,120000.0,2.0,2.0,9.24,1102814.0,Zephyrhills,Florida,33542.0,972.0,2022-02-16
11,53468.0,sold,205000.0,3.0,2.0,0.29,1037458.0,Temple,Texas,76502.0,1868.0,2022-01-14


In [None]:
def load_and_slip(engine=None):
    """Load ``clean_data`` and split it into train and test sets.

    ``price`` is the target; every other column of ``clean_data`` is a
    feature. The split holds out 20% of the rows with a fixed seed.

    Args:
        engine: Optional connection handed to ``pandas.read_sql``. When
            omitted, a SQLAlchemy engine for the local MySQL instance
            (127.0.0.1:3306, database ``db``) is created -- the same
            address the other loading functions in this notebook use.
            Pass an engine built with the ``mysql`` host when running
            inside the docker network.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    if engine is None:
        # Default: MySQL exposed on localhost, consistent with the other cells.
        engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    # Read the cleaned rows from the database.
    clean_df = pd.read_sql("SELECT * FROM clean_data", con=engine)

    # Separate the features (X) from the label (y).
    X = clean_df.drop(columns='price')
    y = clean_df['price']

    # Hold out 20% for testing; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Datos limpios cargados desde MySQL")

    return X_train, X_test, y_train, y_test

# Run the split; as the cell's last expression, the returned
# (X_train, X_test, y_train, y_test) tuple is what the notebook displays.
load_and_slip()

In [None]:
# Marker cell: prints 'ok_' (presumably to confirm the notebook ran to the end).
print('ok_')