In [3]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import papermill as pm

# Function to run rawdata.ipynb (data collection)
def run_rawdata_notebook():
    """Execute the data-collection notebook (rawdata.ipynb) with papermill.

    papermill runs every cell of the input notebook and saves the *executed
    notebook* (notebook JSON, including cell outputs) to the output path.
    That output is therefore written to a ``.ipynb`` file rather than a
    ``.csv`` file: pointing it at a CSV path would fill that file with
    notebook JSON and could overwrite a CSV of the same name.
    NOTE(review): presumably rawdata.ipynb itself writes rawdata.csv - confirm.

    Returns:
        None. The notebook object returned by papermill is deliberately not
        returned from this callable.

    Raises:
        Whatever ``pm.execute_notebook`` raises (e.g. when a cell fails);
        nothing is caught here, so the error propagates to the caller.
    """
    input_notebook = 'D:\\livedata\\rawdata.ipynb'  # Notebook to execute
    # Executed copy of the notebook (kept separate from the input and from any data files)
    output_notebook = 'D:\\livedata\\rawdata_executed.ipynb'
    pm.execute_notebook(input_notebook, output_notebook)

# Function to run processing.ipynb (data preprocessing)
def run_processing_notebook():
    """Execute the preprocessing notebook (processing.ipynb) with papermill.

    papermill runs every cell of the input notebook and saves the *executed
    notebook* (notebook JSON, including cell outputs) to the output path.
    That output is therefore written to a ``.ipynb`` file rather than a
    ``.csv`` file: pointing it at a CSV path would fill that file with
    notebook JSON and could overwrite a CSV of the same name.
    NOTE(review): presumably processing.ipynb itself writes processeddata.csv - confirm.

    Returns:
        None. The notebook object returned by papermill is deliberately not
        returned from this callable.

    Raises:
        Whatever ``pm.execute_notebook`` raises (e.g. when a cell fails);
        nothing is caught here, so the error propagates to the caller.
    """
    input_notebook = 'D:\\livedata\\processing.ipynb'  # Notebook to execute
    # Executed copy of the notebook (kept separate from the input and from any data files)
    output_notebook = 'D:\\livedata\\processing_executed.ipynb'
    pm.execute_notebook(input_notebook, output_notebook)

# DAG object that every task below is attached to
dag = DAG(
    dag_id='weather_pipeline',
    description='Automated pipeline for weather data collection and preprocessing',
    start_date=datetime(2024, 11, 1),      # first date the schedule applies from
    schedule_interval=timedelta(hours=1),  # one run per hour
    catchup=False,                         # skip runs missed in the past
)

# No-op marker task for the beginning of the pipeline
start_task = DummyOperator(dag=dag, task_id='start')

# Step 1 - data collection: calls run_rawdata_notebook
task1 = PythonOperator(
    dag=dag,
    task_id='run_rawdata_notebook',
    python_callable=run_rawdata_notebook,
)

# Step 2 - data preprocessing: calls run_processing_notebook
task2 = PythonOperator(
    dag=dag,
    task_id='run_processing_notebook',
    python_callable=run_processing_notebook,
)

# No-op marker task for the end of the pipeline
end_task = DummyOperator(dag=dag, task_id='end')

# Wire the tasks into one linear chain: start -> task1 -> task2 -> end
start_task >> task1
task1 >> task2      # Task 2 depends on Task 1
task2 >> end_task   # final expression still evaluates to end_task


<Task(EmptyOperator): end>

In [2]:
pip install papermill


Collecting papermill
  Downloading papermill-2.6.0-py3-none-any.whl.metadata (13 kB)
Collecting nbformat>=5.2.0 (from papermill)
  Downloading nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting nbclient>=0.2.0 (from papermill)
  Downloading nbclient-0.10.0-py3-none-any.whl.metadata (7.8 kB)
Collecting ansicolors (from papermill)
  Downloading ansicolors-1.1.8-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting fastjsonschema>=2.15 (from nbformat>=5.2.0->papermill)
  Downloading fastjsonschema-2.20.0-py3-none-any.whl.metadata (2.1 kB)
Downloading papermill-2.6.0-py3-none-any.whl (38 kB)
Downloading nbclient-0.10.0-py3-none-any.whl (25 kB)
Downloading nbformat-5.10.4-py3-none-any.whl (78 kB)
   ---------------------------------------- 0.0/78.5 kB ? eta -:--:--
   ---------------------------------------- 78.5/78.5 kB 4.3 MB/s eta 0:00:00
Downloading ansicolors-1.1.8-py2.py3-none-any.whl (13 kB)
Downloading fastjsonschema-2.20.0-py3-none-any.whl (23 kB)
Installing collected packag


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\PMYLS\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
