# This is a test Jupyter notebook; it tests "tools/modeling.py".

## Note: Some of the functions in the "tools/get_precipitation_data.py" file operate under specific assumptions, and their expected output is unpredictable even under these assumptions. Therefore, they are not covered by this test notebook, but they all work properly in the project Jupyter notebook files (try re-running all the Jupyter notebooks in the source code folder).

In [1]:
# Prepare the required packages:
from google.colab import drive
drive.mount('/content/gdrive')
import sys
tool_folder_dir = "/content/gdrive/MyDrive/irp_project_111/source code/tools"
sys.path.append(tool_folder_dir)
!pip install ipytest

Mounted at /content/gdrive
Collecting ipytest
  Downloading ipytest-0.13.3-py3-none-any.whl (14 kB)
Collecting jedi>=0.16 (from ipython->ipytest)
  Downloading jedi-0.19.0-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, ipytest
Successfully installed ipytest-0.13.3 jedi-0.19.0


In [2]:
"""
Author: Chaofan Wu
Student ID: 02285924
Email: cw522@ic.ac.uk
Project Name: Predicting flood risk in Ghana
Supervisors:
    Sesinam Dagadu(MEng)
    Yves Plancherel(PhD)
Company: SnooCODE
Date: 08/2023
"""

# Setup ipytest
import ipytest
ipytest.autoconfig()

import modeling
import ipytest
import pandas as pd
import numpy as np
from pandas.testing import assert_frame_equal
import pytest
import tempfile
import os



def test_load_data():
    """Check that load_data reads a CSV file back into an equal DataFrame.

    A small two-column frame is written to a CSV file inside a temporary
    directory, loaded through ``modeling.load_data`` and compared with the
    frame that was written.
    """
    expected = pd.DataFrame({'col1': [1, 2], 'col2': ['a', 'b']})
    with tempfile.TemporaryDirectory() as scratch_dir:
        target = os.path.join(scratch_dir, 'sample.csv')
        # index=False keeps the row index out of the file so the round trip
        # only contains the two data columns.
        expected.to_csv(target, index=False)
        # Load while the temporary directory (and the file) still exists.
        result = modeling.load_data(target)
        assert_frame_equal(result, expected)

def test_check_missing_values(capfd):
    """Check the text that check_missing_values writes to stdout.

    The input frame has exactly one NaN in ``col1``; the test expects the
    captured standard output to contain the line fragment ``col1    1``.

    Args:
        capfd: pytest fixture used to capture output written to stdout/stderr.
    """
    frame = pd.DataFrame({'col1': [1, 2, np.nan], 'col2': ['a', 'b', 'c']})
    modeling.check_missing_values(frame)
    # Only stdout is inspected; anything written to stderr is ignored.
    stdout_text = capfd.readouterr().out
    assert "col1    1" in stdout_text

def test_deal_with_nan():
    """Check that deal_with_nan fills the missing values as expected.

    The input holds two rows for each of the prefixes ``A`` and ``B`` in
    ``Flood_ID``; the second row of each pair has a NaN in ``val``. The test
    expects each NaN to be replaced by the value of the other row sharing the
    same prefix.
    """
    flood_ids = ['A_1', 'A_2', 'B_1', 'B_2']
    raw = pd.DataFrame({'Flood_ID': flood_ids, 'val': [1, np.nan, 3, np.nan]})

    result = modeling.deal_with_nan(raw, ['A', 'B'])

    # The expected column is float because the input column contained NaN.
    expected = pd.DataFrame({
        'Flood_ID': flood_ids,
        'val': [1.0, 1.0, 3.0, 3.0],
    })
    assert_frame_equal(result, expected)

def test_get_datasets_for_Modeling():
    """Check the per-flood datasets returned by get_datasets_for_Modeling.

    The input contains two ``Flood_ID`` values (``A`` and ``B``) whose rows
    are exact duplicates within each group. The test expects the first two
    returned datasets to be single-row frames holding only the feature
    columns of ``A`` and ``B`` respectively.
    """
    source = pd.DataFrame({
        'Flood_ID': ['A', 'A', 'B', 'B', 'B'],
        'feature_1': [1, 1, 2, 2, 2],
        'feature_2': [1, 1, 3, 3, 3],
    })

    # Assume discard_duplicates removes all duplicate rows, leaving one row
    # per flood event.
    expected_frames = [
        pd.DataFrame({'feature_1': [1], 'feature_2': [1]}),
        pd.DataFrame({'feature_1': [2], 'feature_2': [3]}),
    ]

    datasets = modeling.get_datasets_for_Modeling(source)

    for position, expected in enumerate(expected_frames):
        # Reset both indexes so that only the contents are compared, not the
        # row labels carried over from the source frame.
        actual = datasets[position].reset_index(drop=True)
        assert_frame_equal(actual, expected.reset_index(drop=True))


def test_preprocess_datasets():
    """Check the structural consistency of preprocess_datasets' output.

    Two small frames (one numeric feature, one string feature and a label
    column) are passed to ``modeling.preprocess_datasets``. The test does not
    check values; it only verifies that every feature split has as many rows
    as its label split and that all feature splits share the same columns.
    """
    first = pd.DataFrame({
        'feature_1': [1, 2, 3],
        'feature_2': ['a', 'b', 'c'],
        'label': [0, 1, 0],
    })
    second = pd.DataFrame({
        'feature_1': [4, 5, 6],
        'feature_2': ['d', 'e', 'f'],
        'label': [1, 0, 1],
    })

    # NOTE(review): the meaning of the second argument (1) is defined by
    # modeling.preprocess_datasets and is not visible from this test.
    splits = modeling.preprocess_datasets([first, second], 1)
    X_train, y_train, X_val, y_val, X_test, y_test = splits

    # Each feature split must have one label per row.
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
        assert features.shape[0] == labels.shape[0]

    # All feature splits must expose the same set of columns (order ignored).
    train_cols = set(X_train.columns)
    val_cols = set(X_val.columns)
    test_cols = set(X_test.columns)
    assert train_cols == val_cols
    assert val_cols == test_cols
    assert test_cols == train_cols


# Start the tests: ipytest runs the test functions defined in this notebook.
ipytest.run()

[32m.[0m[32m.[0m[32m.[0m

Splitting dataset:   0%|          | 0/2 [00:00<?, ?it/s]

Processing subsets:   0%|          | 0/2 [00:00<?, ?it/s]

[32m.[0m[32m.[0m[32m                                                                                        [100%][0m
[32m[32m[1m5 passed[0m[32m in 0.22s[0m[0m


<ExitCode.OK: 0>