In [16]:
## Load packages
import sys
import os

# Add the current directory to PYTHONPATH
sys.path.append(os.getcwd())
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from scipy.spatial import KDTree



In [None]:
##General preprocessing pipeline

class FloodRiskPreprocessingPipeline:
    """Enhanced preprocessing pipeline for flood risk df."""

    def __init__(self):
        self.one_hot_encoder = None
        self.scaler = None
        self.imputers = {}
        self.log = []  # Log actions for debugging and reporting

    def drop_duplicates(self, df):
        df.drop_duplicates(inplace = True)



    @staticmethod
    def standardize_postcode(postcode):
        """Standardizes a postcode to the format 'SW1A 1AA'."""
        if isinstance(postcode, str):
            postcode = postcode.strip().upper()
            match = re.match(r'^([A-Z]{1,2}[0-9][A-Z0-9]?)(\s*?)([0-9][A-Z]{2})$', postcode)
            if match:
                return f"{match.group(1)} {match.group(3)}"
        return postcode

    def merge_dfsets(self, df, sector_data=None, station_data=None, district_data=None):
        """Merges additional datasets into the main DataFrame."""
        # Merge sector data
        if sector_data is not None and "sector" in df.columns:
            df = df.merge(sector_data, on="sector", how="left")
            self.log.append("Merged 'sector_data' into the main dataset.")

        # Merge district data
        if district_data is not None and "postcodeDistrict" in df.columns:
            df = df.merge(district_data, on="postcodeDistrict", how="left")
            self.log.append("Merged 'district_data' into the main dataset.")

        # Add distance to nearest station(Improves geospatial understanding of flood risks by correlating risk with monitored data.)
        if station_data is not None:
            station_coords = station_data[["latitude", "longitude"]].to_numpy()
            postcode_coords = df[["northing", "easting"]].to_numpy()
            distances = cdist(postcode_coords, station_coords, metric="euclidean")
            df["distance_to_station"] = distances.min(axis=1)
            self.log.append("Added 'distance_to_station' feature using station data.")

        return df
    
    #computer the nearest point based on 3D 

    def handle_missing_data(self, df, categorical_columns, numeric_columns, method="median"):
        """Handles missing data using the specified imputation strategy."""
        # Impute categorical columns with the mode
        for col in categorical_columns:
            if col in df.columns:
                mode_imputer = SimpleImputer(strategy="most_frequent")
                df[col] = mode_imputer.fit_transform(df[[col]])
                self.imputers[col] = mode_imputer
                self.log.append(f"Imputed missing values in '{col}' using mode.")

        # Impute numeric columns
        for col in numeric_columns:
            if col in df.columns:
                imputer = SimpleImputer(strategy=method)
                df[col] = imputer.fit_transform(df[[col]])
                self.imputers[col] = imputer
                self.log.append(f"Imputed missing values in '{col}' using {method} strategy.")

        return df


    def scale_numeric_features(self, df, numeric_columns, scaling_type="standard"):
        """Scales numeric features using the specified scaler."""
        if scaling_type == "standard":
            self.scaler = StandardScaler()
        elif scaling_type == "minmax":
            self.scaler = MinMaxScaler()
        elif scaling_type == 'robustscaler':
            self.scaler = RobustScaler()
        else:
            raise ValueError("Unsupported scaling_type. Use 'standard' or 'minmax'.")

        numeric_columns = [col for col in numeric_columns if col in df.columns]
        if numeric_columns:
            df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
            self.log.append(f"Scaled numeric columns: {numeric_columns} using {scaling_type} scaling.")
        return df

    def feature_engineering(self, df, sector_data=None, station_data=None):
        """Adds derived features like proximity risk and population density."""
        # Calculate proximity risk
        if "distanceToWatercourse" in df.columns and "elevation" in df.columns:
            df["proximity_risk"] = df["distanceToWatercourse"] / (df["elevation"] + 1)
            self.log.append("Added 'proximity_risk' feature.")

        # Add population density from sector data
        if sector_data is not None and "sector" in df.columns:
            sector_columns = ["sector", "population", "households", "numberOfPostcodeUnits"]
            df["population_density"] = df["population"] / df["households"]
            self.log.append("Added 'population_density' feature from sector data.")


        return df
    
   
    def interaction_general(self, df):
        # Initialize label encoders
        label_encoder_soil = LabelEncoder()
        label_encoder_watercourse = LabelEncoder()

        # Encode 'soilType' and 'nearestWatercourse'
        df['soilType_encoded'] = label_encoder_soil.fit_transform(df['soilType'])
        df['nearestWatercourse_encoded'] = label_encoder_watercourse.fit_transform(df['nearestWatercourse'])

        # Calculate bins for elevation and distanceToWatercourse
        percentiles = [0, 0.25, 0.5, 0.75, 1.0]
        bins_elevation = df['elevation'].quantile(percentiles).values
        bins_distance = df['distanceToWatercourse'].quantile(percentiles).values

        # Define bin labels
        bin_labels = ['Low-Mid', 'Mid', 'Mid-High', 'High']

         # Perform binning
        df['elevation_category'] = pd.cut(df['elevation'], bins=bins_elevation, labels=bin_labels, include_lowest=True)
        df['distanceToWatercourse_category'] = pd.cut(df['distanceToWatercourse'], bins=bins_distance, labels=bin_labels, include_lowest=True)
    
        # Combine categorical columns for interaction terms
        df['soilType/Elevation'] = df['soilType_encoded'].astype(str) + '/' + df['elevation_category'].astype(str)
        df['distanceToWatercourse/nearestWatercourse'] = df['distanceToWatercourse_category'].astype(str) + '/' + df['nearestWatercourse_encoded'].astype(str)
  

        # Apply label encoding to interaction columns
        interaction_encoders = {col: LabelEncoder() for col in ['soilType/Elevation', 'distanceToWatercourse/nearestWatercourse' ]}
        for col, encoder in interaction_encoders.items():
            df[col] = encoder.fit_transform(df[col])

    # Concatenate the interaction DataFrame back to the original DataFrame

        # Drop unnecessary intermediate encoded columns
        df.drop(columns=['soilType_encoded', 'nearestWatercourse_encoded', 'elevation_category', 'distanceToWatercourse_category'], inplace=True)

        return df



    def preprocess(self, df, categorical_columns=None, numeric_columns=None, scaling_type="standard",
                   imputation_method="median", sector_data=None, station_data=None, district_data=None):
        """
        Executes the complete preprocessing pipeline.
        """
        if categorical_columns is None:
            categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        # Standardize postcodes
        if "postcode" in df.columns:
            df["postcode"] = df["postcode"].apply(self.standardize_postcode)
            self.log.append("Standardized 'postcode' column.")

        # Merge supporting datasets
        df = self.merge_datasets(df, sector_data, station_data, district_data)

        # Handle missing data
        df = self.handle_missing_data(df, categorical_columns, numeric_columns, method=imputation_method)

        # Encode categorical features
        df = self.encode_categorical_features(df, categorical_columns)

        # Feature engineering
        df = self.feature_engineering(df, sector_data, station_data)

        # Scale numeric features
        df = self.scale_numeric_features(df, numeric_columns, scaling_type=scaling_type)

        return df

    def generate_report(self):
        """Generates a summary of preprocessing actions."""
        return "\n".join(self.log)


In [None]:

# Helper function to convert lat/lon/elevation to Cartesian coordinates
def lat_lon_to_cartesian(lat, lon, elevation, earth_radius=6371):
    """Converts latitude, longitude, and elevation to Cartesian coordinates."""
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    r = earth_radius + (elevation / 1000.0)
    x = r * np.cos(lat_rad) * np.cos(lon_rad)
    y = r * np.cos(lat_rad) * np.sin(lon_rad)
    z = r * np.sin(lat_rad)
    return x, y, z

class FloodRiskPreprocessingPipeline:
    """Enhanced preprocessing pipeline for flood risk datasets."""

    def __init__(self):
        self.scaler = None
        self.log = []
    
    def drop_duplicates(self, df):
        """Drops duplicate rows from the dataset."""
        original_count = len(df)
        df.drop_duplicates(inplace=True)
        dropped_count = original_count - len(df)
        self.log.append(f"Dropped {dropped_count} duplicate rows.")
        return df
    @staticmethod
    
    def standardize_postcode(postcode):
        """Standardizes a postcode to the format 'SW1A 1AA'."""
        if isinstance(postcode, str):
            postcode = postcode.strip().upper()
            match = re.match(r'^([A-Z]{1,2}[0-9][A-Z0-9]?)(\s*?)([0-9][A-Z]{2})$', postcode)
            if match:
                return f"{match.group(1)} {match.group(3)}"
        return postcode

    def preprocess_postcode(self, df):
        """Splits the 'postcode' column into 'postcodeSector' and 'postcodeDistrict'."""
        if "postcode" in df.columns:
            df["postcodeSector"] = df["postcode"].str.split(" ").str[0]
            df["postcodeDistrict"] = df["postcode"].str.extract(r'^([A-Z]{1,2}\d{1,2}\s?\d?)')[0]
            self.log.append("Extracted 'postcodeSector' and 'postcodeDistrict' from 'postcode'.")
        return df

    def merge_sector_data(self, df, sector_data):
        """Merges sector-level data."""
        if "postcodeSector" in df.columns:
            df = df.merge(sector_data, on="postcodeSector", how="left")
            self.log.append("Merged 'sector_data' into the main dataset.")
        return df

    def merge_district_data(self, df, district_data):
        """Merges district-level data."""
        if "postcodeDistrict" in df.columns:
            df = df.merge(district_data, on="postcodeDistrict", how="left")
           # self.log.append("Merged 'district_data' into the main dataset.")
        return df
    
    def increase_negative_for_log(self, df):
        for col in df.select_dtypes(include = np.number).columns:
             min_val = df[col].min()
             if min_val <= 0:
                shift_value = abs(min_val) + 0.001
                df[col] += shift_value
                self.log.append(f"Shifted '{col}' by {shift_value} to handle negatives.")
        return df




    def lat_long_compute(self, df):
        from flood_tool.geo import get_gps_lat_long_from_easting_northing
        coordinates_lat = get_gps_lat_long_from_easting_northing(df[])
        coordinates_df = pd.DataFrame({
            'latitude': coordinates_lat[0],
            'longitude': coordinates_lat[1]
        })
        df= pd.concat([df, coordinates_df], axis=1)
        df.drop(columns = ['easting', 'northing'], inplace = True)
        return df
    
    

    def impute_watercourse_3d(self, df, lat_col="latitude", lon_col="longitude", elev_col="elevation", watercourse_col="nearestWatercourse"):
        """Imputes missing watercourse values using 3D Cartesian distance."""
        df["cartesian"] = df.apply(lambda row: lat_lon_to_cartesian(row[lat_col], row[lon_col], row[elev_col]), axis=1)
        known = df[df[watercourse_col].notna()]
        unknown = df[df[watercourse_col].isna()]
        tree = KDTree(np.array(known["cartesian"].tolist()))
        distances, indices = tree.query(np.array(unknown["cartesian"].tolist()))
        unknown[watercourse_col] = known.iloc[indices][watercourse_col].values
        self.log.append(f"Imputed missing 'nearestWatercourse' using 3D Cartesian distance.")
        return pd.concat([known, unknown]).drop(columns=["cartesian"])

    def handle_missing_numeric(self, df, numeric_col="medianPrice"):
        """Handles missing numeric data by replacing with the median."""
        if numeric_col in df.columns:
            median_value = df[numeric_col].median()
            df[numeric_col].fillna(median_value, inplace=True)
            self.log.append(f"Imputed missing values in '{numeric_col}' with its median value ({median_value}).")
        return df

    def scale_numeric_features(self, df, numeric_columns, scaling_type="standard"):
        """Scales numeric features using the specified scaler."""
        if scaling_type == "standard":
            self.scaler = StandardScaler()
        elif scaling_type == "minmax":
            self.scaler = MinMaxScaler()
        elif scaling_type == "robust":
            self.scaler = RobustScaler()
        else:
            raise ValueError("Unsupported scaling_type. Use 'standard', 'minmax', or 'robust'.")

        numeric_columns = [col for col in numeric_columns if col in df.columns]
        if numeric_columns:
            df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
            self.log.append(f"Scaled numeric columns: {numeric_columns} using {scaling_type} scaling.")
        return df
    def feature_engineering(self, df, sector_data=None, station_data=None):
        """Adds derived features like proximity risk and population density."""
        # Calculate proximity risk
        if "distanceToWatercourse" in df.columns and "elevation" in df.columns:
            df["proximity_risk"] = df["distanceToWatercourse"] / (df["elevation"] + 1)
            self.log.append("Added 'proximity_risk' feature.")

        # Add population density from sector data
        if sector_data is not None and "households" in sector_data.columns and "population" in sector_data.columns:
            df["population_density"] = sector_data["population"] / sector_data["households"]
            self.log.append("Added 'population_density' feature from sector data.")

        return df

    def interaction_general(self, df):
        """Generates interaction features based on categorical and numeric data."""
        # Initialize label encoders
        label_encoder_soil = LabelEncoder()
        label_encoder_watercourse = LabelEncoder()

        # Encode 'soilType' and 'nearestWatercourse'
        df['soilType_encoded'] = label_encoder_soil.fit_transform(df['soilType'])
        df['nearestWatercourse_encoded'] = label_encoder_watercourse.fit_transform(df['nearestWatercourse'])

        # Binning for elevation and distance to watercourse
        bins_elevation = pd.qcut(df['elevation'], q=4, labels=["Low", "Mid", "High", "Very High"])
        bins_distance = pd.qcut(df['distanceToWatercourse'], q=4, labels=["Low", "Mid", "High", "Very High"])

        # Interaction terms
        df['soilType/Elevation'] = df['soilType_encoded'].astype(str) + '/' + bins_elevation.astype(str)
        df['distanceToWatercourse/nearestWatercourse'] = bins_distance.astype(str) + '/' + df['nearestWatercourse_encoded'].astype(str)

        # Drop intermediate columns
        df.drop(columns=['soilType_encoded', 'nearestWatercourse_encoded'], inplace=True)

        self.log.append("Added interaction terms and encoded features.")
        return df

    def preprocess(self, df, sector_data=None, district_data=None,
                   lat_col="latitude", lon_col="longitude", elev_col="elevation",
                   watercourse_col="nearestWatercourse", numeric_col="medianPrice",
                   numeric_columns=None, scaling_type="standard"):
        """Executes the complete preprocessing pipeline."""
        
        # Drop duplicates
        df = self.drop_duplicates(df)
        
        # Standardize postcodes and extract derived columns
        if "postcode" in df.columns:
            df["postcode"] = df["postcode"].apply(self.standardize_postcode)
            self.log.append("Standardized 'postcode' column.")
        df = self.preprocess_postcode(df)

        # Merge sector and district data
        if sector_data is not None:
            df = self.merge_sector_data(df, sector_data)
        if district_data is not None:
            df = self.merge_district_data(df, district_data)

        # Impute missing categorical and numeric data
        if watercourse_col in df.columns:
            df = self.impute_watercourse_3d(df, lat_col, lon_col, elev_col, watercourse_col)
        df = self.handle_missing_numeric(df, numeric_col=numeric_col)

         # Feature engineering
        df = self.feature_engineering(df, sector_data, station_data)

        # Generate interaction terms
        df = self.interaction_general(df)

        # Scale numeric features
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        df = self.scale_numeric_features(df, numeric_columns, scaling_type=scaling_type)

        return df

    def generate_report(self):
        """Generates a summary of preprocessing actions."""
        return "\n".join(self.log)

In [None]:
###Final

class FloodRiskPreprocessingPipeline:
    """Simplified preprocessing pipeline for flood risk datasets."""

    def __init__(self):
        self.scaler = None

    def drop_duplicates(self, df):
        """Drops duplicate rows from the dataset."""
        df.drop_duplicates(inplace=True)
        return df

    @staticmethod
    def standardize_postcode(postcode):
        """Standardizes a postcode to the format 'SW1A 1AA'."""
        if isinstance(postcode, str):
            postcode = postcode.strip().upper()
            match = re.match(r'^([A-Z]{1,2}[0-9][A-Z0-9]?)(\s*?)([0-9][A-Z]{2})$', postcode)
            if match:
                return f"{match.group(1)} {match.group(3)}"
        return postcode

    def preprocess_postcode(self, df):
        """Splits the 'postcode' column into 'postcodeSector' and 'postcodeDistrict'."""
        if "postcode" in df.columns:
            df["postcodeSector"] = df["postcode"].str.split(" ").str[0]
            df["postcodeDistrict"] = df["postcode"].str.extract(r'^([A-Z]{1,2}\d{1,2}\s?\d?)')[0]
        return df

    def merge_sector_data(self, df, sector_data):
        """Merges sector-level data."""
        if "postcodeSector" in df.columns:
            df = df.merge(sector_data, on="postcodeSector", how="left")
        return df

    def merge_district_data(self, df, district_data):
        """Merges district-level data."""
        if "postcodeDistrict" in df.columns:
            df = df.merge(district_data, on="postcodeDistrict", how="left")
        return df

    def increase_negative_for_log(self, df):
        """Shifts numeric columns with negative values for log scaling."""
        for col in df.select_dtypes(include=np.number).columns:
            min_val = df[col].min()
            if min_val <= 0:
                df[col] += abs(min_val) + 0.001
        return df

    def lat_long_compute(self, df):
        from flood_tool.geo import get_gps_lat_long_from_easting_northing  # Import the required method
        # Extract latitude and longitude using the easting and northing columns
        coordinates_lat_long = df.apply(
            lambda row: get_gps_lat_long_from_easting_northing(row['easting'], row['northing']), axis=1
    )
        # Split the resulting tuples into separate latitude and longitude columns
        df['latitude'] = coordinates_lat_long.apply(lambda x: x[0])
        df['longitude'] = coordinates_lat_long.apply(lambda x: x[1])
        # Drop easting and northing columns after conversion
        df.drop(columns=['easting', 'northing'], inplace=True)
        return df

    
    def impute_watercourse_3d(self, df, lat_col="latitude", lon_col="longitude", elev_col="elevation", watercourse_col="nearestWatercourse"):
        """Imputes missing watercourse values using 3D Cartesian distance."""
        # Convert known and unknown locations to Cartesian coordinates
        df["cartesian"] = df.apply(lambda row: lat_lon_to_cartesian(row[lat_col], row[lon_col], row[elev_col]), axis=1)
    
        # Split data into known and unknown watercourses
        known = df[df[watercourse_col].notna()]
        unknown = df[df[watercourse_col].isna()]
    
        # Build KDTree for known Cartesian coordinates
        known_coords = np.array(known["cartesian"].tolist())
        tree = KDTree(known_coords)
    
        # Query nearest neighbors for unknown Cartesian coordinates
        unknown_coords = np.array(unknown["cartesian"].tolist())
        distances, indices = tree.query(unknown_coords, k=1)
    
        # Flatten indices array to 1D
        indices = indices.flatten()
    
        # Impute missing values with the nearest neighbor's watercourse name
        unknown[watercourse_col] = known.iloc[indices][watercourse_col].values
    
        # Combine known and updated unknown datasets
        df = pd.concat([known, unknown]).drop(columns=["cartesian"])
    
        return df
    def impute_watercourse_3d(self, df, lat_col="latitude", lon_col="longitude", elev_col="elevation", watercourse_col="nearestWatercourse"):
    """Imputes missing watercourse values using 3D Cartesian distance."""
    
    # Ensure required columns exist
    required_cols = [lat_col, lon_col, elev_col, watercourse_col]
    for col in required_cols[:-1]:  # Exclude the categorical column for now
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    
    # Drop rows with NaN in coordinate columns
    df = df.dropna(subset=[lat_col, lon_col, elev_col])
    
    # Convert coordinates to Cartesian
    df["cartesian"] = df.apply(lambda row: lat_lon_to_cartesian(row[lat_col], row[lon_col], row[elev_col]), axis=1)
    
    # Split data into known and unknown watercourses
    known = df[df[watercourse_col].notna()]
    unknown = df[df[watercourse_col].isna()]
    
    if known.empty:
        raise ValueError("No known data points to build KDTree.")
    if unknown.empty:
        print("No missing watercourse values to impute.")
        return df.drop(columns=["cartesian"])  # Drop the helper column and return

    # Build KDTree for known Cartesian coordinates
    known_coords = np.array(known["cartesian"].tolist())
    tree = KDTree(known_coords)

    # Query nearest neighbors for unknown Cartesian coordinates
    unknown_coords = np.array(unknown["cartesian"].tolist())
    distances, indices = tree.query(unknown_coords, k=1)

    # Safely update the unknown DataFrame
    unknown = unknown.copy()  # Avoid SettingWithCopyWarning
    unknown[watercourse_col] = known.iloc[indices.flatten()][watercourse_col].values

    # Combine known and updated unknown datasets
    df = pd.concat([known, unknown]).drop(columns=["cartesian"])
    return df
    def handle_missing_numeric(self, df, numeric_col="medianPrice"):
        """Handles missing numeric data by replacing with the median."""
        if numeric_col in df.columns:
            median_value = df[numeric_col].median()
            df[numeric_col].fillna(median_value, inplace=True)
        return df

    def scale_numeric_features(self, df, numeric_columns, scaling_type="standard"):
        """Scales numeric features using the specified scaler."""
        if scaling_type == "standard":
            self.scaler = StandardScaler()
        elif scaling_type == "minmax":
            self.scaler = MinMaxScaler()
        elif scaling_type == "robust":
            self.scaler = RobustScaler()
        else:
            raise ValueError("Unsupported scaling_type. Use 'standard', 'minmax', or 'robust'.")

        numeric_columns = [col for col in numeric_columns if col in df.columns]
        if numeric_columns:
            df[numeric_columns] = self.scaler.fit_transform(df[numeric_columns])
        return df

    def feature_engineering(self, df, sector_data=None):
        """Adds derived features."""
        if "distanceToWatercourse" in df.columns and "elevation" in df.columns:
            df["proximity_risk"] = df["distanceToWatercourse"] / (df["elevation"] + 1)

        if sector_data is not None and "population" in sector_data.columns and "households" in sector_data.columns:
            df["population_density"] = sector_data["population"] / sector_data["households"]
        return df
    def interaction_general(self, df):
        """Generates interaction features based on categorical and numeric data."""
        # Initialize label encoders
        label_encoder_soil = LabelEncoder()
        label_encoder_watercourse = LabelEncoder()

        # Encode 'soilType' and 'nearestWatercourse'
        df['soilType_encoded'] = label_encoder_soil.fit_transform(df['soilType'])
        df['nearestWatercourse_encoded'] = label_encoder_watercourse.fit_transform(df['nearestWatercourse'])

        # Binning for elevation and distance to watercourse
        bins_elevation = pd.qcut(df['elevation'], q=4, labels=["Low", "Mid", "High", "Very High"])
        bins_distance = pd.qcut(df['distanceToWatercourse'], q=4, labels=["Low", "Mid", "High", "Very High"])

        # Interaction terms
        df['soilType/Elevation'] = df['soilType_encoded'].astype(str) + '/' + bins_elevation.astype(str)
        df['distanceToWatercourse/nearestWatercourse'] = bins_distance.astype(str) + '/' + df['nearestWatercourse_encoded'].astype(str)

        # Drop intermediate columns
        df.drop(columns=['soilType_encoded', 'nearestWatercourse_encoded'], inplace=True)

        self.log.append("Added interaction terms and encoded features.")
        return df

    def preprocess(self, df, sector_data=None, district_data=None, lat_col="latitude", lon_col="longitude", elev_col="elevation",
                   watercourse_col="nearestWatercourse", numeric_col="medianPrice", numeric_columns=None, scaling_type="standard"):
        """Executes the complete preprocessing pipeline."""
        df = self.drop_duplicates(df)
        df = self.preprocess_postcode(df)
        df = self.merge_sector_data(df, sector_data)
        df = self.merge_district_data(df, district_data)
        df = self.lat_long_compute(df)
        df = self.impute_watercourse_3d(df, lat_col, lon_col, elev_col, watercourse_col)
        df = self.handle_missing_numeric(df, numeric_col)
        df = self.feature_engineering(df, sector_data)
        df = self.interaction_general(df)
        df = self.scale_numeric_features(df, numeric_columns, scaling_type)
        return df


In [28]:
import sys
import os
print(os.getcwd())
# Add the parent directory of flood_tool to the Python path
sys.path.append('/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee')

# Test the import
from flood_tool.geo import get_gps_lat_long_from_easting_northing
print("Import successful!")



/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool
Import successful!


In [25]:
## Preprocessing Test
#Load all files
postcodes_missing = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/example_data/postcodes_missing_data.csv")
postcodes_labelled = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/postcodes_labelled.csv")
postcodes_unlabelled = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/example_data/postcodes_unlabelled.csv")
sector_data = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/sector_data.csv")
station_data = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/stations.csv")
typical_day = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/example_data/typical_day.csv")
wet_day = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/example_data/wet_day.csv")
district_data = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/district_data.csv")

In [29]:
# Initialize the pipeline
pipeline = FloodRiskPreprocessingPipeline()

# Run the preprocessing pipeline
df_processed = pipeline.preprocess(
    df=postcodes_labelled,
    sector_data=sector_data,
    district_data=district_data,
    lat_col="latitude",
    lon_col="longitude",
    elev_col="elevation",
    watercourse_col="nearestWatercourse",
    numeric_col="medianPrice",
    numeric_columns=["distanceToWatercourse", "elevation", "medianPrice"],
    scaling_type="standard"
)

# Display processed DataFrame
print(df_processed.head())  # Displays the first 5 rows of the processed DataFrame


ValueError: data must be of shape (n, m), where there are n points of dimension m

In [None]:
data = postcodes_labelled 
print("DataFrame Info:")
print(data.info())

print("\nCategorical Columns:")
print(categorical_columns)

print("\nNumeric Columns:")
print(numeric_columns)


In [None]:
for col in categorical_columns + numeric_columns:
    if col not in data.columns:
        raise ValueError(f"Column '{col}' is missing from the dataset.")


Define Dataset and Columns

In [None]:
# Load datasets
postcodes_labelled = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/postcodes_labelled.csv")
postcodes_unlabelled = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/example_data/postcodes_unlabelled.csv")
sector_data = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/sector_data.csv")
station_data = pd.read_csv("/Users/yixuanyan/Desktop/edsml/Term1/jubliee/ads-deluge-jubilee/flood_tool/resources/stations.csv")



# Define categorical and numeric columns
categorical_columns = ['soilType', 'localAuthority', 'nearestWatercourse']
numeric_columns = ['easting','northing','elevation', 'distanceToWatercourse', 'medianPrice','riskLabel','medianPrice','historicallyFlooded']


Initialize and Run the Pipeline

In [None]:
pipeline = FloodRiskPreprocessingPipeline()

In [None]:
processed_labelled = pipeline.preprocess(
    df=postcodes_labelled,
    categorical_columns=categorical_columns,
    numeric_columns=numeric_columns,
    scaling_type="standard",  # Use "minmax" for MinMaxScaler
    imputation_method="median",  # Options: "median", "mean"
    sector_data=sector_data,
    station_data=station_data
)

In [None]:
print(df[categorical_columns].head())
print(df[categorical_columns].dtypes)
