### Importing libraries

In [4]:
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier as rf
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl

### Functions

In [5]:
def prepare_data(data):
    """
    Prepare spatial data for analysis by performing several preprocessing steps.

    Parameters:
    - data: DataFrame containing spatial data, including a 'geometry' column.

    Returns:
    - Processed DataFrame with additional features and unnecessary columns dropped.

    Steps:
    1. Drop unnecessary features like 'index' and 'geometry'.
    2. Convert the 'geometry' column to the EPSG:32633 coordinate reference system.
    3. Calculate the area of each geometry and add it as a new column 'area'.
    4. Calculate the perimeter of each geometry and add it as a new column 'perimeter'.
    5. Calculate the x-coordinate of the centroid of each geometry and add it as a new column 'centroid_x'.
    6. Calculate the y-coordinate of the centroid of each geometry and add it as a new column 'centroid_y'.
    7. Calculate the boundary length of each geometry and add it as a new column 'boundary_length'.
    8. Drop the columns specified in 'drop_features' list.

    """
    # Define features to drop
    drop_features = ['index', 'geometry']
    
    # Convert the 'geometry' column to EPSG:32633 coordinate reference system
    data['geometry'] = data['geometry'].to_crs('EPSG:32633')
    
    # Calculate area of each geometry and add it as a new column 'area'
    data['area'] = data[['geometry']].area
    
    # Calculate perimeter of each geometry and add it as a new column 'perimeter'
    data['perimeter'] = data[['geometry']].length
    
    # Calculate x-coordinate of centroid of each geometry and add it as a new column 'centroid_x'
    data['centroid_x'] = data[['geometry']].centroid.x
    
    # Calculate y-coordinate of centroid of each geometry and add it as a new column 'centroid_y'
    data['centroid_y'] = data[['geometry']].centroid.y
    
    # Calculate boundary length of each geometry and add it as a new column 'boundary_length'
    data['boundary_length'] = data[['geometry']].boundary.length
    
    # Drop unnecessary features
    data = data.drop(columns=drop_features)
    
    return data

In [6]:
def filter_train_data(data_filtered):
    """
    Filter the training data by removing rows with too many NaN values and ensuring specified columns are not None.

    Parameters:
    - data_filtered: DataFrame containing the training data.

    Returns:
    - Filtered DataFrame with rows removed based on the specified criteria.

    Steps:
    1. Identify NaN values in the DataFrame.
    2. Count the number of NaN values in each row.
    3. Select rows with fewer than 5 NaN values and ensure specified columns are not None.
    4. Filter the DataFrame to keep only the selected rows.

    """
    # Check for NaN values in the DataFrame
    nan_values = data_filtered.isna()

    # Count the number of NaN values in each row
    nan_count_per_row = nan_values.sum(axis=1)

    # Select rows with fewer than 5 NaN values and ensure specified columns are not None
    index_maintain = data_filtered[(nan_count_per_row < 5) & (~data_filtered[['date0','date1','date2','date3','date4']].isna().any(axis=1))].index 

    # Filter the DataFrame to keep only the selected rows
    data_filtered = data_filtered.iloc[index_maintain, :]

    return data_filtered

In [7]:
def count_nan_cols(df):
    """
    Count the number of missing values in each column of a DataFrame.

    Parameters:
    - df: DataFrame to analyze.

    Returns:
    - None

    Steps:
    1. Use isnull() to create a DataFrame of boolean values indicating missing values.
    2. Use sum() to calculate the total number of missing values in each column.
    3. Sort the result in descending order.
    4. Print the columns with the most missing values.

    """
    # Use isnull() to create a DataFrame of boolean values indicating missing values
    missing_values = df.isnull()

    # Use sum() to calculate the total number of missing values in each column
    missing_counts = missing_values.sum()

    # Sort the result in descending order
    missing_counts_sorted = missing_counts.sort_values(ascending=False)

    # Print the columns with the most missing values
    print("Columns with the most missing values:")
    print(missing_counts_sorted[missing_counts_sorted > 0])  # Adjust the number as needed

In [8]:
def numerical_imputer(dataframe):    
    """
    Impute missing numerical values in a DataFrame using the IterativeImputer.

    Parameters:
    - dataframe: DataFrame containing numerical columns with missing values.

    Returns:
    - DataFrame: DataFrame with missing numerical values imputed.

    Steps:
    1. Select numerical columns from the DataFrame.
    2. Initialize an IterativeImputer.
    3. Fit the imputer on rows without missing numerical values.
    4. Transform rows with missing numerical values using the imputer.
    5. Return the DataFrame with missing numerical values imputed.

    """
    numerical_columns_selector = selector(dtype_include=[int, float])
    numerical_columns = numerical_columns_selector(dataframe)
    imputer = KNNImputer(n_neighbors=3)
    # imputer = IterativeImputer()
    imputer.fit(dataframe[dataframe[numerical_columns].notna().all(axis=1)])
    dataframe[dataframe[numerical_columns].isna().any(axis=1)] = imputer.transform(dataframe[dataframe[numerical_columns].isna().any(axis=1)])
    return dataframe

In [9]:
def create_data(data):
    """
    Create balanced data for training by oversampling with SMOTE.

    Parameters:
    - data: DataFrame containing features and target variable.

    Returns:
    - X_data: Features of the balanced dataset.
    - y_data: Target variable of the balanced dataset.

    Steps:
    1. Define a dictionary specifying the desired sampling strategy for each class.
    2. Separate features and target variable from the DataFrame.
    3. Initialize SMOTE with the specified sampling strategy.
    4. Oversample the minority classes to balance the dataset.
    5. Return the balanced features and target variable.

    """
    # Define the desired sampling strategy for each class
    strategy = {0:40000, 1:40000, 3:150000, 4:10000, 5:5000}

    # Separate features and target variable
    X_data = data.drop(columns=['change_type'])
    y_data = data['change_type']

    # Initialize SMOTE with the specified sampling strategy
    oversample = SMOTE(sampling_strategy=strategy)

    # Oversample the minority classes to balance the dataset
    X_data, y_data = oversample.fit_resample(X_data, y_data)

    # Return the balanced features and target variable
    return X_data, y_data

In [10]:
def fix_column_names(data):
    """
    Fix column names in the DataFrame.

    Parameters:
    - data: DataFrame with potentially incorrect column names.

    Returns:
    - Fixed DataFrame with corrected column names.

    Steps:
    1. Rename columns to correct any inconsistencies.
    2. Return the DataFrame with updated column names.

    """
    # Rename columns to correct any inconsistencies
    data.rename(columns={'img_red_mean_date5':'img_red_mean_date0',
                         'img_green_mean_date5':'img_green_mean_date0',
                         'img_blue_mean_date5':'img_blue_mean_date0',
                         'img_red_std_date5':'img_red_std_date0',
                         'img_green_std_date5':'img_green_std_date0',
                         'img_blue_std_date5':'img_blue_std_date0'}, inplace=True)

    # Return the DataFrame with updated column names
    return data

In [11]:
def unique_values(data_enumerate, change_status_columns, date_cols):
    """
    Replace categorical values with their respective enumeration.

    Parameters:
    - data_enumerate: DataFrame containing categorical columns to be enumerated.
    - change_status_columns: List of column names containing change status.
    - date_cols: List of column names containing dates.

    Returns:
    - DataFrame with categorical values replaced by their enumeration.
    - List of unique dates.
    - List of unique change statuses.

    Steps:
    1. Get unique dates and change statuses.
    2. Create dictionaries to map unique values to their enumeration.
    3. Replace categorical values with their respective enumeration.
    4. Return the DataFrame with replaced values, along with lists of unique dates and change statuses.

    """
    # Get unique dates and change types
    unique_dates = sorted(data_enumerate[date_cols].stack().unique().tolist())
    unique_change_status = data_enumerate[change_status_columns].stack().unique().tolist()

    # Create dictionaries with enumeration
    dict_dates = {date: index for index, date in enumerate(unique_dates)}
    dict_changes = {change: index for index, change in enumerate(unique_change_status)}

    # Replace categorical values with their enumeration
    data_enumerate[date_cols] = data_enumerate[date_cols].replace(dict_dates)
    data_enumerate[change_status_columns] = data_enumerate[change_status_columns].replace(dict_changes)

    # Return the DataFrame with replaced values, along with lists of unique dates and change statuses
    return data_enumerate, unique_dates, unique_change_status

In [12]:
def sort_dates_values(data_sorted, unique_dates, unique_change_status, date_cols, change_status_columns, value_columns):
    """
    Sort the date values and corresponding columns in the DataFrame.

    Parameters:
    - data_sorted: DataFrame to be sorted.
    - unique_dates: List of unique dates.
    - unique_change_status: List of unique change statuses.
    - date_cols: List of column names containing dates.
    - change_status_columns: List of column names containing change status.
    - value_columns: List of column names containing values.

    Returns:
    - Sorted DataFrame with dates and values.

    Steps:
    1. Reset the index of the DataFrame.
    2. Get a mask where each element is True if all date columns in the row are monotonically increasing.
    3. Get the indices where any of the rows do not have monotonically increasing dates.
    4. Iterate over non-monotonic indices.
        - Sort dates and values together.
        - Update the DataFrame with sorted values and dates.
    5. Create dictionaries with enumeration.
    6. Replace categorical values with their respective enumeration.
    7. Return the sorted DataFrame.

    """
    # Reset the index of the DataFrame
    data_sorted = data_sorted.reset_index(drop=True)

    # Get the mask where each element is True if all date columns in the row are monotonically increasing
    monotonic_mask = data_sorted[date_cols].apply(lambda row: row.is_monotonic_increasing, axis=1)

    # Get the indices where any of the rows do not have monotonically increasing dates
    non_monotonic_indices = data_sorted.index[~monotonic_mask].tolist()

    # Iterate over non-monotonic indices
    for index in non_monotonic_indices:
        row = data_sorted.loc[index]
        if index % 50000 == 0:
            print(index)
        dates = row[date_cols]

        img_values = [row[value_columns].values[i:i + 6] for i in range(0, len(row[value_columns].values), 6)]
        status_values = row[change_status_columns]

        # Sort dates and values together
        sorted_indices = np.argsort(dates)
        sorted_dates = np.array(dates)[sorted_indices]
        sorted_values = np.array(img_values)[sorted_indices]
        sorted_status = np.array(status_values)[sorted_indices]

        # Update the DataFrame with sorted values and dates
        data_sorted.loc[index, date_cols] = sorted_dates
        data_sorted.loc[index, value_columns] = np.reshape(sorted_values[:, :6], 30)
        data_sorted.loc[index, change_status_columns] = sorted_status

    # Create dictionaries with enumeration
    dict_dates = {index: date for index, date in enumerate(unique_dates)}
    dict_changes = {index: change for index, change in enumerate(unique_change_status)}

    # Replace categorical values with their respective enumeration
    data_sorted[date_cols] = data_sorted[date_cols].replace(dict_dates)
    data_sorted[change_status_columns] = data_sorted[change_status_columns].replace(dict_changes)

    # Return the sorted DataFrame
    return data_sorted

In [13]:
def fix_date_columns(data, change_status_columns, name=''):
    """
    Fix missing values in date columns and convert them to datetime format.

    Parameters:
    - data: DataFrame containing the data.
    - change_status_columns: List of column names containing change status.
    - name: Name to be appended to the CSV file.

    Returns:
    - DataFrame with fixed date columns.

    Steps:
    1. Define the date columns.
    2. Fill missing values with NaT, convert the DataFrame to object type, and fill remaining NaN values.
    3. Use SimpleImputer to fill missing values in date columns with the most frequent value.
    4. Convert date columns to datetime format.
    5. Extract unique dates and change types.
    6. Generate value column names.
    7. Sort dates and values together.
    8. Save the DataFrame to a CSV file.
    9. Return the DataFrame with fixed date columns.

    """
    # Define the date columns
    date_cols = ['date0', 'date1', 'date2', 'date3', 'date4']

    # Fill missing values with NaT, convert to object type, and fill remaining NaN values
    data = data.fillna(pd.NaT).astype(object).fillna(np.nan)

    # Use SimpleImputer to fill missing values in date columns with the most frequent value
    date_imputer = SimpleImputer(strategy='most_frequent')
    data[date_cols] = date_imputer.fit_transform(data[date_cols])

    # Convert date columns to datetime format
    data.loc[:, date_cols] = data.loc[:, date_cols].apply(pd.to_datetime, format="%d-%m-%Y")

    # Extract unique dates and change types
    data, unique_dates, unique_change_types = unique_values(data, change_status_columns, date_cols)

    # Generate value column names
    value_columns = np.array([[f'img_red_mean_date{i}',
                               f'img_green_mean_date{i}',
                               f'img_blue_mean_date{i}',
                               f'img_red_std_date{i}',
                               f'img_green_std_date{i}',
                               f'img_blue_std_date{i}'] for i in range(0, 5)]).reshape(30)

    # Sort dates and values together
    data_fixed_type = sort_dates_values(data, unique_dates, unique_change_types, date_cols, change_status_columns,
                                        value_columns)

    # Save the DataFrame to a CSV file
    data_fixed_type.to_csv(f'../data/data_fixed_type{name}.csv')

    # Return the DataFrame with fixed date columns
    return data_fixed_type

In [14]:
def fix_change_status(data, change_status_columns):
    """
    Replace change status strings with numerical values.

    Parameters:
    - data: DataFrame containing the data.
    - change_status_columns: List of column names containing change status.

    Returns:
    - DataFrame with change status replaced by numerical values.

    Steps:
    1. Define a dictionary to map change status strings to numerical values.
    2. Replace change status strings with numerical values.
    3. Return the DataFrame with fixed change status.

    """
    # Define a dictionary to map change status strings to numerical values
    dict_change_status = {
        'Greenland': 0, 'Prior Construction': 1, 'Land Cleared': 2, 'Materials Dumped': 3, 'Materials Introduced': 4,
        'Excavation': 5, 'Construction Started': 6, 'Construction Midway': 7, 'Construction Done': 8, 'Operational': 9
    }

    # Replace change status strings with numerical values
    data.loc[:, change_status_columns] = data.loc[:, change_status_columns].replace(dict_change_status)

    # Return the DataFrame with fixed change status
    return data

In [15]:
def dumb_hot_encoder(df, col):
    """
    Perform one-hot encoding on a column containing comma-separated values.

    Parameters:
    - df: DataFrame containing the data.
    - col: Name of the column to encode.

    Returns:
    - DataFrame with one-hot encoded columns.

    Steps:
    1. Split the column values by comma and remove rows with ['N', 'A'] values.
    2. Perform one-hot encoding on the split values.
    3. Replace True/False with 1/0.
    4. Drop the 'nan' column if it exists.

    """
    # Preprocess the column
    df_col_nan = df[df[col].isna()].drop(columns=col)
    df_col_not_nan = df[~df[col].isna()]
    treated_col_type = df_col_not_nan[col].astype(str).str.split(',')
    treated_col_type = treated_col_type[~treated_col_type.apply(lambda x: x == ['N', 'A'])]
    label_list = treated_col_type.explode()

    # Perform one-hot encoding
    one_hot_encoded = pd.get_dummies(label_list, prefix=col, dummy_na=True).groupby(level=0).max().astype(int)
    df_col_not_nan.loc[:, one_hot_encoded.columns] = one_hot_encoded
    df_col_not_nan.drop(columns=[col], inplace=True)

    # Replace True/False with 1/0
    df_col_not_nan.replace({True: 1, False: 0}, inplace=True)

    # Drop 'nan' column if exists
    df_col_not_nan.drop(columns=np.nan, errors='ignore', inplace=True)

    # Concatenate dataframes
    df = pd.concat([df_col_not_nan, df_col_nan])

    return df

In [16]:
def filter_train_data(data_filtered):
    """
    Filter the training data to remove rows with excessive NaN values.

    Parameters:
    - data_filtered: DataFrame containing the training data.

    Returns:
    - Filtered DataFrame.

    Steps:
    1. Check for NaN values in the DataFrame.
    2. Count the number of NaN values in each row.
    3. Select rows with fewer than 5 NaN values and where at least one of the specified date columns is not NaN.

    """
    # Check for NaN values
    nan_values = data_filtered.isna()

    # Count the number of NaN values in each row
    nan_count_per_row = nan_values.sum(axis=1)

    # Select rows with fewer than 5 NaN values and where at least one of the specified date columns is not NaN
    index_maintain = data_filtered[(nan_count_per_row < 5) & (~data_filtered[['date0', 'date1', 'date2', 'date3', 'date4']].isna().any(axis=1))].index

    # Filter rows
    data_filtered = data_filtered.iloc[index_maintain, :]

    return data_filtered

In [17]:
def add_date_features(data_change_img, date_cols):
    """
    Add date-related features to the DataFrame.

    Parameters:
    - data_change_img: DataFrame containing the data.
    - date_cols: List of column names containing date information.

    Returns:
    - DataFrame with added date features.

    Steps:
    1. Convert date columns to datetime format.
    2. Calculate time differences between consecutive date columns.
    3. Iterate over date ranges and color-statistic combinations to calculate change ratios.
    4. Calculate change in status between consecutive dates.

    """
    try:
        # Convert date columns to datetime format
        data_change_img.loc[:, date_cols] = data_change_img.loc[:, date_cols].apply(pd.to_datetime, format="%Y-%m-%d")
    except:
        data_change_img.loc[:, date_cols] = data_change_img.loc[:, date_cols].apply(pd.to_datetime, format="%d-%m-%Y")

    # Calculate time differences between consecutive date columns
    for date in range(0, 4):
        data_change_img[f'diff_date_{date + 1}_{date}'] = (data_change_img[f'date{date + 1}'] - data_change_img[f'date{date}']).apply(lambda x: int(str(x).split(" ")[0]))

    # Define lists for image colors and statistical values
    list_colors = ['green', 'blue', 'red']
    list_values = ['mean', 'std']

    # Iterate over date ranges and color-statistic combinations
    for date in range(0, 4):
        for color in list_colors:
            for value in list_values:
                col1 = f'img_{color}_{value}_date{date}'
                col2 = f'img_{color}_{value}_date{date + 1}'

                # Calculate change ratios for consecutive dates
                data_change_img[f'img_{color}_{value}_change_{date + 1}_{date}'] = (data_change_img[col2] - data_change_img[col1]) / data_change_img[f'diff_date_{date + 1}_{date}']

        col3 = f'change_status_date{date}'
        col4 = f'change_status_date{date + 1}'
        # Calculate change in status between consecutive dates
        data_change_img[f'change_status_date_{date + 1}_{date}'] = data_change_img[col4] - data_change_img[col3]

    return data_change_img

In [18]:
def add_important_features(data):
    """
    Add important features to the DataFrame using different strategies.

    Parameters:
    - data: DataFrame containing the data.

    Returns:
    - DataFrame with added important features.

    Strategies:
    1. Interaction terms: Sum of important features.
    2. Polynomial features: Square of each important feature.
    3. Transformations: Square root of absolute values of important features.
    4. Statistical aggregations: Mean and standard deviation of important features.

    """
    important_features = ['perimeter', 'centroid_x', 'centroid_y', 'boundary_length', 'change_status_date0', 'change_status_date1', 'change_status_date2', 'change_status_date3', 'change_status_date4', 'diff_date_1_0', 'diff_date_2_1', 'diff_date_3_2', 'diff_date_4_3', 'img_blue_mean_date0', 'img_blue_mean_date1', 'img_blue_mean_date2', 'img_blue_mean_date3', 'img_blue_mean_date4']
    
    # Strategy 1: Interaction terms
    data['interaction_feature'] = data[important_features].sum(axis=1)

    # Strategy 2: Polynomial features
    for feature in important_features:
        data[f'{feature}_squared'] = data[feature] ** 2

    # Strategy 3: Transformations
    for feature in important_features:
        data[f'sqrt_{feature}'] = np.sqrt(abs(data[feature]))

    # Strategy 4: Statistical aggregations
    data['mean_feature'] = data[important_features].mean(axis=1)
    data['std_feature'] = data[important_features].std(axis=1)
    
    return data

In [19]:
def normalize_data_1(data):
    """
    Normalize the data using standardization.

    Parameters:
    - data: DataFrame containing the data.

    Returns:
    - DataFrame with normalized data.

    """
    data_norm = data.copy()
    # standarization
    v = np.array([[f'img_red_mean_date{i}',
                    f'img_green_mean_date{i}',
                    f'img_blue_mean_date{i}',
                    f'img_red_std_date{i}',
                    f'img_green_std_date{i}',
                    f'img_blue_std_date{i}'] for i in range(0,5)]).reshape(30)
    geo = np.array(['perimeter', 'area', 'centroid_x', 'centroid_y'])
    new_features = ['interaction_feature','perimeter_squared','centroid_x_squared','centroid_y_squared','boundary_length_squared','change_status_date0_squared',
                    'change_status_date1_squared','change_status_date2_squared','change_status_date3_squared','change_status_date4_squared','diff_date_1_0_squared',
                    'diff_date_2_1_squared','diff_date_3_2_squared','diff_date_4_3_squared','img_blue_mean_date0_squared','img_blue_mean_date1_squared','img_blue_mean_date2_squared',
                    'img_blue_mean_date3_squared','img_blue_mean_date4_squared','sqrt_perimeter','sqrt_centroid_x','sqrt_centroid_y','sqrt_boundary_length','sqrt_change_status_date0',
                    'sqrt_change_status_date1','sqrt_change_status_date2','sqrt_change_status_date3','sqrt_change_status_date4','sqrt_diff_date_1_0','sqrt_diff_date_2_1','sqrt_diff_date_3_2',
                    'sqrt_diff_date_4_3','sqrt_img_blue_mean_date0','sqrt_img_blue_mean_date1','sqrt_img_blue_mean_date2','sqrt_img_blue_mean_date3','sqrt_img_blue_mean_date4','mean_feature','std_feature']
    v_geo = np.concatenate([v, geo, new_features])
    data_norm[v_geo] = (data_norm[v_geo] - data_norm[v_geo].mean())/data_norm[v_geo].std()
    return data_norm

In [20]:
def normalize_data_2(data):
    """
    Normalize the data using standardization.

    Parameters:
    - data: DataFrame containing the data.

    Returns:
    - DataFrame with normalized data.

    """
    data_norm = data.copy()
    # standarization
    dont_normalize = ['change_status_date0','change_status_date1','change_status_date2','change_status_date3','change_status_date4','urban_type_Dense Urban', 'urban_type_Industrial', 'urban_type_Rural',
       'urban_type_Sparse Urban', 'urban_type_Urban Slum','geography_type_Barren Land', 'geography_type_Coastal','geography_type_Dense Forest', 'geography_type_Desert',
       'geography_type_Farms', 'geography_type_Grass Land','geography_type_Hills', 'geography_type_Lakes', 'geography_type_River','geography_type_Snow', 
       'geography_type_Sparse Forest']
    all_columns = list(data_norm.columns)
    to_normalize = [x for x in all_columns if x not in dont_normalize]
    data_norm[to_normalize] = (data_norm[to_normalize] - data_norm[to_normalize].mean())/data_norm[to_normalize].std()
    return data_norm

In [21]:
def normalize_data_3(data):
    """
    Normalize the data using standardization.

    Parameters:
    - data: DataFrame containing the data.

    Returns:
    - DataFrame with normalized data.

    """
    data_norm = data.copy()
    all_columns = list(data_norm.columns)
    # to_normalize = [x for x in all_columns if not any(substring in x for substring in dont_normalize)]
    to_normalize = all_columns
    data_norm[to_normalize] = (data_norm[to_normalize] - data_norm[to_normalize].mean()) / data_norm[to_normalize].std()
    return data_norm

In [22]:
def preprocess_training(train_dataset):
    """
    Preprocess the training dataset.

    Parameters:
    - train_dataset: DataFrame containing the training dataset.

    Returns:
    - data_norm: DataFrame with preprocessed and normalized data.
    - y_data: Target variable.
    - data: DataFrame with preprocessed data.

    Steps:
    1. Replace 'N,A', None, 'nan', np.inf, and -np.inf with NaN.
    2. Drop rows with more than 4 NaN values and no missing date columns.
    3. Add date-related features.
    4. Map 'change_type' values to integers.
    5. Encode categorical columns using one-hot encoding.
    6. Drop unnecessary columns.
    7. Impute missing numerical values.
    8. Create synthetic samples using SMOTE.
    9. Add important features.
    10. Normalize the data.

    """
    data = train_dataset.copy().reset_index(drop=True)
    data = data.replace(['N,A', None, 'nan', np.inf, -np.inf], np.nan)

    change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4, 'Mega Projects': 5}

    date_columns = ['date0', 'date1', 'date2', 'date3', 'date4']

    nan_values = data.isna()
    nan_count_per_row = nan_values.sum(axis=1)

    index_maintain = data[(nan_count_per_row < 5) & (~data[date_columns].isna().any(axis=1))].index 
    data = data.iloc[index_maintain, :]

    data = add_date_features(data, date_columns)

    data['change_type'] = data['change_type'].apply(lambda x: change_type_map[x])

    columns_to_encode = ['urban_type', 'geography_type']
    for column in columns_to_encode:
        data = dumb_hot_encoder(data, column)
    data.drop(columns=['urban_type_nan', 'geography_type_nan'], inplace=True)

    try:
        data.drop(columns='Unnamed: 0', inplace=True)
    except:
        pass
    
    data.drop(columns=date_columns, inplace=True)
    data = numerical_imputer(data)

    data, y_data = create_data(data)

    data = add_important_features(data)

    data_norm = data.copy()
    data_norm = normalize_data_2(data_norm)

    return data_norm, y_data, data

In [23]:
def preprocessing_test(test_dataset):
    """
    Preprocess the test dataset.

    Parameters:
    - test_dataset: DataFrame containing the test dataset.

    Returns:
    - data_norm: DataFrame with preprocessed and normalized data.
    - data: DataFrame with preprocessed data.

    Steps:
    1. Replace 'N,A', None, 'nan', np.inf, and -np.inf with NaN.
    2. Fill missing values in date columns with the most frequent date.
    3. Fill missing values in change status columns with the median.
    4. Add date-related features.
    5. Encode categorical columns using one-hot encoding.
    6. Drop unnecessary columns.
    7. Impute missing numerical values.
    8. Add important features.
    9. Normalize the data.

    """
    data = test_dataset.copy().reset_index(drop=True)
    data = data.replace(['N,A', None, 'nan', np.inf, -np.inf], np.nan)

    date_columns = ['date0', 'date1', 'date2', 'date3', 'date4']

    data = data.fillna(pd.NaT).astype(object).fillna(np.nan)

    date_imputer = SimpleImputer(strategy='most_frequent')
    data[date_columns] = date_imputer.fit_transform(data[date_columns])

    status_imputer = SimpleImputer(strategy='median')
    change_status_columns = [f'change_status_date{i}' for i in range(0, 5)]
    data[change_status_columns] = status_imputer.fit_transform(data[change_status_columns])

    data = add_date_features(data, date_columns)

    columns_to_encode = ['urban_type', 'geography_type']
    for column in columns_to_encode:
        data = dumb_hot_encoder(data, column)
    data.drop(columns=['urban_type_nan', 'geography_type_nan'], inplace=True)

    try:
        data.drop(columns='Unnamed: 0', inplace=True)
    except:
        pass

    data.drop(columns=date_columns, inplace=True)
    data = numerical_imputer(data)    
    data = add_important_features(data)
    data_norm = data.copy()
    data_norm = normalize_data_2(data_norm)

    return data_norm, data

In [24]:
def find_important_features(X_data, y_data):
    """
    Find important features using a Random Forest classifier.

    Parameters:
    - X_data: DataFrame containing the features.
    - y_data: Series containing the target labels.

    Returns:
    - selected_feature_names: List of selected feature names.

    Steps:
    1. Initialize a Random Forest classifier.
    2. Initialize SelectFromModel with the Random Forest classifier and the desired threshold or number of features.
    3. Fit SelectFromModel to the data and transform the data.
    4. Get the indices of selected features.
    5. Get the names of selected features.
    6. Print the selected feature names.
    7. Return the list of selected feature names.

    """
    rf_classifier = rf()

    sfm = SelectFromModel(estimator=rf_classifier, threshold='median')

    X_selected = sfm.fit_transform(X_data, y_data)

    selected_mask = sfm.get_support()

    selected_indices = [i for i, selected in enumerate(selected_mask) if selected]

    selected_feature_names = [X_data.columns[i] for i in selected_indices]
    print("Selected features:", selected_feature_names)
    return selected_feature_names

In [25]:
def evaluate_model(model, X, y, type='Training'):
    """
    Evaluate the performance of a model.

    Parameters:
    - model: Trained classifier model.
    - X: DataFrame containing the features.
    - y: Series containing the target labels.
    - type: Type of evaluation (e.g., 'Training', 'Validation', 'Test').

    Steps:
    1. Print the evaluation type.
    2. Predict the labels using the model.
    3. Calculate F1 scores (macro, micro, weighted) and accuracy score.
    4. Print the F1 scores and accuracy score.

    """
    print(f"==================={type}===============")
    pred_y = model.predict(X)
    score_macro = f1_score(y, pred_y, average='macro')
    score_micro = f1_score(y, pred_y, average='micro')
    score_weighted = f1_score(y, pred_y, average='weighted')
    score = model.score(X, y)
    print(f"-> score macro: {score_macro}")
    print("----------------------------")
    print(f"-> score micro: {score_micro}")
    print(f"-> score weighted: {score_weighted}")
    print(f"-> score      : {score}")

### Importing Data

In [None]:
train_df_original = gpd.read_file('../data/train.geojson', index_col=0)
test_df_original = gpd.read_file('../data/test.geojson', index_col=0)

### Data Processing

#### Creating copies

In [None]:
train_df = train_df_original.copy()
test_df = test_df_original.copy()

train_df = train_df.replace({'N,A':np.nan, None: np.nan})
train_df.reset_index(drop=True, inplace=True)
data = prepare_data(train_df)
test_df = prepare_data(test_df)

#### Fixing columns

In [None]:
data_filtered = filter_train_data(data)
data_fix_columns = fix_column_names(data_filtered)
data_fix_columns_test = fix_column_names(test_df)

In [None]:
change_status_columns = [f'change_status_date{i}' for i in range(0, 5)]
data_fix_status = fix_change_status(data_fix_columns,change_status_columns)
data_fix_status_test = fix_change_status(data_fix_columns_test,change_status_columns)

In [None]:
data_fix_date = fix_date_columns(data_fix_status,change_status_columns)
data_fix_date_test = fix_date_columns(data_fix_status_test,change_status_columns,'_test')

#### Importing data with fixed columns

In [26]:
train_df = pd.read_csv('../data/data_fixed_type.csv', index_col=0)
test_df = pd.read_csv('../data/data_fixed_type_test.csv', index_col=0)

#### Processing data

In [27]:
X_data, y_data, X_data_pre_norm = preprocess_training(train_df)
X_submit, X_submit_pre_norm = preprocessing_test(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_col_not_nan.drop(columns=[col], inplace=True)
  df_col_not_nan.replace({True: 1, False: 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_col_not_nan.replace({True: 1, False: 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_col_not_nan.drop(columns=np.nan, errors='ignore', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

KeyboardInterrupt: 

In [570]:
X_data_pre_norm = pd.read_csv('X_data_pre_norm_2.csv')
X_submit_pre_norm = pd.read_csv('X_submit_pre_norm_2.csv')

In [571]:
X_data = normalize_data_2(X_data_pre_norm)
X_submit = normalize_data_2(X_submit_pre_norm)

In [511]:
X_data.to_csv('X_data_2.csv')
X_submit.to_csv('X_submit_2.csv')
y_data.to_csv('y_data_2.csv')
X_data_pre_norm.to_csv('X_data_pre_norm_2.csv')
X_submit_pre_norm.to_csv('X_submit_pre_norm_2.csv')

### Selecting important features

In [None]:
selected_feature_names = find_important_features(X_data, y_data)

### Training

In [585]:
test_size = 0.2
X_data = X_data.reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=test_size)

In [586]:
print('----------------RandomForest')

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

----------------RandomForest


In [574]:
with open('random_forest_model_2.pkl', 'wb') as file:
    pkl.dump(rf, file)

In [None]:
evaluate_model(rf, X_train, y_train)

evaluate_model(rf, X_test, y_test, type='Test')

y_pred = rf.predict(X_test)

In [469]:
#Change order of columns
X_submission = X_submit[X_data.columns]
# X_submission = X_submission

pred_y = rf.predict(X_submission)
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
pred_df.to_csv("new_features_submission.csv", index=True, index_label='Id')