<h1><b>[IV - Feature extraction 4]</b></h1>

This notebook extracts the more advanced features `neighb_maxdiff_meancolor_{iDate}` or `neighb_maxdiff_stdcolor_{iDate}`.

It takes several hours to compute, so precomputed versions of the output files are already provided:

=> <b>it is not mandatory to run this notebook</b> (but it is provided for reproducibility).

Once computed, the features are saved in `neighb_maxdiff_meancolor_{iDate}.csv` or `neighb_maxdiff_stdcolor_{iDate}.csv`

____________________________________

Please choose `method` (`"mean"` or `"std"`) and `iDate` (an integer between 0 and 4 inclusive) in the two cells below.

(To recompute everything, you will have to run this notebook once for every possible combination: 2 methods × 5 dates = 10 runs.)

In [1]:
method = "mean"  # or method = "std"
# Fail fast: this value is interpolated into column names and into the output
# file name much later, after the (slow) data loading has already happened.
if method not in ("mean", "std"):
    raise ValueError(f"method must be 'mean' or 'std', got {method!r}")

In [None]:
iDate = 0  # between 0 and 4 included
# Fail fast: this value is interpolated into column names and into the output
# file name much later, after the (slow) data loading has already happened.
if not (isinstance(iDate, int) and 0 <= iDate <= 4):
    raise ValueError(f"iDate must be an integer between 0 and 4 included, got {iDate!r}")

__________________________________________

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

# Show all columns when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
## Read the train / test GeoJSON files and report how long it took
import time
start = time.time()

train_df_geojson, test_df_geojson = (
    gpd.read_file(geojson_path, engine="pyogrio")
    for geojson_path in ('data/train.geojson', 'data/test.geojson')
)

end = time.time()
print("Reading time:", end - start,"s")

Reading time: 83.1383125782013 s


In [4]:
# Work on copies: the following cells modify the frames in place, and the
# freshly read `*_geojson` frames are slow to reload.
train_df, test_df = (raw_frame.copy() for raw_frame in (train_df_geojson, test_df_geojson))

In [5]:


# Parse the date columns, then tag every date-related column with an "_old"
# suffix. Note the image statistics are numbered 1..5 in the input while the
# dates / change statuses are numbered 0..4: both end up 0-based here.
for df in [train_df, test_df]:
    old_names = {}
    for i in range(5):
        df[f"date{i}"] = pd.to_datetime(df[f"date{i}"], format="%d-%m-%Y")
        old_names.update({
            f"date{i}": f"date{i}_old",
            f"img_red_mean_date{i+1}": f"img_red_mean_date{i}_old",
            f"img_green_mean_date{i+1}": f"img_green_mean_date{i}_old",
            f"img_blue_mean_date{i+1}": f"img_blue_mean_date{i}_old",
            f"img_red_std_date{i+1}": f"img_red_std_date{i}_old",
            f"img_green_std_date{i+1}": f"img_green_std_date{i}_old",
            f"img_blue_std_date{i+1}": f"img_blue_std_date{i}_old",
            f"change_status_date{i}": f"change_status_date{i}_old",
        })
    # One rename per frame, applied once all five dates have been parsed.
    df.rename(columns=old_names, inplace=True)



In [6]:
# Every column family that has to be re-ordered chronologically
col_prefixes = ["img_red_mean_date", "img_green_mean_date", "img_blue_mean_date", "img_red_std_date", "img_green_std_date", "img_blue_std_date", "change_status_date", "date"]

for df in [train_df, test_df]:
    # Frame holding only the dates; NaN entries are replaced by 01-01-1970 so
    # that they take part in the sort as that date
    dftemp = df[[f"date{i}_old" for i in range(5)]].replace(np.nan, pd.to_datetime("01-01-1970", format="%d-%m-%Y"))

    # numpy array such that new_to_old[iRow, newDate] = oldDate
    new_to_old = np.argsort(dftemp.values, axis=1)

    for col_prefix in col_prefixes:
        for iNewDate in range(5):
            # Append the new (initially empty) column at the end of the frame
            newColName = f"{col_prefix}{iNewDate}"
            df.insert(len(df.columns), newColName, None)
            # For each row, the old date slot that feeds this new slot
            source_slot = new_to_old[df.index, iNewDate]
            for iOldDate in range(5):
                oldColName = f"{col_prefix}{iOldDate}_old"
                # Copy the old value into every row that maps to this old slot
                rows_from_old = source_slot == iOldDate
                df.loc[rows_from_old, newColName] = df.loc[rows_from_old, oldColName]

In [7]:


# Converting back to float / datetime.
# The re-ordered columns were created from `None` placeholders, so they are cast
# explicitly here before any arithmetic is done on them.
col_imgs = ["img_red_mean_date", "img_green_mean_date", "img_blue_mean_date", "img_red_std_date", "img_green_std_date", "img_blue_std_date"]
for df in [train_df, test_df]:
    for col in col_imgs:
        for i in range(5):
            col_name = f"{col}{i}"
            df[col_name] = df[col_name].astype(float)
    # NOTE: this loop variable must not be named `iDate`. Notebook cells share a
    # single global namespace, so `for iDate in range(5)` would overwrite the
    # `iDate` parameter chosen at the top of the notebook and leave it at 4,
    # making every run compute (and save) the feature for date 4.
    for i in range(5):
        col_name = f'date{i}'
        df[col_name] = pd.to_datetime(df[col_name])



In [8]:


# Centroid (x, y) of every training geometry, one row per geometry
train_centroids = train_df["geometry"].centroid
train_long = train_centroids.x.to_numpy()
train_lat = train_centroids.y.to_numpy()
train_coords = np.column_stack((train_long, train_lat))

# Same for the test geometries
test_centroids = test_df["geometry"].centroid
test_long = test_centroids.x.to_numpy()
test_lat = test_centroids.y.to_numpy()
test_coords = np.column_stack((test_long, test_lat))



In [9]:
# Train rows first, then test rows (the same order is used when the two
# frames are concatenated into `dfALL` further down)
all_coords = np.vstack((train_coords, test_coords))
print(all_coords.shape)

(416672, 2)


In [10]:
from sklearn.neighbors import NearestNeighbors

# Ball-tree index over every centroid (train + test).
# `n_neighbors` only matters for k-neighbour queries; the radius query below
# does not use it.
model = NearestNeighbors(n_neighbors=12, algorithm='ball_tree')
model.fit(all_coords)

# For each point: every indexed point within a radius of 0.003 (in the units
# of the centroid coordinates). The query point itself is part of its own
# result, and the results are not sorted by distance.
distances, indices = model.radius_neighbors(all_coords, radius=0.003)

In [11]:
# Sanity check: distances from the first point to each of its radius neighbours
distances[0]

array([0.00126759, 0.00126759, 0.00257037, 0.00257037, 0.        ,
       0.        ])

In [12]:
# Sanity check: positions (rows of `all_coords`) of the first point's radius neighbours
indices[0]

array([     1, 303482, 303483,      2, 303481,      0])

In [13]:
# Stack train on top of test, in the same order as `all_coords`
dfALL = pd.concat([train_df, test_df])

In [14]:
# Give dfALL a fresh 0..n-1 index: the feature computation below feeds the
# index labels to `.iloc`, so labels must coincide with row positions
dfALL = dfALL.reset_index(drop=True)

In [17]:
# Integer code of each change-status label: the code is the label's position
# in the list below.
change_status_map = {
    status: code
    for code, status in enumerate([
        "Prior Construction",
        "Greenland",
        "Land Cleared",
        "Excavation",
        "Materials Dumped",
        "Materials Introduced",
        "Construction Started",
        "Construction Midway",
        "Construction Done",
        "Operational",
    ])
}

In [18]:
# Number of training rows (they come first in `dfALL` / `all_coords`)
NB_TRAINS = train_df.shape[0]

In [22]:
# Neighbour positions of the first point (same check as the earlier `indices[0]` cell)
indices[0]

array([     1, 303482, 303483,      2, 303481,      0])

In [23]:
def calculate_max_difference(i, iDate):
    """Largest absolute colour gap between row ``i`` and its radius neighbours.

    For every neighbour listed in ``indices[i]``, the signed differences
    (neighbour minus row ``i``) of the red, green and blue ``{method}``
    statistics at date ``iDate`` are summed, and the absolute value of that sum
    is taken. The maximum over all neighbours is returned. Because the signed
    differences are summed before the absolute value, channel differences of
    opposite sign cancel each other.

    Reads the notebook-level ``dfALL``, ``indices`` and ``method``.

    Parameters
    ----------
    i : int
        Row position in ``dfALL`` (and in ``indices``).
    iDate : int
        Index of the date whose colour columns are compared.

    Returns
    -------
    float
        The maximum absolute summed difference, NaN values being skipped;
        NaN when no neighbour yields a non-NaN value.
    """
    neighbour_rows = indices[i]

    # Work on the three float columns directly instead of materialising whole
    # rows of the wide, mixed-dtype frame with `dfALL.iloc[...]` on every call.
    red = dfALL[f"img_red_{method}_date{iDate}"].to_numpy()
    green = dfALL[f"img_green_{method}_date{iDate}"].to_numpy()
    blue = dfALL[f"img_blue_{method}_date{iDate}"].to_numpy()

    # Same left-to-right operation order as a chained Series expression, so the
    # floating-point result is unchanged.
    abs_diff = np.abs(
        red[neighbour_rows] - red[i] +
        green[neighbour_rows] - green[i] +
        blue[neighbour_rows] - blue[i]
    )

    # Mirror pandas' `Series.max()`: skip NaN, and give NaN if nothing is left.
    if np.isnan(abs_diff).all():
        return np.nan
    return np.nanmax(abs_diff)

# One value per row of dfALL, for the date selected at the top of the notebook.
# dfALL has a 0..n-1 index, so every index label is also a row position.
result = pd.DataFrame(dfALL.index.map(lambda i: calculate_max_difference(i, iDate)), columns=[f"neighb_maxdiff_{method}color_{iDate}"])

In [24]:
# Display the computed feature column (one row per row of dfALL)
result

Unnamed: 0,neighb_maxdiff_meancolor_3
0,19.854632
1,16.455969
2,31.416012
3,34.987111
4,120.946772
...,...
416667,221.480656
416668,247.725578
416669,332.810042
416670,309.940239


In [25]:
# Write the feature to disk; the file name carries the chosen method and date
output_csv = f"data/neighb_maxdiff_{method}color_{iDate}.csv"
result.to_csv(output_csv, index=False)