<h1><b>[III - Feature extraction 3]</b></h1>

This notebook extracts the more advanced features `neighb_maxdiff_change_status_date0_0.001` or `neighb_maxdiff_change_status_date0_0.003`.

It requires several hours to compute. Thus, the computed versions are also already available:

=> <b>it is not mandatory to run this notebook</b> (but it is provided for reproductibility).

Once computed, the features are saved in `diffs_change_status0.001.npy` or `diffs_change_status0.003.npy`

______________________

Please choose either 0.001 or 0.003

(to compute everything again, you should run two times this notebook, with each of these values)

In [None]:
radius = 0.001

______________________

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)  

In [3]:
## Read csvs
import time
start = time.time()

train_df_geojson = gpd.read_file('data/train.geojson', engine="pyogrio")
test_df_geojson = gpd.read_file('data/test.geojson', engine="pyogrio")

end = time.time()
print("Reading time:", end - start,"s")

Reading time: 100.24508786201477 s


In [4]:
train_df = train_df_geojson.copy()
test_df = test_df_geojson.copy()

In [5]:


# renaming all dates related columns
for df in [train_df, test_df]:
    for i in range(5):
        df[f"date{i}"] = pd.to_datetime(df[f"date{i}"], format="%d-%m-%Y")
        df.rename(columns={f"date{i}": f"date{i}_old",
                           f"img_red_mean_date{i+1}": f"img_red_mean_date{i}_old",
                           f"img_green_mean_date{i+1}": f"img_green_mean_date{i}_old",
                           f"img_blue_mean_date{i+1}": f"img_blue_mean_date{i}_old",
                           f"img_red_std_date{i+1}": f"img_red_std_date{i}_old",
                           f"img_green_std_date{i+1}": f"img_green_std_date{i}_old",
                           f"img_blue_std_date{i+1}": f"img_blue_std_date{i}_old",
                           f"change_status_date{i}": f"change_status_date{i}_old"}, inplace=True)



In [6]:
# Toutes les colonnes qu'on doit réorganiser
col_prefixes = ["img_red_mean_date", "img_green_mean_date", "img_blue_mean_date", "img_red_std_date", "img_green_std_date", "img_blue_std_date", "change_status_date", "date"]

for df in [train_df, test_df]:
    # df avec les dates
    dftemp=df[[f"date{i}_old" for i in range(5)]].replace(np.nan, pd.to_datetime("01-01-1970", format="%d-%m-%Y"))
    
    # Tableau numpy tq new_to_old[iLigne, nouvelleDate] = ancienneDate
    new_to_old = np.argsort(dftemp.values, axis=1)

    for col_prefix in col_prefixes:
        for iNewDate in range(5):
            # Ajout d'une nouvelle ligne
            newColName = f"{col_prefix}{iNewDate}"
            df.insert(len(df.columns), newColName, None)
            for iOldDate in range(5):
                oldColName = f"{col_prefix}{iOldDate}_old"
                # Pour chaque ancienne date correspondante, on met à jour
                df.loc[(new_to_old[df.index, iNewDate] == iOldDate), newColName] = df.loc[(new_to_old[df.index, iNewDate] == iOldDate), oldColName]

In [7]:


# Converting back to float / datetime
col_imgs = ["img_red_mean_date", "img_green_mean_date", "img_blue_mean_date", "img_red_std_date", "img_green_std_date", "img_blue_std_date"]
for df in [train_df, test_df]:
    for col in col_imgs:
        for i in range(5):
            col_name = f"{col}{i}"
            df[col_name] = df[col_name].astype(float)
    for iDate in range(5):
        col_name = f'date{iDate}'
        df[col_name] = pd.to_datetime(df[col_name])



In [8]:


train_long = np.array(train_df["geometry"].centroid.x)
train_lat = np.array(train_df["geometry"].centroid.y)
train_coords = np.stack((train_long, train_lat), axis=1)

test_long = np.array(test_df["geometry"].centroid.x)
test_lat = np.array(test_df["geometry"].centroid.y)
test_coords = np.stack((test_long, test_lat), axis=1)



In [9]:
all_coords = np.concatenate([train_coords, test_coords])
print(all_coords.shape)

(416672, 2)


In [10]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=100, algorithm='ball_tree').fit(all_coords)
distances, indices = model.kneighbors(all_coords)

In [11]:
dfALL = pd.concat([train_df, test_df], axis=0)

In [14]:
indices[3]

array([303484,      3, 303486,      5,      2, 303483, 303488,      7,
       303482,      1, 303485,      4,      8, 303489, 303481,      0,
       303487,      6, 303497,     16,     15, 303496, 303655,    174,
           14, 303495,    173, 303654,    175, 303656,     13, 303494,
           12, 303493, 303490,      9,     19, 303500, 303492,     11,
       303491,     10,    172, 303653,    176, 303657, 303507,     26,
       303504,     23,     20, 303501, 303498,     17, 303499,     18,
          167, 303648, 303508,     27,    170, 303651, 303505,     24,
           21, 303502,     25, 303506,     22, 303503,    171, 303652,
       303645,    164, 303647,    166, 303646,    165,     28, 303509,
       303511,     30, 303667,    186,    163, 303644,     31, 303512,
       303649,    168,     33, 303514, 303658,    177,    178, 303659,
          162, 303643, 303650,    169])

In [15]:
distances[3]

array([0.        , 0.        , 0.00146736, 0.00146736, 0.00169755,
       0.00169755, 0.00276137, 0.00276137, 0.00300037, 0.00300037,
       0.00356845, 0.00356845, 0.00424826, 0.00424826, 0.00426781,
       0.00426781, 0.00447907, 0.00447907, 0.0054329 , 0.0054329 ,
       0.0055349 , 0.0055349 , 0.00598076, 0.00598076, 0.00609144,
       0.00609144, 0.00663422, 0.00663422, 0.00699169, 0.00699169,
       0.00699795, 0.00699795, 0.00700343, 0.00700343, 0.00718498,
       0.00718498, 0.00754068, 0.00754068, 0.0076417 , 0.0076417 ,
       0.00788595, 0.00788595, 0.00797697, 0.00797697, 0.00837028,
       0.00837028, 0.00870424, 0.00870424, 0.00876148, 0.00876148,
       0.00876174, 0.00876174, 0.00881964, 0.00881964, 0.00888173,
       0.00888173, 0.00903786, 0.00903786, 0.00904211, 0.00904211,
       0.00913185, 0.00913185, 0.00913297, 0.00913297, 0.00921176,
       0.00921176, 0.00922659, 0.00922659, 0.00933259, 0.00933259,
       0.00933311, 0.00933311, 0.00955275, 0.00955275, 0.00959

In [16]:
change_status_map = {
    "Prior Construction": 0,
    "Greenland": 1,
    "Land Cleared": 2,
    "Excavation": 3,
    "Materials Dumped": 4,
    "Materials Introduced": 5,
    "Construction Started": 6,
    "Construction Midway": 7,
    "Construction Done": 8,
    "Operational": 9,
}

In [17]:
NB_TRAINS = len(train_df)

In [18]:
diffs_change_status = [[] for i in range(5)]
for iActuel in range(len(indices)):
    for iDate in range(5):
        maxDiff = 0
        diffActuel = 0
        nbVisited = 0
        for indice, dist in zip(indices[iActuel], distances[iActuel]):
            nbVisited += 1
            if dist >= radius or nbVisited > 5:
                break
            if dfALL.iloc[iActuel][f"change_status_date{iDate}"] not in change_status_map or dfALL.iloc[indice][f"change_status_date{iDate}"] not in change_status_map:
                continue
            diff = change_status_map[dfALL.iloc[iActuel][f"change_status_date{iDate}"]] - change_status_map[dfALL.iloc[indice][f"change_status_date{iDate}"]]
            if abs(diff) > maxDiff:
                maxDiff = abs(diff)
                diffActuel = diff
        diffs_change_status[iDate].append(diff)
    if iActuel%1000 == 0:
        print(iActuel,"/", len(indices))

0 / 416672
1000 / 416672
2000 / 416672
3000 / 416672
4000 / 416672
5000 / 416672
6000 / 416672
7000 / 416672
8000 / 416672
9000 / 416672
10000 / 416672
11000 / 416672
12000 / 416672
13000 / 416672
14000 / 416672
15000 / 416672
16000 / 416672
17000 / 416672
18000 / 416672
19000 / 416672
20000 / 416672
21000 / 416672
22000 / 416672
23000 / 416672
24000 / 416672
25000 / 416672
26000 / 416672
27000 / 416672
28000 / 416672
29000 / 416672
30000 / 416672
31000 / 416672
32000 / 416672
33000 / 416672
34000 / 416672
35000 / 416672
36000 / 416672
37000 / 416672
38000 / 416672
39000 / 416672
40000 / 416672
41000 / 416672
42000 / 416672
43000 / 416672
44000 / 416672
45000 / 416672
46000 / 416672
47000 / 416672
48000 / 416672
49000 / 416672
50000 / 416672
51000 / 416672
52000 / 416672
53000 / 416672
54000 / 416672
55000 / 416672
56000 / 416672
57000 / 416672
58000 / 416672
59000 / 416672
60000 / 416672
61000 / 416672
62000 / 416672
63000 / 416672
64000 / 416672
65000 / 416672
66000 / 416672
67000 / 

In [19]:
if radius == 0.001:
    np.save("data/diffs_change_status0.001", diffs_change_status)
else:
    np.save("data/diffs_change_status0.003", diffs_change_status)