In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.neighbors import KNeighborsClassifier

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4, 'Mega Projects': 5}

In [2]:
# Read csvs
train_df = gpd.read_file('data/train.geojson', index_col=0)

## Analyzing Data

In [3]:
# Count number of each change type
change_type_counts = train_df['change_type'].value_counts()
print(change_type_counts)

change_type
Residential      148435
Commercial       100422
Demolition        31509
Road              14305
Industrial         1324
Mega Projects       151
Name: count, dtype: int64


In [4]:
# Count number of missing values in each column
missing_values = train_df.isnull().sum()
print(missing_values)

urban_type                 0
geography_type             0
change_type                0
img_red_mean_date1      1954
img_green_mean_date1    1954
img_blue_mean_date1     1954
img_red_std_date1       1954
img_green_std_date1     1954
img_blue_std_date1      1954
img_red_mean_date2      1954
img_green_mean_date2    1954
img_blue_mean_date2     1954
img_red_std_date2       1954
img_green_std_date2     1954
img_blue_std_date2      1954
img_red_mean_date3      1954
img_green_mean_date3    1954
img_blue_mean_date3     1954
img_red_std_date3       1954
img_green_std_date3     1954
img_blue_std_date3      1954
img_red_mean_date4      1954
img_green_mean_date4    1954
img_blue_mean_date4     1954
img_red_std_date4       1954
img_green_std_date4     1954
img_blue_std_date4      1954
img_red_mean_date5      1954
img_green_mean_date5    1954
img_blue_mean_date5     1954
img_red_std_date5       1954
img_green_std_date5     1954
img_blue_std_date5      1954
date0                   1383
change_status_

In [5]:
# Count number of missing values in each column for each change type
none_counts = train_df.groupby('change_type').apply(lambda x: x.isnull().sum())
print(none_counts)


               urban_type  geography_type  change_type  img_red_mean_date1   
change_type                                                                  
Commercial              0               0            0                 509  \
Demolition              0               0            0                 242   
Industrial              0               0            0                   1   
Mega Projects           0               0            0                   0   
Residential             0               0            0                1163   
Road                    0               0            0                  39   

               img_green_mean_date1  img_blue_mean_date1  img_red_std_date1   
change_type                                                                   
Commercial                      509                  509                509  \
Demolition                      242                  242                242   
Industrial                        1                    1   

In [6]:
# Count number of missing values in each column for each change type as a percentage
none_counts = train_df.groupby('change_type').apply(lambda x: x.isnull().sum() / len(x))
print(none_counts)

               urban_type  geography_type  change_type  img_red_mean_date1   
change_type                                                                  
Commercial            0.0             0.0          0.0            0.005069  \
Demolition            0.0             0.0          0.0            0.007680   
Industrial            0.0             0.0          0.0            0.000755   
Mega Projects         0.0             0.0          0.0            0.000000   
Residential           0.0             0.0          0.0            0.007835   
Road                  0.0             0.0          0.0            0.002726   

               img_green_mean_date1  img_blue_mean_date1  img_red_std_date1   
change_type                                                                   
Commercial                 0.005069             0.005069           0.005069  \
Demolition                 0.007680             0.007680           0.007680   
Industrial                 0.000755             0.000755   

The ratio is very light, it is possible to delete the lines with missing data.

## Dates and Colors Filtering

### Ordering and computing time intervals

In [7]:
# Create the table of train data
train_dates = train_df[[f'date{i}' for i in range(0, 5)]]
print(train_dates)

             date0       date1       date2       date3       date4
0       01-08-2018  09-12-2013  10-09-2016  22-07-2019  24-07-2017
1       01-08-2018  09-12-2013  10-09-2016  22-07-2019  24-07-2017
2       01-08-2018  09-12-2013  10-09-2016  22-07-2019  24-07-2017
3       01-08-2018  09-12-2013  10-09-2016  22-07-2019  24-07-2017
4       01-08-2018  09-12-2013  10-09-2016  22-07-2019  24-07-2017
...            ...         ...         ...         ...         ...
296141  19-11-2014  25-02-2017  27-01-2014  28-03-2018  28-12-2015
296142  19-11-2014  25-02-2017  27-01-2014  28-03-2018  28-12-2015
296143  19-11-2014  25-02-2017  27-01-2014  28-03-2018  28-12-2015
296144  19-11-2014  25-02-2017  27-01-2014  28-03-2018  28-12-2015
296145  19-11-2014  25-02-2017  27-01-2014  28-03-2018  28-12-2015

[296146 rows x 5 columns]


In [8]:
# Calculating arsort indexes of dates
def index_sort_by_dates(date_row):
    return np.argsort(pd.to_datetime(date_row, format='%d-%m-%Y'))

# Test index_sort_by_dates
print(index_sort_by_dates(['18-09-2010', '18-09-2009', '18-09-2000']))

[2 1 0]


In [9]:
# Create the table of arsort indexes of dates
train_index = train_dates[[f'date{i}' for i in range(0, 5)]].apply(index_sort_by_dates, axis=1)
print(train_index)

        date0  date1  date2  date3  date4
0           1      2      4      0      3
1           1      2      4      0      3
2           1      2      4      0      3
3           1      2      4      0      3
4           1      2      4      0      3
...       ...    ...    ...    ...    ...
296141      2      0      4      1      3
296142      2      0      4      1      3
296143      2      0      4      1      3
296144      2      0      4      1      3
296145      2      0      4      1      3

[296146 rows x 5 columns]


### Times Intervals

In [26]:
def time_interval(row):
    date_row = row[:5]
    index_row = row[5:]
    return [int((pd.to_datetime(date_row[index_row[i + 1]], format='%d-%m-%Y') - pd.to_datetime(date_row[index_row[i]], format='%d-%m-%Y')).days) if date_row[index_row[i]] is not None and date_row[index_row[i + 1]] is not None else None for i in range(4)]

# Test time_interval
print(time_interval(['18-09-2020', '18-09-2009', '18-09-2010', '18-09-2011', '18-09-2012', 1, 2, 3, 4, 0]))

[365, 365, 366, 2922]


In [31]:
# Create the table of time intervals
train_intervals = pd.concat([train_dates, train_index], axis=1).apply(time_interval, axis=1)
train_intervals = pd.DataFrame(train_intervals.tolist(), columns=['time_interval1', 'time_interval2', 'time_interval3', 'time_interval4'])
print(train_intervals)

        time_interval1  time_interval2  time_interval3  time_interval4
0               1006.0           317.0           373.0           355.0
1               1006.0           317.0           373.0           355.0
2               1006.0           317.0           373.0           355.0
3               1006.0           317.0           373.0           355.0
4               1006.0           317.0           373.0           355.0
...                ...             ...             ...             ...
296141           296.0           404.0           425.0           396.0
296142           296.0           404.0           425.0           396.0
296143           296.0           404.0           425.0           396.0
296144           296.0           404.0           425.0           396.0
296145           296.0           404.0           425.0           396.0

[296146 rows x 4 columns]


In [33]:
# Count number of None
none_counts = train_intervals.isnull().sum()
print(none_counts)

time_interval1    1383
time_interval2    1383
time_interval3    1448
time_interval4    1448
dtype: int64


In [34]:
# Replace None with mean
train_intervals = train_intervals.fillna(train_intervals.mean())
none_counts = train_intervals.isnull().sum()
print(none_counts)

time_interval1    0
time_interval2    0
time_interval3    0
time_interval4    0
dtype: int64


In [36]:
# Creating array
train_intervals = np.asarray(train_intervals).reshape(-1, 4)
print(train_intervals.shape)

# Normalizing the data
if train_intervals.std(axis=0).any():
    train_intervals = (train_intervals - train_intervals.mean(axis=0)) / train_intervals.std(axis=0)
else:
    train_intervals = (train_intervals - train_intervals.mean(axis=0))

print(train_intervals)

(296146, 4)
[[ 3.05075593 -0.99085077 -0.58895366 -0.61472004]
 [ 3.05075593 -0.99085077 -0.58895366 -0.61472004]
 [ 3.05075593 -0.99085077 -0.58895366 -0.61472004]
 ...
 [-0.86639699 -0.42730469 -0.28552383 -0.36099474]
 [-0.86639699 -0.42730469 -0.28552383 -0.36099474]
 [-0.86639699 -0.42730469 -0.28552383 -0.36099474]]


### Red Mean

In [37]:
# Define the get_ordered_values function
def get_ordered_values(row):
    data = row[:5]
    indexes = row[5:]
    return [data[int(indexes[i])] for i in range(5)]

# Test get_ordered_values
print(get_ordered_values([1, 2, 3, 4, 5, 4, 3, 2, 1, 0]))

[5, 4, 3, 2, 1]


In [38]:
# Red Mean
train_red_mean = pd.concat([train_df[[f'img_red_mean_date{i}' for i in range(1, 6)]], train_index], axis=1).apply(get_ordered_values, axis=1)
train_red_mean = pd.DataFrame(train_red_mean.tolist(), columns=[f'img_red_mean_date{i}' for i in range(1, 6)])
print(train_red_mean)

        img_red_mean_date1  img_red_mean_date2  img_red_mean_date3   
0               125.773062          150.766726          104.614233  \
1               133.097679          184.480155          110.445556   
2               120.713490          148.150431           98.734466   
3               114.819776          148.322747           90.506679   
4               141.514462          170.365008          155.293576   
...                    ...                 ...                 ...   
296141          211.421955          239.297084          186.380103   
296142          196.476812          162.912319          170.780797   
296143           68.845906          111.304320          108.650548   
296144           98.718266          137.374613          157.947368   
296145          104.533730          176.841270          176.222222   

        img_red_mean_date4  img_red_mean_date5  
0                93.371775           92.291347  
1                96.071674          111.001421  
2           

In [None]:
# Replace None with mean
train_red_mean = train_red_mean.fillna(train_red_mean.mean())
none_counts = train_red_mean.isnull().sum()
print(none_counts)

### Normalize data

In [15]:
# Normalize data
train_red_mean = (train_red_mean - train_red_mean.mean()) / train_red_mean.std()
print(train_red_mean)

        img_red_mean_date1  img_red_mean_date2  img_red_mean_date3   
0                 0.210286            0.797181           -0.432982  \
1                 0.398533            1.646722           -0.287877   
2                 0.080252            0.731253           -0.579292   
3                -0.071220            0.735595           -0.784030   
4                 0.614850            1.291036            0.828107   
...                    ...                 ...                 ...   
296141            2.411516            3.028049            1.601654   
296142            2.027417            1.103237            1.213486   
296143           -1.252778           -0.197228           -0.332543   
296144           -0.485039            0.459714            0.894143   
296145           -0.335578            1.454231            1.348888   

        img_red_mean_date4  img_red_mean_date5  
0                -0.646043           -0.680883  
1                -0.580825           -0.208969  
2           

### Red-Green-Blue Means and Svd

In [16]:
config = {'red_mean', 'blue_mean', 'green_mean', 'red_std', 'blue_std', 'green_std'}
train_colors = pd.DataFrame()

for element in config:
    train_element = pd.concat([train_df[[f'img_{element}_date{i}' for i in range(1, 6)]], train_index], axis=1).apply(get_ordered_values, axis=1)
    train_element = pd.DataFrame(train_element.tolist(), columns=[f'img_{element}_date{i}' for i in range(1, 6)])
    train_element = train_element.fillna(train_element.mean())
    train_element = (train_element - train_element.mean()) / train_element.std()
    train_colors = pd.concat([train_colors, train_element], axis=1)

print(train_colors)

        img_green_std_date1  img_green_std_date2  img_green_std_date3   
0                  0.122145             1.546239             0.422036  \
1                 -0.341687             0.877316             0.435797   
2                 -0.083731             1.583832             0.456068   
3                  0.100980             1.324257             0.390201   
4                  0.545223             1.068349             0.769790   
...                     ...                  ...                  ...   
296141            -0.332977             0.412993             0.292753   
296142             1.268435             1.305857             1.978747   
296143            -0.929067            -0.565057            -0.039399   
296144             0.004296             0.306057             1.968091   
296145             0.001648             0.948938             1.094236   

        img_green_std_date4  img_green_std_date5  img_red_std_date1   
0                  0.150581             0.394846    

In [17]:
# Creating array
train_colors = np.asarray(train_colors).reshape(-1, 30)
print(train_colors.shape)

(296146, 30)


## Hot-one Encoding for Urban and Geographical data

### Change Status

In [41]:
# Get change_status by ascending dates
train_change_status = pd.concat([train_df[[f'change_status_date{i}' for i in range(0, 5)]], train_index], axis=1).apply(get_ordered_values, axis=1)
print(train_change_status)

(296146,)


## Area Filtering

### Computing the data

In [18]:
# Area of the building
train_area = np.asarray(train_df[['geometry']].area)


  train_area = np.asarray(train_df[['geometry']].area)


In [19]:
def calculate_convex_hull_area(geometry):
    try:
        convex_hull = geometry.convex_hull
        return convex_hull.area
    except:
        return 0

# Calculate the area of the convex hull for each geometry
train_convex_area = train_df['geometry'].apply(calculate_convex_hull_area)

# Convert to numpy array
train_convex_area = np.asarray(train_convex_area)

In [20]:
# Convex hull ratio
train_convex_ratio = train_area / train_convex_area
train_convex_ratio = train_convex_ratio.reshape(-1, 1)

### Analyze the area parameters

In [21]:
# Number of non convex areas
non_convex_areas_n = np.sum(train_convex_ratio < 0.99)
non_convex_areas_ratio = non_convex_areas_n / len(train_convex_ratio)
print('Number of non convex areas: ' + str(non_convex_areas_n))
print('Ratio of non convex areas of ' + str(non_convex_areas_ratio*100) + '%.')
mean_area = np.mean(train_area)

# Mean and Standard deviation of areas
std_area = np.std(train_area)
print('Mean area: ' + str(mean_area) + ' and std area: ' + str(std_area))

Number of non convex areas: 42369
Ratio of non convex areas of 14.306794621571791%.
Mean area: 5.792798723701533e-07 and std area: 6.544386914765251e-05


Coherent ! a road takes less place than a residential area.

## Stacking all the features

In [22]:
# Normalize area
train_area = (train_area - train_area.mean()) / train_area.std()
train_convex_ratio = (train_convex_ratio - train_convex_ratio.mean()) / train_convex_ratio.std()

# Stacking the features
train_area_features = np.hstack([train_area.reshape(-1, 1), train_convex_ratio])
print(train_area_features)

[[ 0.00363946  0.28623304]
 [-0.00213689  0.28623304]
 [ 0.00369309  0.28623304]
 ...
 [-0.00868493  0.28623304]
 [-0.00882395  0.28623304]
 [-0.00881304  0.28623304]]


## Concatenate all the tables and PCA

### Concatenation

In [23]:
# Concatenating the features
train_features = np.hstack([train_area_features, train_colors, train_intervals])
print(train_features.shape)

(296146, 36)


### Applying PCA

In [25]:
# Applying PCA
W = np.dot(train_features.T, train_features) # compute covariance matrix
eigval, eigvec = np.linalg.eig(W) # compute eigenvalues and eigenvectors of covariance matrix

# Sort eigenvalues and eigenvectors
idx = eigval.argsort()[::-1] # Sort eigenvalues
eigvec = eigvec[:,idx] # Sort eigenvectors according to eigenvalues
eigval = eigval[idx] # Sort eigenvalues

print(np.sum(eigval[:1]) / np.sum(eigval))

LinAlgError: Array must not contain infs or NaNs

### Select the best k to achieve 85% 

In [None]:
k = 1
while k < len(eigval) and (np.sum(eigval[:k])/np.sum(eigval)) < 0.85:   
    k += 1

print('k=' + str(k), '; percent=' + str(int(np.sum(eigval[:k]) / np.sum(eigval)*100)) + '%.')

k=12 ; percent=86%.


In [None]:
# Selecting the first k principal components
train_features = np.dot(train_features, eigvec[:,:k])
print(train_features.shape)
print(train_features)

(292758, 12)
[[-1.34347304 -1.74579613  0.66557092 ... -0.88468115  2.68653112
   0.21884086]
 [-1.48549065 -0.05429729  0.71451344 ... -0.22095987  2.9346425
   0.41605621]
 [-1.19865268 -1.68989846  0.8564808  ... -1.05744206  2.43823516
   0.11941088]
 ...
 [ 5.13592389 -0.51058606  0.07969247 ... -0.13419465 -0.14683372
  -0.10813903]
 [-1.35910374 -0.82818722 -0.00697445 ...  2.56089568 -0.75172278
   0.16893018]
 [-2.49919984  0.36370635  0.29944503 ...  2.22705814 -0.76826445
   0.16163682]]


## Save Train Data

In [None]:
# Save the train features
np.save('preprocessed_data.npy', train_features)

In [None]:
# Save the train labels
train_y = train_df['change_type'].apply(lambda x: change_type_map[x])
print(train_y)

np.save('preprocessed_labels.npy', train_df['change_type'].map(change_type_map).values)

0         1
1         1
2         1
3         1
4         0
         ..
296141    3
296142    2
296143    2
296144    2
296145    2
Name: change_type, Length: 292758, dtype: int64


## Save Test Data

In [None]:
# Adapt the test data to the PCA model
test_df = gpd.read_file('test.geojson', index_col=0)