In [1]:
import pandas as pd
from numpy import array, hstack

In [2]:
data = pd.read_csv("eq_1960_2022.csv")

In [3]:
data.head()

Unnamed: 0,time,latitude,longitude,depth,mag
0,2021-12-01,35.228,141.4204,10.0,4.7
1,2021-12-01,35.2774,141.5336,10.0,4.7
2,2021-12-01,33.769,137.1036,362.75,4.0
3,2021-12-01,35.2014,141.4165,10.0,4.5
4,2021-12-01,35.143,141.3281,10.0,4.9


## Data Preprocessing

### Division en zones

On récupère le code précèdent

In [4]:
from collections import namedtuple

Coords = namedtuple("Coords", "lat_min lat_max lon_min lon_max")

In [5]:
zones = {"A": Coords(41, 46, 140, 146), "B": Coords(37, 41, 138, 142), "C": Coords(34, 37, 137, 141), "D": Coords(33, 37, 133, 137), "E": Coords(31, 36, 129, 133), "F": Coords(38, 41, 142, 144), "G": Coords(36, 38, 141, 143)}

def to_5_zone(lat: float, lon: float):
    """Convert coordinates to a zone number according to the map arrangement above"""
    for zone, coord in zones.items():
        if (coord.lat_min <= lat <= coord.lat_max) & (coord.lon_min <= lon <= coord.lon_max):
            return zone
    return 0


In [6]:
data["zone"] = data.apply(lambda x: to_5_zone(x['latitude'], x['longitude']), axis=1)

Pour simplifier, nous utiliserons 5 zones découpées ainsi : trois zones en zone verte et 2 en zone bleues (cf code d'Ibrahima).  
Ces zones sont celles qui nous intéressent le plus car directement situé sur l'île. Cependant, nous utiliserons également 2 zones supplémentaires dans l'océan pacifique où se trouve le plus grand nombre d'EQ.  
  
![carte_japon.jpg](carte_japon.jpg)

zone A : île nord (Hokkaido) 41°N à 46°N et 140°E à 146°E  
zone B : Tohoku district, 37°N à 41°N et 138°E à 142°E  
zone C : Kanto et Chubu districts, 34°N à 37°N et 137°E à 141°E  
zone D : Kinki, Chugoku et Shikoku districts, 33°N à 37°N et 133°E à 137°E  
zone E : Kyushu district, 31°N à 36°N et 129°E à 133°E  
zone F : 38°N à 41°N et 142°E à 144°E  
zone G : 36°N à 38°N et 141°E à 143°E

In [7]:
data.head()

Unnamed: 0,time,latitude,longitude,depth,mag,zone
0,2021-12-01,35.228,141.4204,10.0,4.7,0
1,2021-12-01,35.2774,141.5336,10.0,4.7,0
2,2021-12-01,33.769,137.1036,362.75,4.0,0
3,2021-12-01,35.2014,141.4165,10.0,4.5,0
4,2021-12-01,35.143,141.3281,10.0,4.9,0


In [8]:
data.zone.value_counts()

0    14129
C     4334
A     3928
F     3589
B     3170
G     3039
E      989
D      890
Name: zone, dtype: int64

In [9]:
data[data.mag >= 4.5].zone.value_counts()

0    7659
F    2284
C    2071
A    2018
B    1721
G    1675
E     528
D     302
Name: zone, dtype: int64

On va s'intéresser uniquement au sinistre de mag >= 4.5 dans les zones précèdentes

In [10]:
df = data[(data.mag >= 4.5) & (data.zone != 0)]

In [11]:
df["time"] = pd.to_datetime(df.time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [79]:
df

Unnamed: 0,time,latitude,longitude,depth,mag,zone
13,2021-12-01,36.1875,140.0911,51.68,5.0,C
20,2021-12-02,37.7216,141.7845,57.22,4.8,B
23,2021-12-02,35.5460,139.0340,26.64,4.5,C
24,2021-12-02,45.8325,142.5511,314.40,4.6,A
26,2021-12-03,33.8337,135.0775,12.08,5.2,D
...,...,...,...,...,...,...
34063,1960-03-23,39.1370,143.5980,25.00,5.9,F
34064,1960-03-23,39.1460,143.5640,25.00,5.8,F
34065,1960-03-23,39.2350,143.3030,15.00,6.2,F
34066,1960-02-04,38.6670,143.0140,25.00,6.1,F


On récupère le nombre d'EQ par mois

In [12]:
import numpy as np

In [13]:
zone_to_col = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6}

def to_full_dataset():
    T = (2022 - 1960) * 12  # nb de mois total
    M = 7  # nb de zones
    X_full = np.zeros((T, M), dtype=np.uint8)  # matrice contenant le nb d'EQ avec en ligne les mois par ordre chronologique et en colonne les zones
    for tup in df.itertuples():
        irow = (tup.time.year - 1960)* 12 + tup.time.month - 1
        icol = zone_to_col[tup.zone]
        X_full[irow, icol] += 1
    
    return X_full

In [14]:
X_full = to_full_dataset()

In [15]:
X_full.shape

(744, 7)

On sépare les données en train set et test set en prenant les données de 1960 à 2014 pour le train set et le reste pour le test set

In [16]:
X_train = X_full[:55*12]
X_train.shape

(660, 7)

In [17]:
X_test = X_full[55*12:]
X_test.shape

(84, 7)

On va séparer les inputs des outputs. On va utiliser une lookback window de $\tau = 60 \text{ mois}$. On n eprend pas de padding, ce qui veut dire que les inputs commencent à $ t = \tau+ 1 $

In [18]:
tau = 60
X_0 = X_test[:tau]
X_0.shape

(60, 7)

In [19]:
y_0 = X_test[tau + 1]
y_0.shape

(7,)

In [114]:
np.array([X_train[6,:], X_train[189]])

array([[0, 0, 0, 0, 0, 1, 0],
       [5, 1, 0, 0, 1, 0, 0]], dtype=uint8)

In [113]:
X_train[:4]

array([[ 0,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  0,  0,  0, 10,  0],
       [ 0,  0,  0,  0,  0,  0,  0]], dtype=uint8)

In [20]:
def split_input(dataset: np.array, tau: int, padding=False):
    """Convert dataset to input output arrays for time series prediction
    params: dataset: the dataset to perform the split with. Time component in dimension 0
            tau: lookback window ( tau < len(dataset) )
    returns: X, y: input and output array of shape (n - tau - 1, tau, n_features) and (n - tau - 1, n_features)"""
    X, Y = [], []
    n = len(dataset)
    assert tau < n, "tau > len(dataset)"
    assert tau > 0, "padding < 1"

    # without padding
    for i in range(n - tau - 1):
        X.append(dataset[i:i + tau])
        Y.append(dataset[i + tau + 1])
    
    return np.array(X), np.array(Y)


In [21]:
X, y = split_input(X_train, 60)

In [22]:
X.shape

(599, 60, 7)

Pour résumer, $X$ contient donc des matrices de taille (60, 7) avec 60 la taille de la lookback window et 7 le nombre de features (ici les zones).  
On essaiera plus tard d'avoir un nombre de feature en 2D comme first model

## Naïve Model

Avec ce type de données, on fait façe à un problème de multi output regression. De plus, les valeurs souhaitées doivent être positives et entières, ce qui nous oriente sur une fonction d'activation relu.

In [64]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

n_features = X.shape[2]

model = Sequential()
model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(tau, n_features), dropout=.4))
model.add(LSTM(128, activation='tanh', dropout=.4))
model.add(Dense(n_features, activation='relu'))
model.compile(optimizer='adam', loss='mse')

In [53]:
model.output_shape

(None, 7)

In [56]:
model.input_shape

(None, 60, 7)

In [37]:
from tensorflow import keras

In [38]:
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=10)

In [49]:
history = model.fit(X, y, epochs=200, verbose=1, callbacks=[callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [54]:
y_hat = model.predict(X[200])

ValueError: in user code:

    File "C:\Users\asus\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\asus\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\asus\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\asus\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "C:\Users\asus\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\asus\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_4" is incompatible with the layer: expected shape=(None, 60, 7), found shape=(None, 7)


In [51]:
np.floor(y_hat)

array([[2., 1., 2., 0., 0., 1., 1.],
       [2., 1., 2., 0., 0., 2., 1.],
       [2., 1., 2., 0., 0., 2., 1.],
       [2., 1., 2., 0., 0., 2., 1.],
       [2., 1., 2., 0., 0., 2., 1.],
       [2., 1., 2., 0., 0., 1., 1.],
       [3., 1., 2., 0., 0., 1., 1.],
       [4., 1., 1., 0., 0., 1., 1.],
       [4., 1., 1., 0., 0., 2., 1.],
       [5., 2., 2., 0., 0., 2., 1.],
       [4., 3., 2., 0., 0., 3., 2.],
       [3., 2., 2., 0., 0., 2., 1.],
       [3., 2., 2., 0., 0., 2., 1.],
       [3., 2., 2., 0., 0., 2., 1.],
       [3., 2., 2., 0., 0., 2., 1.],
       [2., 2., 2., 0., 0., 2., 1.],
       [3., 1., 2., 0., 0., 2., 1.],
       [3., 1., 2., 0., 0., 2., 1.],
       [3., 2., 2., 0., 0., 2., 1.],
       [3., 2., 2., 0., 0., 2., 1.],
       [3., 2., 2., 0., 0., 2., 1.],
       [5., 1., 2., 0., 1., 1., 1.],
       [7., 1., 1., 0., 0., 1., 1.],
       [7., 1., 1., 0., 0., 1., 1.],
       [6., 1., 1., 0., 0., 1., 1.],
       [5., 2., 1., 0., 0., 1., 1.],
       [5., 2., 1., 0., 0., 1., 1.],
 

In [72]:
from keras.layers import RepeatVector, TimeDistributed

n_steps_out = 1

model2 = Sequential()
model2.add(LSTM(128, activation='tanh', input_shape=(tau, n_features), dropout=.4))
model2.add(RepeatVector(n_steps_out))
model2.add(LSTM(128, activation='tanh', return_sequences=True, dropout=.4))
model2.add(TimeDistributed(Dense(n_features, activation='relu')))
model2.compile(optimizer='adam', loss='mse')

In [73]:
history2 = model2.fit(X, y, epochs=200, verbose=1, callbacks=[callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200


In [84]:
y_hat2 = model2.predict(X[200:201])

In [85]:
y_hat2

array([[[3.089209 , 2.4493206, 3.2040713, 0.4490118, 0.6375236,
         3.246378 , 1.7775995]]], dtype=float32)

In [87]:
np.round(y_hat2)

array([[[3., 2., 3., 0., 1., 3., 2.]]], dtype=float32)

In [88]:
y[200]

array([2, 0, 1, 0, 0, 2, 1], dtype=uint8)

In [None]:
y_hat_f = model.predict(X_)