In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from geopy.distance import geodesic # геодезическое расстояние между точками по поверхности Земли

import keras
from keras.models import Sequential  # Для создания и обучения последовательных нейронных сетей
from keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional

import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

# первичная обработка данных

In [None]:
# Load the training data and keep roughly the first 1% of the rows.
# `.loc` slicing is label-based and inclusive, so the row labelled `last_label` is kept too.
df = pd.read_csv('train.csv')
last_label = len(df) // 100
df = df.loc[:last_label]

In [None]:
# Inspect the drop-off coordinate ranges: (lon min, lon max), (lat min, lat max).
dropoff_lon = df['dropoff_longitude']
dropoff_lat = df['dropoff_latitude']
(dropoff_lon.min(), dropoff_lon.max()), (dropoff_lat.min(), dropoff_lat.max())

In [None]:
# Keep only rows whose drop-off coordinates are geographically valid:
# latitude in [-90, 90] and longitude in [-180, 180]. (The previous bound of 90
# for longitude was the latitude limit applied to the wrong axis.)
# Comparisons against NaN are False, so rows with missing drop-off coordinates
# are removed here as well.
# `.copy()` makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original.
valid_lat = df['dropoff_latitude'].between(-90, 90)
valid_lon = df['dropoff_longitude'].between(-180, 180)
df = df[valid_lat & valid_lon].copy()

In [None]:
# Summary of column dtypes and non-null counts.
df.info()

In [None]:
# Parse the pickup timestamps and store them as timezone-naive values.
# `utc=True` guarantees a tz-aware intermediate result, so `tz_convert(None)`
# cannot raise a TypeError when the input strings carry no timezone suffix.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True).dt.tz_convert(None)
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Impute missing drop-off coordinates.
# Both columns are passed to the imputer together: IterativeImputer estimates
# each feature from the other features, so fitting it on one column at a time
# leaves it nothing to regress on and reduces it to plain mean imputation.
# Assigning the 2-D result to both columns at once also avoids writing an
# (n, 1) array into a single column.
lst = ['dropoff_longitude', 'dropoff_latitude']
imp = IterativeImputer(random_state=42)

df[lst] = imp.fit_transform(df[lst])

df.isna().sum()

# feature engineering

In [None]:
# Geodesic distance (km) from each drop-off point to a fixed reference point.
# NOTE(review): despite the `moscow_center` name, the original inline comment
# describes this point as a New York airport — confirm the intended landmark.
moscow_center = (40.646746, -73.789962)
dropoff_points = zip(df['dropoff_latitude'], df['dropoff_longitude'])
df['distance_to_moscow_center'] = [geodesic(point, moscow_center).km for point in dropoff_points]

In [None]:
# K-means clustering of the drop-off coordinates into two groups.
# A fixed random_state makes the centroids (and therefore the `cluster`
# feature) reproducible across kernel restarts.
kmeans = KMeans(n_clusters=2, random_state=42)
df['cluster'] = kmeans.fit_predict(df[['dropoff_latitude', 'dropoff_longitude']])

In [None]:
# Sine/cosine encoding of the drop-off latitude and longitude (degrees -> radians first).
lat_rad = np.radians(df['dropoff_latitude'])
lon_rad = np.radians(df['dropoff_longitude'])
df['lat_sin'], df['lat_cos'] = np.sin(lat_rad), np.cos(lat_rad)
df['lon_sin'], df['lon_cos'] = np.sin(lon_rad), np.cos(lon_rad)

In [None]:
# Binary flag: 1 when the drop-off lies within `dist` km of the reference point, else 0.
dist = 1
df['indicator'] = (df['distance_to_moscow_center'] <= dist).astype('int64')

In [None]:
# Split the pickup timestamp into calendar components, then drop the raw column.
pickup_ts = df['pickup_datetime'].dt
for part in ('minute', 'hour', 'day', 'month', 'year'):
    df[part] = getattr(pickup_ts, part)
df = df.drop(columns=['pickup_datetime'])

In [None]:
# Preview the engineered frame (first rows only instead of dumping the whole frame).
df.head()

# подготовка данных для обучения

In [None]:
# Feature matrix: every column except the target and the row identifier.
feature_frame = df.drop(columns=['fare_amount', 'key'])
X = feature_frame.to_numpy()
X

In [None]:
# Target vector: the fare amount to be predicted.
fare_column = df.loc[:, 'fare_amount']
y = fare_column.to_numpy()
y

In [None]:
# y.max(), y.min()

In [None]:
# Bin the target into deciles so the train/test split can be stratified by fare level.
# With an ndarray input and labels=False, `pd.qcut` returns a plain integer
# ndarray, which has no .head()/.tail() — slice it instead.
# duplicates='drop' avoids a ValueError when repeated fare values produce
# identical bin edges; it changes nothing when the edges are already unique.
quantile_bins = pd.qcut(y, q=10, precision=1, labels=False, duplicates='drop')
quantile_bins[:5], quantile_bins[-5:]

In [None]:
# Hold out 20% of the rows for evaluation, stratified by fare decile.
split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=quantile_bins,
)
X_train, X_test, y_train, y_test = split

# создание модели

## keras lstm

In [None]:
# Insert a length-1 time axis, (samples, features) -> (samples, 1, features),
# because the LSTM layer expects 3-D input.
X_train_keras = X_train[:, np.newaxis, :]
X_test_keras = X_test[:, np.newaxis, :]

In [None]:
# Build the model: input -> batch norm -> bidirectional LSTM -> dropout -> one regression output.
# The input shape is declared with an explicit Input layer. Previously
# `input_shape` was passed to the LSTM wrapped inside Bidirectional, which is not
# the first layer of the model, so it did not define the model's input.
# NOTE: `LSTM` here must be the Keras layer; a later cell defines a PyTorch class
# with the same name, so this cell has to run before that one.
keras_model = Sequential()
keras_model.add(keras.Input(shape=(X_train_keras.shape[1], X_train_keras.shape[2])))
keras_model.add(BatchNormalization())
keras_model.add(Bidirectional(LSTM(50, activation='relu')))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(1))
keras_model.compile(optimizer='adam', loss='mse')

In [None]:
# Keep only the best model seen during training.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        # Path the checkpoint is written to.
        filepath='keras_model.keras',
        # Together, the two options below overwrite the current checkpoint
        # if and only if `val_loss` improves.
        save_best_only=True,
        monitor='val_loss',
        # Documented verbosity levels are 0 (silent) and 1 (log every save).
        verbose=1)
]

In [None]:
# Train the model; the checkpoint callback stores the weights with the best val_loss.
# NOTE(review): the held-out test split doubles as validation data here, so the
# checkpoint is selected on the same data the model is scored on afterwards.
keras_model.fit(
    X_train_keras,
    y_train,
    epochs=10,
    batch_size=64,
    callbacks=callbacks,
    validation_data=(X_test_keras, y_test),
)

In [None]:
# Load the best model saved by the ModelCheckpoint callback.
keras_model = keras.models.load_model('keras_model.keras')

In [None]:
predictions =  keras_model.predict(X_test_keras) # predict on the validation split

In [None]:
# Hold-out metrics as a tuple: (RMSE, MSE, R^2).
# RMSE is derived from the MSE directly because the `squared` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn releases.
mse = mean_squared_error(y_test, predictions)
np.sqrt(mse), mse, r2_score(y_test, predictions)

## torch lstm

In [None]:
class TorchDataset(Dataset):
    """Map-style dataset pairing each feature row with its target value.

    Args:
        X: indexable collection of feature rows.
        y: indexable collection of targets, aligned with ``X`` by position.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The dataset length is the number of feature rows.
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        # Return the (features, target) pair stored at position `idx`.
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
# Wrap the training split as float32 tensors and show the first (features, target) pair.
train_features = torch.FloatTensor(X_train)
train_targets = torch.FloatTensor(y_train)
t_dataset = TorchDataset(X=train_features, y=train_targets)
t_dataset[0]

In [None]:
# Wrap the held-out split as float32 tensors and show the first (features, target) pair.
val_features = torch.FloatTensor(X_test)
val_targets = torch.FloatTensor(y_test)
v_dataset = TorchDataset(X=val_features, y=val_targets)
v_dataset[0]

In [None]:
# Training loader. shuffle=True re-orders the samples every epoch, so the
# mini-batches differ between epochs instead of being replayed in a fixed order.
tloader = DataLoader(t_dataset, batch_size=64, shuffle=True)
next(iter(tloader))

In [None]:
# Validation loader (fixed sample order); display its first batch.
vloader = DataLoader(v_dataset, batch_size=64)
first_val_batch = next(iter(vloader))
first_val_batch

In [None]:
class LSTM(torch.nn.Module):
    """Bidirectional LSTM regressor that treats each feature row as a length-1 sequence.

    NOTE: this class shares its name with the Keras ``LSTM`` layer imported at the
    top of the notebook. Once this cell has run, the bare name ``LSTM`` refers to
    this class, so the Keras model cell must be executed before it.

    Args:
        input_size: number of features per sample (default 17).
        hidden_size: number of LSTM hidden units per direction (default 512).
    """

    def __init__(self, input_size=17, hidden_size=512):
        super().__init__()
        self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                                  bidirectional=True, batch_first=True)
        # A bidirectional LSTM concatenates both directions: 2 * hidden_size outputs.
        self.x_dense = torch.nn.Linear(2 * hidden_size, 2 * hidden_size)

        self.relu = torch.nn.ReLU()

        self.fc = torch.nn.Linear(2 * hidden_size, 1)

    def forward(self, x):
        """Predict one value per sample.

        Args:
            x: tensor whose first axis is the batch and whose last axis holds the
                features, e.g. ``(batch, features)`` or ``(batch, 1, features)``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        # View every sample as a sequence of length 1: (batch, 1, features).
        x = x.reshape((x.shape[0], 1, x.shape[-1]))

        x, _ = self.lstm(x)
        # Keep the output of the last (and only) time step.
        x = x[:, -1, :]
        x = self.x_dense(x)
        x = self.relu(x)

        y = self.fc(x)
        # Drop only the trailing feature axis. A bare squeeze() would also remove
        # the batch axis for a batch of one, giving a 0-d tensor whose shape no
        # longer matches the (1,) target in the loss.
        y = y.squeeze(-1)

        return y

In [None]:
# Instantiate the PyTorch model (the `LSTM` module class defined in this notebook).
torch_model = LSTM()

In [None]:
# Smoke test: run the features of one training batch through the untrained model.
sample_batch = next(iter(tloader))
torch_model(sample_batch[0])

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Move the model to the target device, then create the loss and the optimizer.
torch_model = torch_model.to(device)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=torch_model.parameters(), lr=3e-4)

In [None]:
def train(dataloader):
    """Run one optimisation pass over ``dataloader`` and report the training loss.

    Uses the notebook-level ``torch_model``, ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        dataloader: iterable yielding ``(X, y)`` tensor batches.

    Returns:
        str: ``'Train MSE: <value>'``. The value is the loss averaged over samples:
        each batch loss is weighted by its batch size, so a smaller final batch
        does not skew the figure. The value is ``nan`` if no samples were seen.
    """
    loss_sum = 0.0
    n_seen = 0
    torch_model.train()
    for X, y in tqdm(dataloader):
        optimizer.zero_grad()
        X, y = X.to(device), y.to(device)
        pred = torch_model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        # loss_fn returns the batch mean; scale it back to a per-batch sum.
        batch_size = X.shape[0]
        loss_sum += loss.item() * batch_size
        n_seen += batch_size
    mean_loss = loss_sum / n_seen if n_seen else float('nan')
    return f'Train MSE: {mean_loss}'

def test(dataloader):
    """Evaluate the model on ``dataloader`` without updating its weights.

    Uses the notebook-level ``torch_model``, ``loss_fn`` and ``device``.

    Args:
        dataloader: iterable yielding ``(X, y)`` tensor batches.

    Returns:
        str: ``'Test MSE: <value>'``. The value is the loss averaged over samples:
        each batch loss is weighted by its batch size, so a smaller final batch
        does not skew the figure. The value is ``nan`` if no samples were seen.
    """
    loss_sum = 0.0
    n_seen = 0
    torch_model.eval()
    with torch.no_grad():
        for X, y in tqdm(dataloader):
            X, y = X.to(device), y.to(device)
            pred = torch_model(X)
            # loss_fn returns the batch mean; scale it back to a per-batch sum.
            batch_size = X.shape[0]
            loss_sum += loss_fn(pred, y).item() * batch_size
            n_seen += batch_size
    mean_loss = loss_sum / n_seen if n_seen else float('nan')
    return f'Test MSE: {mean_loss}'

In [None]:
# Train for `epochs` epochs. The report of each finished epoch is printed at the
# start of the next iteration (an empty line before the first one), and the last
# report is printed after the loop.
epochs = 1
text = ''

for epoch in range(epochs):
    print(text)
    train_report = train(tloader)
    test_report = test(vloader)
    text = f'Epoch {epoch+1}\n{train_report}\n{test_report}'

print(text)

In [None]:
# Move the model to the CPU and pickle the whole module object to disk.
# (`Module.to` moves the parameters in place and returns the module itself.)
torch.save(torch_model.to('cpu'), 'torch_model.pth')

In [None]:
# Reload the pickled module and prepare it for inference.
# The file holds a whole pickled nn.Module, not just a state dict, so
# weights_only=False is required: recent PyTorch releases default to
# weights_only=True and refuse to unpickle arbitrary objects.
# SECURITY: full unpickling can execute arbitrary code — only load checkpoints
# produced by this notebook, never untrusted files.
torch_model = torch.load('torch_model.pth', weights_only=False)
# eval() returns the module itself, so the cell still displays the model summary.
torch_model.to(device).eval()

## создание предикта

In [None]:
# Load the submission (test) set and preview the first rows instead of dumping the whole frame.
pred_df = pd.read_csv('test.csv')
pred_df.head()

In [None]:
# Parse the pickup timestamps and store them as timezone-naive values.
# `utc=True` guarantees a tz-aware intermediate result, so `tz_convert(None)`
# cannot raise a TypeError when the input strings carry no timezone suffix.
pred_df['pickup_datetime'] = pd.to_datetime(pred_df['pickup_datetime'], utc=True).dt.tz_convert(None)
pred_df.info()

In [None]:
# Geodesic distance (km) from each drop-off point to the same fixed reference
# point that is used for the training features.
moscow_center = (40.646746, -73.789962)
pred_dropoff_points = zip(pred_df['dropoff_latitude'], pred_df['dropoff_longitude'])
pred_df['distance_to_moscow_center'] = [geodesic(point, moscow_center).km for point in pred_dropoff_points]

In [None]:
# Assign each test drop-off to the clusters learned on the training data.
# Re-fitting K-means on the test set would produce different centroids (and
# possibly swapped labels), making the `cluster` feature inconsistent with the
# one the models were trained on. `kmeans` is the estimator fitted on `df`.
pred_df['cluster'] = kmeans.predict(pred_df[['dropoff_latitude', 'dropoff_longitude']])

In [None]:
# Sine/cosine encoding of the drop-off latitude and longitude (degrees -> radians first).
pred_lat_rad = np.radians(pred_df['dropoff_latitude'])
pred_lon_rad = np.radians(pred_df['dropoff_longitude'])
pred_df['lat_sin'], pred_df['lat_cos'] = np.sin(pred_lat_rad), np.cos(pred_lat_rad)
pred_df['lon_sin'], pred_df['lon_cos'] = np.sin(pred_lon_rad), np.cos(pred_lon_rad)

In [None]:
# Binary flag: 1 when the drop-off lies within `dist` km of the reference point, else 0.
dist = 1
pred_df['indicator'] = (pred_df['distance_to_moscow_center'] <= dist).astype('int64')

In [None]:
# Split the pickup timestamp into calendar components, then drop the raw column.
pred_pickup_ts = pred_df['pickup_datetime'].dt
for part in ('minute', 'hour', 'day', 'month', 'year'):
    pred_df[part] = getattr(pred_pickup_ts, part)
pred_df = pred_df.drop(columns=['pickup_datetime'])

In [None]:
# Remove the row identifier so only model features remain.
pred_df = pred_df.drop(columns=['key'])

In [None]:
# Preview the engineered test features (first rows only instead of dumping the whole frame).
pred_df.head()

Предикт для Keras делается так же, как для catboost.

In [None]:
# Predict fares for the submission set in mini-batches.
# Compared with one forward pass per row, batching removes the per-row Python
# overhead, and torch.no_grad() skips building autograd graphs during inference.
# The feature width comes from the frame itself instead of a hard-coded 17.
torch_model.eval()
pred_features = torch.as_tensor(pred_df.to_numpy(dtype=np.float32))
pred_batch_size = 1024

pred = []
with torch.no_grad():
    for start in tqdm(range(0, len(pred_features), pred_batch_size)):
        batch = pred_features[start:start + pred_batch_size].to(device)
        # reshape(-1) keeps the result 1-D even if the model returns a 0-d
        # tensor for a single-row batch.
        pred.extend(torch_model(batch).reshape(-1).cpu().tolist())

len(pred), pred[0]

In [None]:
# Load the submission template and preview the first rows instead of dumping the whole frame.
sample = pd.read_csv('sample_submission.csv')
sample.head()

In [None]:
# Fill the submission template with the predicted fares and preview the first rows.
sample['fare_amount'] = pred
sample.head()

In [None]:
# Write the submission file without the index column.
sample.to_csv('sub.csv', index=False)