In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import holidays
from collections import Counter

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from geopy.distance import geodesic # геодезическое расстояние между точками по поверхности Земли

from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

# просмотр данных

In [None]:
# Load the raw training data from the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Number of rows in the raw training frame.
df.shape[0]

In [None]:
# Work on the leading ~1% of the data to keep experiments fast.
# `.loc` slices by label and includes the end label, so the row labelled
# `last_label` is kept as well.
last_label = len(df) // 100
df = df.loc[:last_label]
df.shape[0]

In [None]:
# Display the subsampled frame for a visual check.
df

# первичная обработка данных

In [None]:
# Quick look at the target: (min, max, mean, median) of the fare.
fare = df['fare_amount']
fare.min(), fare.max(), fare.mean(), fare.median()

In [None]:
# Drop implausible fares: keep rows with 1 <= fare <= 10 * mean fare
# (`between` is inclusive on both ends).
fare_cap = df['fare_amount'].mean() * 10
df = df[df['fare_amount'].between(1, fare_cap)]

In [None]:
# Value ranges of drop-off longitude and pickup latitude, as (min, max) pairs.
drop_lon, pick_lat = df['dropoff_longitude'], df['pickup_latitude']
(drop_lon.min(), drop_lon.max()), (pick_lat.min(), pick_lat.max())

In [None]:
# Rows remaining after the fare filter.
df.shape[0]

In [None]:
# Drop rows whose drop-off coordinates are outside the valid geographic range:
# latitude must lie in [-90, 90] and longitude in [-180, 180]. (The previous
# check capped longitude at +/-90, which rejects valid longitudes.)
# Rows with a missing (NaN) drop-off coordinate fail the comparison and are
# removed as well, exactly as before.
df = df[
    df['dropoff_latitude'].between(-90, 90)
    & df['dropoff_longitude'].between(-180, 180)
]

In [None]:
# Drop rows whose pickup coordinates are outside the valid geographic range:
# latitude must lie in [-90, 90] and longitude in [-180, 180]. (The previous
# check capped longitude at +/-90, which rejects valid longitudes.)
# Rows with a missing (NaN) pickup coordinate fail the comparison and are
# removed as well, exactly as before.
df = df[
    df['pickup_latitude'].between(-90, 90)
    & df['pickup_longitude'].between(-180, 180)
]

In [None]:
# Rows remaining after the coordinate filters.
df.shape[0]

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count of missing values per column before imputation.
df.isnull().sum()

In [None]:
# Fill missing drop-off longitudes; the imputer is given this single column only.
# NOTE(review): the coordinate range filter earlier already drops rows whose
# drop-off coordinates are NaN, so there is probably nothing left to fill — confirm.
imp = IterativeImputer(random_state=42)
lon_column = df[['dropoff_longitude']].to_numpy()  # 2-D, shape (n_rows, 1)
df['dropoff_longitude'] = imp.fit_transform(lon_column)

In [None]:
# Fill missing drop-off latitudes; the imputer is given this single column only.
# NOTE(review): the coordinate range filter earlier already drops rows whose
# drop-off coordinates are NaN, so there is probably nothing left to fill — confirm.
imp = IterativeImputer(random_state=42)
lat_column = df[['dropoff_latitude']].to_numpy()  # 2-D, shape (n_rows, 1)
df['dropoff_latitude'] = imp.fit_transform(lat_column)

In [None]:
# Count of missing values per column after imputation.
df.isnull().sum()

# feature engineering

In [None]:
# Geodesic distance (km) from every drop-off point to a reference point.
# Despite its name, `moscow_center` holds the coordinates of a New York
# airport (per the original comment), not of a city centre.
moscow_center = (40.646746, -73.789962)
df['distance_to_airport'] = [
    geodesic((lat, lon), moscow_center).km
    for lat, lon in zip(df['dropoff_latitude'], df['dropoff_longitude'])
]

In [None]:
# K-means clustering of drop-off locations into 4 spatial clusters.
# random_state pins the centroid initialisation so the cluster ids are
# reproducible between runs; n_init is set explicitly because its default
# value differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(df[['dropoff_latitude', 'dropoff_longitude']])

In [None]:
# Sine/cosine encodings of the drop-off latitude and longitude (degrees -> radians).
# Columns are created in the order lat_sin, lat_cos, lon_sin, lon_cos.
for axis_name, column in (('lat', 'dropoff_latitude'), ('lon', 'dropoff_longitude')):
    angle = np.deg2rad(df[column])
    df[f'{axis_name}_sin'] = np.sin(angle)
    df[f'{axis_name}_cos'] = np.cos(angle)

In [None]:
# Binary flag: 1 when the drop-off lies within `dist` km of the airport, else 0.
dist = 1
df['airport_indicator'] = (df['distance_to_airport'] <= dist).astype('int64')

In [None]:
# Geodesic distance (km) between the pickup point and the drop-off point.
pickup_points = zip(df['pickup_latitude'], df['pickup_longitude'])
dropoff_points = zip(df['dropoff_latitude'], df['dropoff_longitude'])
df['distance'] = [
    geodesic(start, end).km
    for start, end in zip(pickup_points, dropoff_points)
]

In [None]:
# Convert pickup timestamps to New York local wall-clock time.
# utc=True makes the parsed values timezone-aware UTC (assumes the raw
# timestamps are UTC — the original tz_convert call already required
# timezone-aware input).
pickup_utc = pd.to_datetime(df['pickup_datetime'], utc=True)
# tz_localize(None) drops the timezone while keeping the local wall time.
# The previous tz_convert(None) converted back to UTC before dropping the
# timezone, which silently undid the New York conversion.
df['pickup_datetime'] = (
    pickup_utc.dt.tz_convert('America/New_York').dt.tz_localize(None)
)
df.info()

In [None]:
# Flag rides whose pickup date is a US public holiday.
us_holidays = holidays.US()

def is_holiday(date):
    """Return 1 if `date` is contained in `us_holidays`, otherwise 0."""
    return 1 if date in us_holidays else 0

pickup_dates = df['pickup_datetime'].dt.date
df['is_holiday'] = pickup_dates.apply(is_holiday)

In [None]:
# Calendar features taken from the pickup timestamp. `weekday` comes first,
# followed by the time/date components, preserving the column order.
pickup_ts = df['pickup_datetime'].dt
for part in ('weekday', 'minute', 'hour', 'day', 'month', 'year'):
    df[part] = getattr(pickup_ts, part)

In [None]:
# Calendar date of each pickup; together with `hour` it forms the grouping key.
df['date'] = df['pickup_datetime'].dt.date

# Number of rides in this frame for every (date, hour) slot.
hourly_load = (
    df.groupby(['date', 'hour'])
    .size()
    .rename('load')
    .reset_index()
)

# Attach the per-slot ride count to every ride.
df = df.merge(hourly_load, how='left', on=['date', 'hour'])

In [None]:
# Display the frame after feature engineering.
df

In [None]:
# Remove helper columns that are not used as features.
# Alternative kept for reference (would also drop the raw coordinates):
# df.drop(columns=['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',	'dropoff_latitude'], inplace=True)
helper_columns = ['pickup_datetime', 'date']
df = df.drop(columns=helper_columns)

In [None]:
# Optional experiment, currently disabled: also drop the hourly `load` feature.
# df.drop(columns=['load'], inplace=True)

In [None]:
# Display the frame after the helper columns were removed.
df

# подготовка данных для обучения

In [None]:
# Feature matrix: every column except the target and the row identifier.
non_feature_columns = ['fare_amount', 'key']
X = df.drop(columns=non_feature_columns)
X

In [None]:
# Target vector: the fare to be predicted.
y = df.loc[:, 'fare_amount']
y

In [None]:
# Bucket the target into 20 quantile bins (integer codes) so that the
# train/test split can be stratified on the fare level.
quantile_bins = pd.qcut(x=y, q=20, labels=False, precision=1)
quantile_bins.head(), quantile_bins.tail()

In [None]:
# 80/20 train/test split, stratified on the fare quantile bins.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=quantile_bins,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# создание модели

## base catboost

In [None]:
# Baseline CatBoost regressor configuration.
cat_base_params = {
    'iterations': 10000,
    'depth': None,
    'eval_metric': 'RMSE',    # quality metric for regression
    'random_seed': 42,        # seed for reproducible results
    'od_wait': 1000,
    'use_best_model': True,
    'verbose': 200,           # how often training progress is printed
    'task_type': 'CPU',
}
cat_base = CatBoostRegressor(**cat_base_params)

In [None]:
# Train the baseline model, tracking the metric on the hold-out set.
# NOTE(review): the model is created with use_best_model=True, and the
# hold-out set is passed as eval_set here, so the same data picks the final
# iteration and is later used for scoring — the reported metrics are likely
# optimistic; consider a separate validation split.
cat_base.fit(X=X_train, y=y_train, eval_set=(X_test, y_test))

In [None]:
# Baseline predictions for the hold-out set.
predictions = cat_base.predict(data=X_test)

In [None]:
# Hold-out metrics for the baseline model, shown as (RMSE, MSE, R^2).
# RMSE is derived from MSE with np.sqrt because the `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mse = mean_squared_error(y_test, predictions)
np.sqrt(mse), mse, r2_score(y_test, predictions)

## catboost ensemble

In [None]:
# Split the training data into two equal halves.
halves = train_test_split(X_train, y_train, test_size=0.5, random_state=42)
X_train_1, X_train_2, y_train_1, y_train_2 = halves

In [None]:
# Split the first half again, giving two quarters of the training data.
first_half_parts = train_test_split(X_train_1, y_train_1, test_size=0.5, random_state=42)
X_train_11, X_train_12, y_train_11, y_train_12 = first_half_parts

In [None]:
# Split the second half again, giving the remaining two quarters.
second_half_parts = train_test_split(X_train_2, y_train_2, test_size=0.5, random_state=42)
X_train_21, X_train_22, y_train_21, y_train_22 = second_half_parts

In [None]:
# Ensemble member 1: depth-8 CatBoost fitted on the first quarter of the training data.
model_1_params = dict(iterations=1000, depth=8, random_seed=42, verbose=0, task_type='GPU')
model_1 = CatBoostRegressor(**model_1_params)
model_1.fit(X_train_11, y_train_11)

In [None]:
# Ensemble member 2: depth-6 CatBoost fitted on the second quarter of the training data.
model_2_params = dict(iterations=1000, depth=6, random_seed=42, verbose=0, task_type='GPU')
model_2 = CatBoostRegressor(**model_2_params)
model_2.fit(X_train_12, y_train_12)

In [None]:
# Ensemble member 3: depth-10 CatBoost fitted on the third quarter of the training data.
model_3_params = dict(iterations=1000, depth=10, random_seed=42, verbose=0, task_type='GPU')
model_3 = CatBoostRegressor(**model_3_params)
model_3.fit(X_train_21, y_train_21)

In [None]:
# Ensemble member 4: depth-12 CatBoost fitted on the fourth quarter of the training data.
model_4_params = dict(iterations=1000, depth=12, random_seed=42, verbose=0, task_type='GPU')
model_4 = CatBoostRegressor(**model_4_params)
model_4.fit(X_train_22, y_train_22)

In [None]:
# Combine the four CatBoost regressors into one voting ensemble.
voting_members = [
    ('catboost_1', model_1),
    ('catboost_2', model_2),
    ('catboost_3', model_3),
    ('catboost_4', model_4),
]
# n_jobs=-1 is intentionally left disabled, as in the original cell.
ensemble_model_voting = VotingRegressor(estimators=voting_members)

# Train the ensemble.
# NOTE(review): VotingRegressor.fit fits clones of the listed estimators on the
# data passed here, so the earlier per-quarter fits of model_1..model_4 do not
# appear to be what the ensemble uses — confirm this is intended.
ensemble_model_voting.fit(X_train, y_train)

In [None]:
# Hold-out metrics for the voting ensemble, shown as (RMSE, MSE, R^2).
# RMSE is derived from MSE with np.sqrt because the `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
predictions = ensemble_model_voting.predict(X_test)
mse = mean_squared_error(y_test, predictions)
np.sqrt(mse), mse, r2_score(y_test, predictions)

# создание предикта

In [None]:
# Load the rows that need a fare prediction.
pred_df = pd.read_csv(filepath_or_buffer='test.csv')
pred_df

In [None]:
# Count of missing values per column in the prediction frame.
pred_df.isnull().sum()

In [None]:
# Geodesic distance (km) from every drop-off point to a reference point.
# Despite its name, `moscow_center` holds New York airport coordinates
# (the same values as in the training feature section).
moscow_center = (40.646746, -73.789962)
pred_df['distance_to_airport'] = [
    geodesic((lat, lon), moscow_center).km
    for lat, lon in zip(pred_df['dropoff_latitude'], pred_df['dropoff_longitude'])
]

In [None]:
# Assign every test drop-off point to one of the clusters learned on the
# training data (`kmeans` is the model fitted in the training feature section).
# Re-fitting K-means on the test set, as was done before, numbers the clusters
# independently of the training run, so the `cluster` feature would not carry
# the meaning the regressors were trained on.
pred_df['cluster'] = kmeans.predict(pred_df[['dropoff_latitude', 'dropoff_longitude']])

In [None]:
# Sine/cosine encodings of the drop-off latitude and longitude (degrees -> radians).
# Columns are created in the order lat_sin, lat_cos, lon_sin, lon_cos.
for axis_name, column in (('lat', 'dropoff_latitude'), ('lon', 'dropoff_longitude')):
    angle = np.deg2rad(pred_df[column])
    pred_df[f'{axis_name}_sin'] = np.sin(angle)
    pred_df[f'{axis_name}_cos'] = np.cos(angle)

In [None]:
# Binary flag: 1 when the drop-off lies within `dist` km of the airport, else 0.
dist = 1
pred_df['airport_indicator'] = (pred_df['distance_to_airport'] <= dist).astype('int64')

In [None]:
# Geodesic distance (km) between the pickup point and the drop-off point.
pickup_points = zip(pred_df['pickup_latitude'], pred_df['pickup_longitude'])
dropoff_points = zip(pred_df['dropoff_latitude'], pred_df['dropoff_longitude'])
pred_df['distance'] = [
    geodesic(start, end).km
    for start, end in zip(pickup_points, dropoff_points)
]

In [None]:
# Convert pickup timestamps to New York local wall-clock time.
# utc=True makes the parsed values timezone-aware UTC (assumes the raw
# timestamps are UTC — the original tz_convert call already required
# timezone-aware input).
pickup_utc = pd.to_datetime(pred_df['pickup_datetime'], utc=True)
# tz_localize(None) drops the timezone while keeping the local wall time.
# The previous tz_convert(None) converted back to UTC before dropping the
# timezone, which silently undid the New York conversion.
pred_df['pickup_datetime'] = (
    pickup_utc.dt.tz_convert('America/New_York').dt.tz_localize(None)
)
pred_df.info()

In [None]:
# Flag rides whose pickup date is a US public holiday (1) or not (0).
# Reuses the `us_holidays` calendar and the `is_holiday` helper already defined
# in the training feature section instead of re-declaring identical copies,
# so there is a single definition to maintain.
pred_df['is_holiday'] = pred_df['pickup_datetime'].dt.date.apply(is_holiday)

In [None]:
# Calendar features taken from the pickup timestamp. `weekday` comes first,
# followed by the time/date components, preserving the column order.
pickup_ts = pred_df['pickup_datetime'].dt
for part in ('weekday', 'minute', 'hour', 'day', 'month', 'year'):
    pred_df[part] = getattr(pickup_ts, part)

In [None]:
# Calendar date of each pickup; together with `hour` it forms the grouping key.
pred_df['date'] = pred_df['pickup_datetime'].dt.date

# Number of rides in this frame for every (date, hour) slot.
# NOTE(review): the count is taken within the prediction frame only, so its
# scale depends on how many rows this file has and may not match the `load`
# values seen during training — confirm this is acceptable.
hourly_load = (
    pred_df.groupby(['date', 'hour'])
    .size()
    .rename('load')
    .reset_index()
)

# Attach the per-slot ride count to every ride.
pred_df = pred_df.merge(hourly_load, how='left', on=['date', 'hour'])

In [None]:
# Remove helper columns that are not used as features.
# Alternative kept for reference (would also drop the raw coordinates):
# pred_df.drop(columns=['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',	'dropoff_latitude'], inplace=True)
helper_columns = ['pickup_datetime', 'date']
pred_df = pred_df.drop(columns=helper_columns)

In [None]:
# Display the prediction frame after feature engineering.
pred_df

In [None]:
# Predict fares for the test rows; the row identifier is not a feature.
pred_features = pred_df.drop(columns=['key'])
pred = ensemble_model_voting.predict(pred_features)

In [None]:
# Load the submission template.
sample = pd.read_csv(filepath_or_buffer='sample_submission.csv')
sample

In [None]:
# Put the predicted fares into the template (values are assigned by position).
sample = sample.assign(fare_amount=pred)
sample

In [None]:
# Write the submission file without the index column.
sample.to_csv(path_or_buf='sub.csv', index=False)