In [None]:
import pandas as pd
import numpy as np

# Locations of the input CSV files.
DATA_DIR = 'data/new-york-city-taxi-fare-prediction'
train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'
# weather_path = 'data/New York 2013-01-01 to 2015-06-30.csv'

# Load the data. Only the first 3,000,000 training rows are read to keep
# the run fast; the test file is read in full.
train_data = pd.read_csv(train_path, nrows=3_000_000)
test_data = pd.read_csv(test_path)
# weather_data = pd.read_csv(weather_path)


from geopy.distance import geodesic
from pandas.tseries.holiday import USFederalHolidayCalendar

def _haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres (haversine formula).

    All arguments are in degrees and may be scalars, NumPy arrays or pandas
    Series; they broadcast against each other.
    """
    earth_radius_km = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Clip guards against rounding pushing `a` marginally outside [0, 1],
    # which would make arcsin return NaN for (near-)antipodal points.
    return 2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0, 1)))


def _bearing_deg(lat1, lon1, lat2, lon2):
    """Return the initial bearing from point 1 to point 2 in degrees (-180..180)."""
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    x = np.sin(dlon) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
    return np.degrees(np.arctan2(x, y))


def _is_near_any(lat, lon, locations, threshold_km=1):
    """Return a 0/1 Series: 1 where (lat, lon) is within threshold_km of any location.

    `lat` and `lon` are Series in degrees; `locations` is an iterable of
    (latitude, longitude) tuples in degrees.
    """
    near = pd.Series(False, index=lat.index)
    for loc_lat, loc_lon in locations:
        near |= _haversine_km(lat, lon, loc_lat, loc_lon) < threshold_km
    return near.astype(int)


def preprocess_data(df):
    """Clean the coordinates of a taxi-trip frame and add engineered features.

    Rows whose pickup/dropoff coordinates are missing or outside the valid
    latitude (-90..90) / longitude (-180..180) ranges are dropped. The input
    frame is not modified; a new frame is returned.

    Added columns:
        haversine_distance, manhattan_distance, bearing,
        pickup_near_airport, dropoff_near_airport,
        pickup_near_downtown, dropoff_near_downtown,
    and, only when a 'pickup_datetime' column is present:
        hour, day_of_week, month, minute, day_of_month, year,
        is_weekend, is_night_trip, is_rush_hour, season, is_holiday.
    All flag columns are 0/1 integers.

    Args:
        df: DataFrame with 'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude' and 'dropoff_longitude' columns, and
            optionally 'pickup_datetime'.

    Returns:
        A new DataFrame containing the surviving rows and the added columns.
    """
    coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Drop rows with invalid coordinates.
    print("Удаляем некорректные координаты...")
    valid = (
        df['pickup_latitude'].between(-90, 90)
        & df['dropoff_latitude'].between(-90, 90)
        & df['pickup_longitude'].between(-180, 180)
        & df['dropoff_longitude'].between(-180, 180)
    )
    # .copy() makes the result an independent frame, so the column
    # assignments below neither raise SettingWithCopyWarning nor risk
    # writing into a view of the caller's data.
    df = df[valid].dropna(subset=coord_cols).copy()

    # Make sure the coordinates are floats.
    print("Преобразуем координаты в float...")
    df[coord_cols] = df[coord_cols].astype(float)

    pickup_lat, pickup_lon = df['pickup_latitude'], df['pickup_longitude']
    dropoff_lat, dropoff_lon = df['dropoff_latitude'], df['dropoff_longitude']

    print("Haversine distance...")
    df['haversine_distance'] = _haversine_km(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    # Manhattan distance: sum of absolute coordinate differences, scaled by
    # 111 (roughly the number of kilometres per degree of latitude).
    print("Manhattan distance...")
    df['manhattan_distance'] = (
        (pickup_lat - dropoff_lat).abs() + (pickup_lon - dropoff_lon).abs()
    ) * 111

    # Direction of travel.
    print("Признак направления движения (bearing)...")
    df['bearing'] = _bearing_deg(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    # Airport and city-centre reference points as (latitude, longitude).
    print("Координаты аэропортов и центра города...")
    JFK = (40.6413, -73.7781)
    LGA = (40.7769, -73.8740)
    EWR = (40.6895, -74.1745)
    DOWNTOWN = (40.7580, -73.9855)
    airports = (JFK, LGA, EWR)

    # Proximity flags use the vectorised great-circle distance instead of a
    # row-by-row geodesic computation: on millions of rows the per-row
    # Python calls dominate the runtime, while the two distance models
    # differ by well under 1% -- only points within a few metres of the
    # 1 km threshold could be classified differently.
    df['pickup_near_airport'] = _is_near_any(pickup_lat, pickup_lon, airports)
    df['dropoff_near_airport'] = _is_near_any(dropoff_lat, dropoff_lon, airports)
    df['pickup_near_downtown'] = _is_near_any(pickup_lat, pickup_lon, (DOWNTOWN,))
    df['dropoff_near_downtown'] = _is_near_any(dropoff_lat, dropoff_lon, (DOWNTOWN,))

    # Time features (only when a 'pickup_datetime' column is present).
    if 'pickup_datetime' in df.columns:
        print("Извлекаем временные признаки...")
        # Unparseable timestamps become NaT rather than raising.
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
        pickup_dt = df['pickup_datetime']
        df['hour'] = pickup_dt.dt.hour
        df['day_of_week'] = pickup_dt.dt.weekday
        df['month'] = pickup_dt.dt.month

        df['minute'] = pickup_dt.dt.minute
        df['day_of_month'] = pickup_dt.dt.day
        df['year'] = pickup_dt.dt.year
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df['is_night_trip'] = ((df['hour'] >= 23) | (df['hour'] <= 5)).astype(int)
        # Cast to int so this flag has the same dtype as the other flags.
        df['is_rush_hour'] = (
            df['hour'].between(7, 9) | df['hour'].between(16, 19)
        ).astype(int)
        # Maps Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
        df['season'] = df['month'] % 12 // 3 + 1

        holidays = USFederalHolidayCalendar().holidays(start='2009-01-01', end='2015-12-31')
        # Compare midnight-normalised, tz-naive timestamps with the holiday
        # index. Comparing `.dt.date` objects (datetime.date) against the
        # Timestamp entries of the index never matches, which left the flag
        # at 0 for every row.
        pickup_day = pickup_dt
        if pickup_day.dt.tz is not None:
            # Strip the timezone while keeping the wall-clock date.
            pickup_day = pickup_day.dt.tz_localize(None)
        df['is_holiday'] = pickup_day.dt.normalize().isin(holidays).astype(int)

    return df


# Run the same cleaning / feature-engineering step on both data sets.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Model inputs, grouped by origin: trip geometry, proximity flags and
# time-derived columns. The concatenation order defines the column order
# of the feature matrix.
geo_features = ['haversine_distance', 'manhattan_distance', 'bearing']
proximity_features = [
    'pickup_near_airport', 'dropoff_near_airport',
    'pickup_near_downtown', 'dropoff_near_downtown',
]
time_features = [
    'hour', 'day_of_week', 'month', 'minute', 'day_of_month',
    'year', 'is_weekend', 'is_night_trip', 'is_rush_hour', 'season',
]
features = geo_features + proximity_features + time_features

# Feature matrix and regression target for training.
y = train_data['fare_amount']
X = train_data[features]


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# Hold out 20% of the training rows for evaluation (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the CatBoost model.
model = CatBoostRegressor(iterations=1000, depth=7, learning_rate=0.1, loss_function='RMSE', verbose=200)
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows.
y_pred = model.predict(X_test)

# Take the square root of the MSE explicitly: the `squared=False` argument
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE:", float(np.sqrt(mean_squared_error(y_test, y_pred))))

# From here on X_test holds the features of the competition test set
# (it no longer refers to the held-out split above).
X_test = test_data[features]

# Predict fares for the test set.
test_data['fare_amount'] = model.predict(X_test)

# Save the result.
submission_path = "submission_04_3000000.csv"
submission = test_data[['key', 'fare_amount']]
submission.to_csv(submission_path, index=False)

# Report the file that was actually written.
print(f"Предсказания сохранены в {submission_path}!")


Удаляем некорректные координаты...
Преобразуем координаты в float...
Haversine distance...
Manhattan distance...
Признак направления движения (bearing)...
Координаты аэропортов и центра города...
Извлекаем временные признаки...
Удаляем некорректные координаты...
Преобразуем координаты в float...
Haversine distance...
Manhattan distance...
Признак направления движения (bearing)...
Координаты аэропортов и центра города...
Извлекаем временные признаки...
0:	learn: 9.1675983	total: 217ms	remaining: 3m 36s
200:	learn: 4.5843924	total: 27.9s	remaining: 1m 51s
400:	learn: 4.5072497	total: 55.8s	remaining: 1m 23s
600:	learn: 4.4529324	total: 1m 23s	remaining: 55.7s
800:	learn: 4.4167017	total: 1m 52s	remaining: 28s
999:	learn: 4.3862734	total: 2m 20s	remaining: 0us
RMSE: 4.6548743395055245
Предсказания сохранены в submission.csv!


