In [15]:
# Landmark coordinates as (latitude, longitude) in decimal degrees.
#
# preprocess_data() finds landmarks by scanning module globals for two-float
# tuples that compare EQUAL to an entry of `popular_places`. Two constants with
# identical coordinates are therefore indistinguishable: disabling one in the
# list has no effect while its twin is still enabled. Keep every pair unique.
STATUE_OF_LIBERTY = (40.689224, -74.044582)
EMPIRE_STATE_BUILDING = (40.748431, -73.985545)
TIMES_SQUARE = (40.755836, -73.986393)
MADAME_TUSSAUDS_MUSEUM = (40.756766, -73.988119)
BROOKLYN_BRIDGE = (40.705685, -73.996442)
CENTRAL_PARK = (40.784382, -73.965565)
CENTRAL_STATION = (40.752667, -73.977362)
METROPOLITAN_MUSEUM_OF_ART = (40.779150, -73.963398)
CHINATOWN = (40.714732, -73.997125)
CONEY_ISLAND = (40.575887, -73.991741)
BRIGHTON_BEACH = (40.579228, -73.961183)
FIFTH_AVENUE_STREET = (40.755951, -73.979155)
PUBLIC_LIBRARY = (40.752546, -73.981543)
# NOTE(review): this longitude is well east of the other Manhattan landmarks;
# confirm it is the intended stretch of Broadway.
BROADWAY_STREET = (40.746683, -73.891437)
HIGH_LINE_PARK = (40.750481, -74.002921)
BRYANT_PARK = (40.753689, -73.983677)
CHRYSLER_BUILDING = (40.751449, -73.975379)
ROCKEFELLER_CENTER = (40.759238, -73.979701)
TOP_OF_THE_ROCK_OBSERVATION_DECK = (40.758972, -73.979437)
RADIO_CITY_MUSIC_HALL = (40.759851, -73.979589)
FLATIRON_BUILDING_IRON_BUILDING = (40.740989, -73.989589)
WALL_STREET = (40.706981, -74.008976)
SEPTEMBER_11_MEMORIAL = (40.711549, -74.013295)
# Was a copy of SEPTEMBER_11_MEMORIAL's coordinates; the museum is at Pier 86
# on the Hudson (West 46th Street).
INTREPID_MUSEUM_OF_SEA_AIR_AND_SPACE = (40.764527, -73.999608)
MOMA_MUSEUM_OF_MODERN_ART = (40.761084, -73.976844)
FRICK_ART_MUSEUM_COLLECTION = (40.771119, -73.967214)
METROPOLITAN_OPERA = (40.772645, -73.984583)
AMERICAN_MUSEUM_OF_NATURAL_HISTORY = (40.780647, -73.974564)
ST_PATRICKS_CATHEDRAL = (40.758407, -73.976216)
ST_JOHN_THE_THEOLOGIAN_CATHEDRAL = (40.803767, -73.961661)
# Was a copy of ST_JOHN_THE_THEOLOGIAN_CATHEDRAL's coordinates; the island is
# in Upper New York Bay, next to Liberty Island.
ELLIS_ISLAND = (40.699475, -74.039560)
OBSERVATION_DECK_VESSEL = (40.753795, -74.002231)
EDGE_OBSERVATION_DECK = (40.754185, -74.000957)
WORLD_TRADE_CENTER_ONE = (40.712826, -74.013257)
NEW_YORK_BOTANICAL_GARDEN = (40.863983, -73.881898)
BROOKLYN_MUSEUM = (40.670816, -73.963703)
CLOISTERS_MUSEUM = (40.864840, -73.931998)
WHITNEY_MUSEUM_OF_AMERICAN_ART = (40.739722, -74.008918)
STATEN_ISLAND_FERRY = (40.700965, -74.013028)

# Landmarks that get "near" features. Commented-out entries are deliberately
# disabled for this experiment; uncomment one to enable it.
popular_places = [
    STATUE_OF_LIBERTY,
    EMPIRE_STATE_BUILDING,
    TIMES_SQUARE,
    MADAME_TUSSAUDS_MUSEUM,
    BROOKLYN_BRIDGE,
    CENTRAL_PARK,
    CENTRAL_STATION,
    METROPOLITAN_MUSEUM_OF_ART,
    CHINATOWN,
    # CONEY_ISLAND,
    BRIGHTON_BEACH,
    FIFTH_AVENUE_STREET,
    PUBLIC_LIBRARY,
    BROADWAY_STREET,
    HIGH_LINE_PARK,
    BRYANT_PARK,
    # CHRYSLER_BUILDING,
    # ROCKEFELLER_CENTER,
    TOP_OF_THE_ROCK_OBSERVATION_DECK,
    # RADIO_CITY_MUSIC_HALL,
    # FLATIRON_BUILDING_IRON_BUILDING,
    WALL_STREET,
    SEPTEMBER_11_MEMORIAL,
    INTREPID_MUSEUM_OF_SEA_AIR_AND_SPACE,
    MOMA_MUSEUM_OF_MODERN_ART,
    FRICK_ART_MUSEUM_COLLECTION,
    METROPOLITAN_OPERA,
    AMERICAN_MUSEUM_OF_NATURAL_HISTORY,
    ST_PATRICKS_CATHEDRAL,
    ST_JOHN_THE_THEOLOGIAN_CATHEDRAL,
    # ELLIS_ISLAND,
    OBSERVATION_DECK_VESSEL,
    EDGE_OBSERVATION_DECK,
    WORLD_TRADE_CENTER_ONE,
    NEW_YORK_BOTANICAL_GARDEN,
    BROOKLYN_MUSEUM,
    CLOISTERS_MUSEUM,
    # WHITNEY_MUSEUM_OF_AMERICAN_ART,
    # STATEN_ISLAND_FERRY
]


import pandas as pd
import numpy as np

# Locations of the train / test CSV files.
train_path = 'data/new-york-city-taxi-fare-prediction/train.csv'
test_path = 'data/new-york-city-taxi-fare-prediction/test.csv'

# Load the data. The test file is read in full; only the first 3,000,000
# training rows are read, to keep the run time manageable.
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(train_path, nrows=3_000_000)

from geopy.distance import geodesic
from pandas.tseries.holiday import USFederalHolidayCalendar

def preprocess_data(df, places=None):
    """Clean trip coordinates and derive distance, proximity and time features.

    Progress messages are printed before each stage.

    Args:
        df: DataFrame with ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns and,
            optionally, ``pickup_datetime``.
        places: Optional mapping ``name -> (latitude, longitude)`` of landmarks
            for which ``pickup_near_<name>`` / ``dropoff_near_<name>`` flags are
            built. When omitted, landmarks are discovered as before: every
            module-level global that is a tuple of two floats and is contained
            in the global ``popular_places`` list.

    Returns:
        ``(df, features_popular_places)``: a filtered copy of the input with
        the new feature columns, and the list of landmark column names added
        (pickup column first, then dropoff column, per landmark).

    Note:
        Rows whose coordinates are missing or outside valid latitude/longitude
        ranges are dropped, so the result may have fewer rows than the input.
    """
    coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Drop rows with invalid coordinates.
    print('Удаляем некорректные координаты...')
    valid = (
        df['pickup_latitude'].between(-90, 90)
        & df['dropoff_latitude'].between(-90, 90)
        & df['pickup_longitude'].between(-180, 180)
        & df['dropoff_longitude'].between(-180, 180)
    )
    # .copy() makes the filtered frame independent of the caller's frame, so
    # the column assignments below never write into a slice of the input.
    df = df[valid].dropna(subset=coord_cols).copy()

    # Cast coordinates to float.
    print('Преобразуем координаты в float...')
    df[coord_cols] = df[coord_cols].astype(float)

    def haversine_distance(lat1, lon1, lat2, lon2):
        """Great-circle distance in km on a sphere of radius 6371 km (vectorised)."""
        R = 6371
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        return 2 * R * np.arcsin(np.sqrt(a))

    print('Haversine distance...')
    df['haversine_distance'] = haversine_distance(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )

    # Manhattan-style distance: sum of absolute degree differences scaled by
    # 111 (roughly km per degree of latitude). The same factor is applied to
    # longitude, so this is only a coarse proxy.
    print('Manhattan distance...')
    df['manhattan_distance'] = (
        (df['pickup_latitude'] - df['dropoff_latitude']).abs() +
        (df['pickup_longitude'] - df['dropoff_longitude']).abs()
    ) * 111

    def bearing(lat1, lon1, lat2, lon2):
        """Initial compass bearing from point 1 to point 2, in degrees (-180..180]."""
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        x = np.sin(dlon) * np.cos(lat2)
        y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
        return np.degrees(np.arctan2(x, y))

    # Direction-of-travel feature.
    print('Признак направления движения (bearing)...')
    df['bearing'] = bearing(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )

    # Airport and city-centre reference points.
    JFK = (40.6413, -73.7781)
    LGA = (40.7769, -73.8740)
    EWR = (40.6895, -74.1745)
    DOWNTOWN = (40.7580, -73.9855)

    def is_near_location(lat, lon, location, threshold=1):
        """Return a 0/1 int Series: is (lat, lon) within `threshold` km of `location`?

        Computed with the vectorised haversine formula over whole columns. A
        per-row geodesic call through ``DataFrame.apply`` is far too slow for
        millions of rows and dozens of landmarks. The spherical approximation
        differs from the ellipsoidal distance by a fraction of a percent, which
        shifts the 1 km boundary by a few metres at most.
        """
        distance_km = haversine_distance(lat, lon, location[0], location[1])
        return (distance_km < threshold).astype(int)

    def near_any_airport(lat, lon):
        """Return a 0/1 int Series: is (lat, lon) near JFK, LGA or EWR?"""
        return (
            is_near_location(lat, lon, JFK)
            | is_near_location(lat, lon, LGA)
            | is_near_location(lat, lon, EWR)
        )

    print("Признаки близости к популярным местам...")
    df['pickup_near_airport'] = near_any_airport(df['pickup_latitude'], df['pickup_longitude'])
    df['dropoff_near_airport'] = near_any_airport(df['dropoff_latitude'], df['dropoff_longitude'])
    df['pickup_near_downtown'] = is_near_location(df['pickup_latitude'], df['pickup_longitude'], DOWNTOWN)
    df['dropoff_near_downtown'] = is_near_location(df['dropoff_latitude'], df['dropoff_longitude'], DOWNTOWN)

    if places is None:
        # Legacy discovery: landmark constants are module-level globals whose
        # value appears in `popular_places`. Iterate over a snapshot of
        # globals(), and skip underscore names so that interpreter-managed
        # entries (e.g. IPython's `_` output cache) holding a coordinate tuple
        # do not become features.
        places = {
            name: value
            for name, value in list(globals().items())
            if not name.startswith('_')
            and isinstance(value, tuple)
            and len(value) == 2
            and all(isinstance(x, float) for x in value)
            and value in popular_places
        }

    features_popular_places = []
    for name, location in places.items():
        for side in ('pickup', 'dropoff'):
            column = f'{side}_near_{name}'
            df[column] = is_near_location(df[f'{side}_latitude'], df[f'{side}_longitude'], location)
            features_popular_places.append(column)

    # Time-based features (only when a 'pickup_datetime' column is present).
    if 'pickup_datetime' in df.columns:
        print("Извлекаем временные признаки...")
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
        pickup_dt = df['pickup_datetime']

        df['hour'] = pickup_dt.dt.hour
        df['day_of_week'] = pickup_dt.dt.weekday
        df['month'] = pickup_dt.dt.month

        df['minute'] = pickup_dt.dt.minute
        df['day_of_month'] = pickup_dt.dt.day
        df['year'] = pickup_dt.dt.year
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df['is_night_trip'] = ((df['hour'] >= 23) | (df['hour'] <= 5)).astype(int)
        # Cast to int so this flag has the same dtype as the other indicators.
        df['is_rush_hour'] = (
            ((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 16) & (df['hour'] <= 19))
        ).astype(int)
        # Maps Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
        df['season'] = df['month'] % 12 // 3 + 1

        holidays = USFederalHolidayCalendar().holidays(start='2009-01-01', end='2015-12-31')
        # Compare like with like: `holidays` is a timezone-naive DatetimeIndex
        # of midnights. Matching it against `datetime.date` objects (or against
        # timezone-aware timestamps) never succeeds and leaves the flag at 0.
        pickup_day = pickup_dt
        if pickup_day.dt.tz is not None:
            pickup_day = pickup_day.dt.tz_localize(None)
        df['is_holiday'] = pickup_day.dt.normalize().isin(holidays).astype(int)

    return df, features_popular_places


# Apply the same feature engineering to both splits. Only the landmark column
# list returned for the training split is used below.
train_data, features_popular_places = preprocess_data(train_data)
test_data, f = preprocess_data(test_data)

# Model inputs: the fixed engineered features followed by the landmark flags.
features = [
    'haversine_distance',
    'manhattan_distance',
    'bearing',
    'pickup_near_airport',
    'dropoff_near_airport',
    'pickup_near_downtown',
    'dropoff_near_downtown',
    'hour',
    'day_of_week',
    'month',
    'minute',
    'day_of_month',
    'year',
    'is_weekend',
    'is_night_trip',
    'is_rush_hour',
    'season',
]
features = [*features, *features_popular_places]

# Target and design matrix.
y = train_data['fare_amount']
X = train_data[features]


from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the CatBoost model.
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=1000, depth=7, learning_rate=0.1, loss_function='RMSE', verbose=200)
model.fit(X_train, y_train)

# Evaluate on the hold-out split.
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root of the MSE gives the same RMSE on every version.
print("RMSE:", float(np.sqrt(mean_squared_error(y_test, y_pred))))

# From here on X_test holds the competition test features, replacing the
# validation split above.
X_test = test_data[features]

# Predict fares for the test set.
test_data['fare_amount'] = model.predict(X_test)

# Save the result. The path is kept in one variable so the confirmation
# message always names the file that was actually written.
submission_path = "submission_10000.csv"
submission = test_data[['key', 'fare_amount']]
submission.to_csv(submission_path, index=False)

print(f"Предсказания сохранены в {submission_path}!")

Удаляем некорректные координаты...
Преобразуем координаты в float...
Haversine distance...
Manhattan distance...
Признак направления движения (bearing)...
Признаки близости к популярным местам...


: 

In [12]:
# Scratch check: joining two lists produces a new list with the elements of
# the second appended after those of the first.
a = ['123', '24']
b = ['34', '445']

print([*a, *b])

['123', '24', '34', '445']


In [8]:
# Landmark coordinates as (latitude, longitude) in decimal degrees.
#
# Landmarks are looked up by comparing global tuples for EQUALITY with the
# entries of `popular_places`, so two constants with identical coordinates
# cannot be told apart. Keep every pair unique.
STATUE_OF_LIBERTY = (40.689224, -74.044582)
EMPIRE_STATE_BUILDING = (40.748431, -73.985545)
TIMES_SQUARE = (40.755836, -73.986393)
MADAME_TUSSAUDS_MUSEUM = (40.756766, -73.988119)
BROOKLYN_BRIDGE = (40.705685, -73.996442)
CENTRAL_PARK = (40.784382, -73.965565)
CENTRAL_STATION = (40.752667, -73.977362)
METROPOLITAN_MUSEUM_OF_ART = (40.779150, -73.963398)
CHINATOWN = (40.714732, -73.997125)
CONEY_ISLAND = (40.575887, -73.991741)
BRIGHTON_BEACH = (40.579228, -73.961183)
FIFTH_AVENUE_STREET = (40.755951, -73.979155)
PUBLIC_LIBRARY = (40.752546, -73.981543)
# NOTE(review): this longitude is well east of the other Manhattan landmarks;
# confirm it is the intended stretch of Broadway.
BROADWAY_STREET = (40.746683, -73.891437)
HIGH_LINE_PARK = (40.750481, -74.002921)
BRYANT_PARK = (40.753689, -73.983677)
CHRYSLER_BUILDING = (40.751449, -73.975379)
ROCKEFELLER_CENTER = (40.759238, -73.979701)
TOP_OF_THE_ROCK_OBSERVATION_DECK = (40.758972, -73.979437)
RADIO_CITY_MUSIC_HALL = (40.759851, -73.979589)
FLATIRON_BUILDING_IRON_BUILDING = (40.740989, -73.989589)
WALL_STREET = (40.706981, -74.008976)
SEPTEMBER_11_MEMORIAL = (40.711549, -74.013295)
# Was a copy of SEPTEMBER_11_MEMORIAL's coordinates; the museum is at Pier 86
# on the Hudson (West 46th Street).
INTREPID_MUSEUM_OF_SEA_AIR_AND_SPACE = (40.764527, -73.999608)
MOMA_MUSEUM_OF_MODERN_ART = (40.761084, -73.976844)
FRICK_ART_MUSEUM_COLLECTION = (40.771119, -73.967214)
METROPOLITAN_OPERA = (40.772645, -73.984583)
AMERICAN_MUSEUM_OF_NATURAL_HISTORY = (40.780647, -73.974564)
ST_PATRICKS_CATHEDRAL = (40.758407, -73.976216)
ST_JOHN_THE_THEOLOGIAN_CATHEDRAL = (40.803767, -73.961661)
# Was a copy of ST_JOHN_THE_THEOLOGIAN_CATHEDRAL's coordinates; the island is
# in Upper New York Bay, next to Liberty Island.
ELLIS_ISLAND = (40.699475, -74.039560)
OBSERVATION_DECK_VESSEL = (40.753795, -74.002231)
EDGE_OBSERVATION_DECK = (40.754185, -74.000957)
WORLD_TRADE_CENTER_ONE = (40.712826, -74.013257)
NEW_YORK_BOTANICAL_GARDEN = (40.863983, -73.881898)
BROOKLYN_MUSEUM = (40.670816, -73.963703)
CLOISTERS_MUSEUM = (40.864840, -73.931998)
WHITNEY_MUSEUM_OF_AMERICAN_ART = (40.739722, -74.008918)
STATEN_ISLAND_FERRY = (40.700965, -74.013028)

# Every landmark defined above, in definition order.
popular_places = [
    STATUE_OF_LIBERTY, EMPIRE_STATE_BUILDING, TIMES_SQUARE, MADAME_TUSSAUDS_MUSEUM,
    BROOKLYN_BRIDGE, CENTRAL_PARK, CENTRAL_STATION, METROPOLITAN_MUSEUM_OF_ART,
    CHINATOWN, CONEY_ISLAND, BRIGHTON_BEACH, FIFTH_AVENUE_STREET,
    PUBLIC_LIBRARY, BROADWAY_STREET, HIGH_LINE_PARK, BRYANT_PARK,
    CHRYSLER_BUILDING, ROCKEFELLER_CENTER, TOP_OF_THE_ROCK_OBSERVATION_DECK,
    RADIO_CITY_MUSIC_HALL, FLATIRON_BUILDING_IRON_BUILDING, WALL_STREET,
    SEPTEMBER_11_MEMORIAL, INTREPID_MUSEUM_OF_SEA_AIR_AND_SPACE,
    MOMA_MUSEUM_OF_MODERN_ART, FRICK_ART_MUSEUM_COLLECTION, METROPOLITAN_OPERA,
    AMERICAN_MUSEUM_OF_NATURAL_HISTORY, ST_PATRICKS_CATHEDRAL,
    ST_JOHN_THE_THEOLOGIAN_CATHEDRAL, ELLIS_ISLAND, OBSERVATION_DECK_VESSEL,
    EDGE_OBSERVATION_DECK, WORLD_TRADE_CENTER_ONE, NEW_YORK_BOTANICAL_GARDEN,
    BROOKLYN_MUSEUM, CLOISTERS_MUSEUM, WHITNEY_MUSEUM_OF_AMERICAN_ART,
    STATEN_ISLAND_FERRY,
]

# Print the name of every module-level constant that is a (float, float) tuple
# listed in `popular_places`.
global_vars = globals()

# Iterate over a snapshot. At module level the loop variables `name` and
# `value` are themselves globals, so binding them while iterating the live
# dict raises "dictionary changed size during iteration" on a fresh kernel.
for name, value in list(global_vars.items()):
    if isinstance(value, tuple) and len(value) == 2 and all(isinstance(x, float) for x in value) and value in popular_places:
        print(name)

STATUE_OF_LIBERTY
EMPIRE_STATE_BUILDING
TIMES_SQUARE
MADAME_TUSSAUDS_MUSEUM
BROOKLYN_BRIDGE
CENTRAL_PARK
CENTRAL_STATION
METROPOLITAN_MUSEUM_OF_ART
CHINATOWN
CONEY_ISLAND
BRIGHTON_BEACH
FIFTH_AVENUE_STREET
PUBLIC_LIBRARY
BROADWAY_STREET
HIGH_LINE_PARK
BRYANT_PARK
CHRYSLER_BUILDING
ROCKEFELLER_CENTER
TOP_OF_THE_ROCK_OBSERVATION_DECK
RADIO_CITY_MUSIC_HALL
FLATIRON_BUILDING_IRON_BUILDING
WALL_STREET
SEPTEMBER_11_MEMORIAL
INTREPID_MUSEUM_OF_SEA_AIR_AND_SPACE
MOMA_MUSEUM_OF_MODERN_ART
FRICK_ART_MUSEUM_COLLECTION
METROPOLITAN_OPERA
AMERICAN_MUSEUM_OF_NATURAL_HISTORY
ST_PATRICKS_CATHEDRAL
ST_JOHN_THE_THEOLOGIAN_CATHEDRAL
ELLIS_ISLAND
OBSERVATION_DECK_VESSEL
EDGE_OBSERVATION_DECK
WORLD_TRADE_CENTER_ONE
NEW_YORK_BOTANICAL_GARDEN
BROOKLYN_MUSEUM
CLOISTERS_MUSEUM
WHITNEY_MUSEUM_OF_AMERICAN_ART
STATEN_ISLAND_FERRY
