In [60]:
import pandas as pd
import numpy as np

train_path = 'data/new-york-city-taxi-fare-prediction/train.csv'
test_path = 'data/new-york-city-taxi-fare-prediction/test.csv'
weather_path = 'data/New York 2013-01-01 to 2015-06-30.csv'

# Загрузка данных
train_data = pd.read_csv(train_path, nrows=100000)  # Ограничиваем размер выборки для скорости
test_data = pd.read_csv(test_path)
weather_data = pd.read_csv(weather_path)

In [56]:
# Преобразуем в datetime, если это еще не сделано
train_data['pickup_datetime'] = pd.to_datetime(train_data['pickup_datetime'])
test_data['pickup_datetime'] = pd.to_datetime(test_data['pickup_datetime'])

# Находим минимальную и максимальную дату
min_date = train_data['pickup_datetime'].min()
max_date = train_data['pickup_datetime'].max()

test_min_date = test_data['pickup_datetime'].min()
test_max_date = test_data['pickup_datetime'].max()

print("train")
print(f"Самая ранняя дата: {min_date}")
print(f"Самая поздняя дата: {max_date}")
print("test")
print(f"Самая ранняя дата: {test_min_date}")
print(f"Самая поздняя дата: {test_max_date}")

weather_data.head(5)

train
Самая ранняя дата: 2009-01-01 00:41:00+00:00
Самая поздняя дата: 2015-06-30 22:54:07+00:00
test
Самая ранняя дата: 2009-01-01 11:04:24+00:00
Самая поздняя дата: 2015-06-30 20:03:50+00:00


Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,New York,2013-01-01,4.2,-2.5,2.7,0.9,-10.3,-1.3,-5.3,55.8,...,2.2,1,,2013-01-01T07:20:12,2013-01-01T16:39:31,0.65,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."
1,New York,2013-01-02,0.4,-5.2,-2.4,-4.3,-10.9,-7.0,-11.6,49.6,...,8.2,4,,2013-01-02T07:20:17,2013-01-02T16:40:22,0.68,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."
2,New York,2013-01-03,0.3,-3.8,-1.7,-1.9,-7.2,-5.5,-9.7,55.0,...,7.4,3,,2013-01-03T07:20:19,2013-01-03T16:41:16,0.72,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."
3,New York,2013-01-04,3.0,-0.6,1.3,-1.5,-4.8,-3.1,-7.1,53.8,...,9.5,5,,2013-01-04T07:20:20,2013-01-04T16:42:10,0.75,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."
4,New York,2013-01-05,6.0,0.2,2.7,4.3,-4.9,-0.5,-7.2,48.5,...,9.8,5,,2013-01-05T07:20:18,2013-01-05T16:43:06,0.79,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."


In [57]:
def preprocess_data_merge_weather(df):
    # Преобразуем datetime в формат datetime
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])
    
    # Объединяем данные о поездках с данными о погоде по дате
    df_weather_data = pd.merge(
        df,
        weather_data,
        left_on=df['pickup_datetime'].dt.date,
        right_on=weather_data['datetime'].dt.date,
        how='left'
    )
    
    # Заполнение пропусков средними значениями
    weather_columns = ['temp', 'humidity', 'precip', 'windspeed']
    for col in weather_columns:
        df_weather_data[col].fillna(df_weather_data[col].mean(), inplace=True)
        
    # Добавление бинарного признака наличия данных о погоде
    df_weather_data['weather_data_available'] = df_weather_data['temp'].notna().astype(int)
    
    # Создание признаков времени суток и дня недели
    df_weather_data['hour'] = df_weather_data['pickup_datetime'].dt.hour
    df_weather_data['day_of_week'] = df_weather_data['pickup_datetime'].dt.dayofweek

    return df_weather_data


train_data = preprocess_data_merge_weather(train_data)
test_data = preprocess_data_merge_weather(test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_weather_data[col].fillna(df_weather_data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_weather_data[col].fillna(df_weather_data[col].mean(), inplace=True)


In [61]:
from geopy.distance import geodesic
from pandas.tseries.holiday import USFederalHolidayCalendar

def preprocess_data(df):
    # Удаляем некорректные координаты
    df = df[
        (df['pickup_latitude'].between(-90, 90)) & 
        (df['dropoff_latitude'].between(-90, 90)) & 
        (df['pickup_longitude'].between(-180, 180)) & 
        (df['dropoff_longitude'].between(-180, 180))
    ].dropna(subset=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'])

    # Преобразуем координаты в float
    df[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']] = \
        df[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']].astype(float)

    # Функция Haversine distance
    def haversine_distance(lat1, lon1, lat2, lon2):
        R = 6371
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        return 2 * R * np.arcsin(np.sqrt(a))

    df['haversine_distance'] = haversine_distance(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )

    # Manhattan distance
    df['manhattan_distance'] = (
        abs(df['pickup_latitude'] - df['dropoff_latitude']) +
        abs(df['pickup_longitude'] - df['dropoff_longitude'])
    ) * 111

    # Признак направления движения (bearing)
    def bearing(lat1, lon1, lat2, lon2):
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        x = np.sin(dlon) * np.cos(lat2)
        y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
        return np.degrees(np.arctan2(x, y))

    df['bearing'] = bearing(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )

    # Координаты аэропортов и центра города
    JFK = (40.6413, -73.7781)
    LGA = (40.7769, -73.8740)
    EWR = (40.6895, -74.1745)
    DOWNTOWN = (40.7580, -73.9855)

    def is_near_location(lat, lon, location, threshold=1):
        return int(geodesic((lat, lon), location).km < threshold)

    df['pickup_near_airport'] = df.apply(
        lambda row: is_near_location(row['pickup_latitude'], row['pickup_longitude'], JFK) or
                    is_near_location(row['pickup_latitude'], row['pickup_longitude'], LGA) or
                    is_near_location(row['pickup_latitude'], row['pickup_longitude'], EWR), axis=1
    )

    df['dropoff_near_airport'] = df.apply(
        lambda row: is_near_location(row['dropoff_latitude'], row['dropoff_longitude'], JFK) or
                    is_near_location(row['dropoff_latitude'], row['dropoff_longitude'], LGA) or
                    is_near_location(row['dropoff_latitude'], row['dropoff_longitude'], EWR), axis=1
    )

    df['pickup_near_downtown'] = df.apply(
        lambda row: is_near_location(row['pickup_latitude'], row['pickup_longitude'], DOWNTOWN), axis=1
    )

    df['dropoff_near_downtown'] = df.apply(
        lambda row: is_near_location(row['dropoff_latitude'], row['dropoff_longitude'], DOWNTOWN), axis=1
    )

    # Извлекаем временные признаки (если есть столбец 'pickup_datetime')
    if 'pickup_datetime' in df.columns:
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
        df['hour'] = df['pickup_datetime'].dt.hour
        df['day_of_week'] = df['pickup_datetime'].dt.weekday
        df['month'] = df['pickup_datetime'].dt.month
        
        df['minute'] = df['pickup_datetime'].dt.minute
        df['day_of_month'] = df['pickup_datetime'].dt.day
        df['year'] = df['pickup_datetime'].dt.year
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df['is_night_trip'] = ((df['hour'] >= 23) | (df['hour'] <= 5)).astype(int)
        df['is_rush_hour'] = ((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 16) & (df['hour'] <= 19))
        df['season'] = df['month'] % 12 // 3 + 1
        
        holidays = USFederalHolidayCalendar().holidays(start='2009-01-01', end='2015-12-31')
        df['is_holiday'] = df['pickup_datetime'].dt.date.isin(holidays).astype(int)

    return df


train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [59]:
# Преобразуем время в формат количества секунд с начала эпохи или как разницу между временем и базовой датой (например, 2000-01-01)
train_data['sunrise'] = pd.to_datetime(train_data['sunrise'], errors='coerce')
train_data['sunset'] = pd.to_datetime(train_data['sunset'], errors='coerce')

test_data['sunrise'] = pd.to_datetime(test_data['sunrise'], errors='coerce')
test_data['sunset'] = pd.to_datetime(test_data['sunset'], errors='coerce')

# Преобразуем в количество секунд с начала эпохи или от базовой даты (например, 2000-01-01)
train_data['sunrise_seconds'] = (train_data['sunrise'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)
train_data['sunset_seconds'] = (train_data['sunset'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)

test_data['sunrise_seconds'] = (test_data['sunrise'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)
test_data['sunset_seconds'] = (test_data['sunset'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)

# Убедимся, что у нас больше нет строковых значений в данных, где ожидаются числовые значения
train_data = train_data.drop(columns=['sunrise', 'sunset'])
test_data = test_data.drop(columns=['sunrise', 'sunset'])

# Теперь в features должны быть указаны правильные столбцы
features = [
    # Погодные признаки
    'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin', 'feelslike', 
    'dew', 'humidity', 'precip', 'precipprob', 'precipcover', 'preciptype', 
    'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir', 'sealevelpressure', 
    'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex', 
    'severerisk', 'sunrise_seconds', 'sunset_seconds',
    
    # Прочие признаки
    'haversine_distance', 'manhattan_distance', 'bearing', 'pickup_near_airport', 
    'dropoff_near_airport', 'pickup_near_downtown', 'dropoff_near_downtown', 
    'hour', 'day_of_week', 'month', 'minute', 'day_of_month', 'year', 
    'is_weekend', 'is_night_trip', 'is_rush_hour', 'season'
]

# Подготовка данных для обучения
X = train_data[features]
y = train_data['fare_amount']

# Разделяем данные на тренировочные и тестовые
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучаем модель CatBoost, передавая индексы категориальных признаков
from catboost import CatBoostRegressor

categorical_features = ['preciptype', 'severerisk', 'pickup_near_airport', 'dropoff_near_airport', 'pickup_near_downtown', 'dropoff_near_downtown', 'is_weekend', 'is_night_trip', 'is_rush_hour', 'season']

# Заполняем пропущенные значения в категориальных признаках строкой "unknown"
# for col in categorical_features:
#     train_data[col] = train_data[col].fillna("unknown")
#     test_data[col] = test_data[col].fillna("unknown")

for col in categorical_features:
    # Заменяем NaN значениями в категориальных признаках на 'unknown'
    train_data[col] = train_data[col].fillna("unknown")
    test_data[col] = test_data[col].fillna("unknown")

# Убедитесь, что категориальные признаки в данных имеют тип "category"
for col in categorical_features:
    train_data[col] = train_data[col].astype('category')
    test_data[col] = test_data[col].astype('category')


# Заполняем пропущенные значения в числовых признаках (например, на медиану или среднее)
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns.drop('fare_amount')
train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].median())
test_data[numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].median())


model = CatBoostRegressor(iterations=1000, depth=7, learning_rate=0.1, loss_function='RMSE', verbose=200, cat_features=categorical_features)
model.fit(X_train, y_train, cat_features=[X.columns.get_loc(col) for col in categorical_features])

# Оцениваем модель
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))

# Применяем модель к тестовым данным
X_test = test_data[features]
test_data['fare_amount'] = model.predict(X_test)

# Сохраняем предсказания
submission = test_data[['key', 'fare_amount']]
submission.to_csv("submission.csv", index=False)

print("Предсказания сохранены в submission.csv!")


CatBoostError: categorical features in the model are set to ['preciptype', 'severerisk', 'pickup_near_airport', 'dropoff_near_airport', 'pickup_near_downtown', 'dropoff_near_downtown', 'is_weekend', 'is_night_trip', 'is_rush_hour', 'season']. categorical features passed to fit function are set to [11, 23, 29, 30, 31, 32, 39, 40, 41, 42]

In [32]:
features = [
        # Погодные признаки
    'tempmax',              # Максимальная температура
    'tempmin',              # Минимальная температура
    'temp',                 # Средняя температура
    'feelslikemax',         # Максимальная температура по ощущениям
    'feelslikemin',         # Минимальная температура по ощущениям
    'feelslike',            # Средняя температура по ощущениям
    'dew',                  # Точка росы
    'humidity',             # Влажность воздуха
    'precip',               # Количество осадков
    'precipprob',           # Вероятность осадков
    'precipcover',          # Покрытие осадками
    'preciptype',           # Тип осадков (дождь, снег и т.д.)
    'snow',                 # Количество снега
    'snowdepth',            # Глубина снега
    'windgust',             # Порывы ветра
    'windspeed',            # Скорость ветра
    'winddir',              # Направление ветра
    'sealevelpressure',     # Атмосферное давление
    'cloudcover',           # Покрытие облаками
    'visibility',           # Видимость
    'solarradiation',       # Солнечное излучение
    'solarenergy',          # Солнечная энергия
    'uvindex',              # Индекс ультрафиолетового излучения
    'severerisk',           # Оценка риска катастроф
    'sunrise',              # Время восхода солнца
    'sunset',               # Время заката солнца
    'moonphase',            # Фаза Луны

    
    'haversine_distance', 'manhattan_distance', 'bearing',
    'pickup_near_airport', 'dropoff_near_airport',
    'pickup_near_downtown', 'dropoff_near_downtown',
    'hour', 'day_of_week', 'month', 'minute', 'day_of_month', 
    'year', 'is_weekend', 'is_night_trip', 'is_rush_hour', 'season'
]



from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Задаем категориальные признаки (например, 'preciptype')
categorical_features = [
    'preciptype',  # Тип осадков (дождь, снег и т.д.)
    'season'       # Сезон (например, весна, лето, осень, зима)
]

for col in categorical_features:
    train_data[col] = train_data[col].fillna('missing')
    test_data[col] = test_data[col].fillna('missing')


# Преобразуем время в формат количества секунд с начала эпохи или как разницу между временем и базовой датой
train_data['sunrise'] = pd.to_datetime(train_data['sunrise'], errors='coerce')
train_data['sunset'] = pd.to_datetime(train_data['sunset'], errors='coerce')

test_data['sunrise'] = pd.to_datetime(test_data['sunrise'], errors='coerce')
test_data['sunset'] = pd.to_datetime(test_data['sunset'], errors='coerce')

# Преобразуем в количество секунд с начала эпохи или от базовой даты (например, 2000-01-01)
train_data['sunrise_seconds'] = (train_data['sunrise'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)
train_data['sunset_seconds'] = (train_data['sunset'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)

test_data['sunrise_seconds'] = (test_data['sunrise'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)
test_data['sunset_seconds'] = (test_data['sunset'] - pd.Timestamp("2000-01-01")) // pd.Timedelta(seconds=1)

# Убедимся, что у нас больше нет строковых значений в данных, где ожидаются числовые значения
train_data = train_data.drop(columns=['sunrise', 'sunset'])
test_data = test_data.drop(columns=['sunrise', 'sunset'])

# Добавляем эти новые признаки в список признаков
features.append('sunrise_seconds')
features.append('sunset_seconds')

# Подготовка данных для обучения
X = train_data[features]
y = train_data['fare_amount']

# Разделяем данные на тренировочные и тестовые
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучаем модель CatBoost, передавая индексы категориальных признаков
model = CatBoostRegressor(iterations=1000, depth=7, learning_rate=0.1, loss_function='RMSE', verbose=200)
model.fit(X_train, y_train, cat_features=[X.columns.get_loc(col) for col in categorical_features])

# Оцениваем модель
y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))

# Применяем модель к тестовым данным
X_test = test_data[features]
test_data['fare_amount'] = model.predict(X_test)

# Сохраняем предсказания
submission = test_data[['key', 'fare_amount']]
submission.to_csv("submission.csv", index=False)

print("Предсказания сохранены в submission.csv!")


KeyError: "['sunrise', 'sunset'] not in index"

In [62]:
features = [
    'haversine_distance', 'manhattan_distance', 'bearing',
    'pickup_near_airport', 'dropoff_near_airport',
    'pickup_near_downtown', 'dropoff_near_downtown',
    'hour', 'day_of_week', 'month', 'minute', 'day_of_month', 
    'year', 'is_weekend', 'is_night_trip', 'is_rush_hour', 'season'
]

X = train_data[features]
y = train_data['fare_amount']


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучаем модель CatBoost
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=1000, depth=7, learning_rate=0.1, loss_function='RMSE', verbose=200)
model.fit(X_train, y_train)

# Оцениваем модель
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))

X_test = test_data[features]

# Делаем предсказание
test_data['fare_amount'] = model.predict(X_test)

# Сохраняем результат
submission = test_data[['key', 'fare_amount']]
submission.to_csv("submission.csv", index=False)

print("Предсказания сохранены в submission.csv!")

0:	learn: 9.0954648	total: 21.5ms	remaining: 21.4s
200:	learn: 4.2027676	total: 1.39s	remaining: 5.54s
400:	learn: 3.8952039	total: 2.64s	remaining: 3.94s
600:	learn: 3.6390415	total: 3.79s	remaining: 2.52s
800:	learn: 3.4620147	total: 4.99s	remaining: 1.24s
999:	learn: 3.3245413	total: 6.15s	remaining: 0us
RMSE: 4.526683273634654
Предсказания сохранены в submission.csv!




In [16]:
'trip_duration' in train_data.columns

False