In [32]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import os
from sklearn.metrics import roc_auc_score, roc_curve
#import xgboost
import seaborn as sns
from geopy.distance import vincenty
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [33]:
data_raw = pd.read_csv('taxi.csv')

In [34]:
data_raw.head(5)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,-1.0,-1.0,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,-1.0,-1.0,Economy,private,1
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1


**offer_gk** – unique offer identifier (INT)<br>
**weekday_key** – day of week number (Sunday = 0, Monday = 1, etc.) (INT)<br>
**hour_key** – hour of day representing an hour part of datetime (value from 0 to 23) (INT)<br>
**driver_gk** – unique driver identifier (INT)<br>
**order_gk** – unique order identifier (INT). Order may have multiple offers<br>
**driver_latitude** – latitude of driver at the time of getting an offer (FLOAT)<br> 
**driver_longitude** – longitude of driver at the time of receiving an offer (FLOAT)<br>
**origin_order_latitude** – latitude of the order start location at the time of receiving an offer (FLOAT)<br>
**origin_order_longitude** – longitude of the order start location at the moment of receiving an offer (FLOAT)<br>
**distance_km** – estimated distance from origin to destination in kilometres (FLOAT). Value -1 means that the destination is not set<br>
**duration_min** – estimated duration from origin to destination in minutes (FLOAT). Value -1 means that the destination is not set<br>
**offer_class_group** – class of the order, e.g. Economy, Business, XL (VARCHAR)<br>
**ride_type_desc** – private or business order attribute (VARCHAR)<br>
**driver_response** – driver choice of whether to accept the offer or not (VARCHAR)<br>

In [35]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
offer_gk                  100000 non-null int64
weekday_key               100000 non-null int64
hour_key                  100000 non-null int64
driver_gk                 100000 non-null int64
order_gk                  100000 non-null int64
driver_latitude           100000 non-null float64
driver_longitude          100000 non-null float64
origin_order_latitude     100000 non-null float64
origin_order_longitude    100000 non-null float64
distance_km               100000 non-null float64
duration_min              100000 non-null float64
offer_class_group         100000 non-null object
ride_type_desc            100000 non-null object
driver_response           100000 non-null int64
dtypes: float64(6), int64(6), object(2)
memory usage: 10.7+ MB


### preprocessing and feature engineering

#### get one hot enc

In [36]:
def get_one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        df = pd.concat([df, dummies], axis=1)
    return df

#### target encoding

In [37]:
def get_target_share(df_train, df_test, col, target_col):
    
    category_val_dict = (
        df_train[df_train[target_col] == 1][col].value_counts()
        /
        df_train[col].value_counts()
    ).to_dict()
    
    
    df_train[col + '_counts'] = df_train[col].apply(category_val_dict.get)
    df_test[col + '_counts'] = df_test[col].apply(category_val_dict.get)
    return df_train, df_test

#### coordinates

In [38]:
def get_woe_v1(df_train, df_test, col, target_col):
    all_good = len(df_train[df_train[target_col] == 1][col])
    all_bad = len(df_train[df_train[target_col] == 0][col])
    odds_series = (
        df_train[df_train[target_col] == 1][col].value_counts()
        /
        df_train[df_train[target_col] == 0][col].value_counts()
    )
    odds_series = odds_series / all_good * all_bad
    category_woe_dict = np.log(odds_series).to_dict()
    df_train[col + '_woe'] = df_train[col].apply(category_woe_dict.get)
    df_test[col + '_woe'] = df_test[col].apply(category_woe_dict.get)
    return df_train, df_test

In [39]:
def get_woe_stat(df_train, col, target_col):
    stat = df_train.groupby(col)[target_col].agg(
        [np.mean, np.size, np.count_nonzero], sort=False
    )
    stat.rename(columns={'count_nonzero': 'good', 'size': 'obs', 'mean': 'event_rate'}, inplace=True)
    stat['all_good'] = stat['good'].sum()
    stat['p_good'] = stat['good'] / stat['all_good']
    stat['bad'] = stat['obs'] - stat['good']
    stat['all_bad'] = stat['bad'].sum()
    stat['p_bad'] = stat['bad'] / stat['all_bad']
    stat['WOE'] = np.log(stat['p_good'] / stat['p_bad'])
    stat['IV'] = ((stat['p_good'] - stat['p_bad']) * stat['WOE']).sum()
    stat['variable'] = col
    stat['category'] = stat.index
    stat.reset_index(inplace=True)
    stat.drop(labels=col, inplace=True, axis=1)
    return stat

In [40]:
# create categories based on the part of the day
def process_hour(hour):
    if 6 <= hour < 12:
        return 0
    elif 12 <= hour < 18:
        return 1
    elif 18 <= hour < 24:
        return 2
    elif 0 <= hour < 6:
        return 3

In [41]:
# create "is weekend?" feature
def process_day(day):
    if day <= 4:
        return 0
    else:
        return 1

In [42]:
# get vincenty distance between two points
def compute_dist_in_km(coords1, coords2):
    return vincenty(coords1, coords2).kilometers

In [43]:
# calculate ride cost based on the plan
def compute_taxi_cost(row):
    if row.offer_class_group == "Economy":
        return 97 + 7 * row.distance_km + 7 * row.duration_min
    elif row.offer_class_group == "Standard":
        return 150 + 11 * row.distance_km + 13 * row.duration_min
    elif row.offer_class_group == "Premium":
        return 299 + 15 * row.distance_km + 20 * row.duration_min
    elif row.offer_class_group == "Kids":
        return 99 + 22 * row.duration_min
    elif row.offer_class_group == "VIP":
        return 699 + 55 * row.distance_km + 25 * row.duration_min
    elif row.offer_class_group == "XL":
        return 299 + 15 * row.distance_km + 25 * row.duration_min
    else:
        return -1

In [44]:
airports_coordinates = [
    ("Domodedovo", (55.415832, 37.896903), 3), # Domodedovo
    ("Sheremetyevo", (55.972323, 37.412804), 3), # Sheremetyevo
    ("Vnukovo", (55.598791, 37.269157), 3), # Vnukovo
    ("Zhukovskiy", (55.561805, 38.118139), 3), # Zhukovskiy
    ("Moscow_center", (55.753742, 37.621852), 5), # center
]

In [45]:
# calculate distance from driver to airport and from origin to airport
def compute_airport_driver_dist(data):
    for airport, (air_latitude, air_longitude), radius in airports_coordinates:
        print(airport)

        dist_column_name = 'driver_dist_to_{}'.format(airport)
        radius_column_name = 'driver_near_to_{}'.format(airport)
        data[dist_column_name] = data.apply(
            lambda row: compute_dist_in_km(
                (row['driver_latitude'], row['driver_longitude'], (air_latitude, air_longitude))
            ),
            axis=1
        )
        data[radius_column_name] = data[dist_column_name].apply(lambda x: 1 if x <= radius else 0)
        # print(radius, len(dummies_df[dummies_df[radius_column_name] == 1]))


        dist_column_name = 'origin_order_dist_to_{}'.format(airport)
        radius_column_name = 'origin_order_near_to_{}'.format(airport)
        data[dist_column_name] = data.apply(
            lambda row: compute_dist_in_km(
                (row['origin_order_latitude'], row['origin_order_longitude']), (air_latitude, air_longitude)
            ),
            axis=1
        )
        data[radius_column_name] = data[dist_column_name].apply(lambda x: 1 if x <= radius else 0)
        # print(radius, len(dummies_df[dummies_df[radius_column_name] == 1]))
    return data

In [46]:
def build_new_columns(data):
    #создание пары координат
    origin_coordindates = data[['origin_order_latitude', 'origin_order_longitude']
    ].apply(lambda x: str(np.round(x[0], 2)) + ' ' + str(np.round(x[1], 2)), axis=1)
    
    driver_coordinates = data[['driver_latitude', 'driver_longitude']
    ].apply(lambda x: str(np.round(x[0], 2)) + ' ' + str(np.round(x[1], 2)), axis=1)
    
    data['origin_coordinates'] = origin_coordindates
    data['driver_coordinates'] = driver_coordinates
    
    
    #get one hot
    data = get_one_hot(
        data,
        ['offer_class_group', 'ride_type_desc', 'weekday_key', 'hour_key']
    )
    
    
    #часть дня - часть недели
    data['day_part'] = data.apply(lambda row: process_hour(row.hour_key), axis=1)
    data['week_part'] = data.apply(lambda row: process_day(row.hour_key), axis=1)
    
    data['driver_response_by_day_and_part'] = data[
        ['weekday_key', 'hour_key']
    ].apply(lambda x: str(x[0]) + '_' + str(x[1]), axis=1)

    #расстрояние грубо
    data['driver_to_offer_dist'] = data.apply(lambda row: compute_dist_in_km(
            (row.driver_latitude, row.driver_longitude),
            (row.origin_order_latitude, row.origin_order_longitude)
        ), axis=1)
    
    #цена поездки
    data['taxi_cost'] = data.apply(lambda row: compute_taxi_cost(row), axis=1)
    
    #не забываем удалить лишнее - мы их уже закодировали
    data = data.drop(
        labels=['offer_class_group', 'ride_type_desc', 'weekday_key', 'hour_key'],
        axis=1
    )
    
    
    data['duration_min_rounded'] = data['duration_min'].apply(lambda x: np.round(x, 0))
    
    data['is_address_submitted'] = data['duration_min'].apply(lambda x: x != -1)
    
    order_dict = data.order_gk.value_counts().to_dict()
    data['amount_of_offers'] = data['order_gk'].apply(order_dict.get)
    
    #средняя скорость по оценке приложения
    data['traffic'] = data[['distance_km', 'duration_min']].apply(
        lambda x: x[0] * 1. / x[1] if (x[1] != -1) and (x[0] != -1) else -1,
        axis=1
    )
    
    return data

In [47]:
data_raw.head(5)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,-1.0,-1.0,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,-1.0,-1.0,Economy,private,1
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1


In [61]:
y = data_raw['driver_response'].copy()
data_train, data_test, y_train, y_test = train_test_split(data_raw, y, test_size=0.3, random_state=42)

##### problem

In [62]:
data_train = build_new_columns(data_train)
data_test = build_new_columns(data_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
  This is separate from the ipykernel package so we can avoid doing imports until


- get_woe_v1

In [51]:
#Преобразование категориальных
columns_to_get_counts = [
    'driver_gk',
    'origin_coordinates',
    'driver_coordinates',
    'driver_response_by_day_and_part',
    'duration_min_rounded',
]
    
for col_iter in columns_to_get_counts:
    data_train, data_test = get_woe_v1(data_train, data_test, col_iter, 'driver_response')

In [52]:
#Удаление лишних колонок
train_columns = [
    col for col in data_train.columns.tolist()
    if col not in set(
        [
            'offer_gk',
            'order_gk',
            'driver_response',
            'driver_gk',
            'origin_coordinates',
            'driver_coordinates',
            'driver_response_by_day_and_part',
#             'driver_response_by_day_and_part_counts',
            'duration_min_rounded',
        ]
    )
]
x_train = data_train[train_columns]
y_train = data_train['driver_response']
x_test = data_test[train_columns]
y_test = data_test['driver_response']

In [53]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(x_train)
x_train_imp=imp.transform(x_train)
x_test_imp=imp.transform(x_test)

#Создание модели
model = RandomForestClassifier(max_depth=5, n_jobs=-1)

#Обучение модели
model.fit(x_train_imp, y_train)

#Предсказание модели на трейне
y_train_predict = model.predict_proba(x_train_imp)[:, 1]
#Предсказание модели на тесте
y_test_predict = model.predict_proba(x_test_imp)[:, 1]

#Рассчет roc_auc_score для трейна
roc_auc_train = np.round(roc_auc_score(y_train, y_train_predict), 2)
#Рассчет roc_auc_score для теста
roc_auc_test = np.round(roc_auc_score(y_test, y_test_predict), 2)


print("Train: ", roc_auc_train)
print("Test: ", roc_auc_test)



Train:  0.9
Test:  0.89


- get_target_share

In [57]:
#Преобразование категориальных
columns_to_get_counts = [
    'driver_gk',
    'origin_coordinates',
    'driver_coordinates',
    'driver_response_by_day_and_part',
    'duration_min_rounded',
]
    
for col_iter in columns_to_get_counts:
    data_train, data_test = get_target_share(data_train, data_test, col_iter, 'driver_response')

In [58]:
#Удаление лишних колонок
train_columns = [
    col for col in data_train.columns.tolist()
    if col not in set(
        [
            'offer_gk',
            'order_gk',
            'driver_response',
            'driver_gk',
            'origin_coordinates',
            'driver_coordinates',
            'driver_response_by_day_and_part',
#             'driver_response_by_day_and_part_counts',
            'duration_min_rounded',
        ]
    )
]
x_train = data_train[train_columns]
y_train = data_train['driver_response']
x_test = data_test[train_columns]
y_test = data_test['driver_response']

In [59]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(x_train)
x_train_imp=imp.transform(x_train)
x_test_imp=imp.transform(x_test)

#Создание модели
model = RandomForestClassifier(max_depth=5, n_jobs=-1)

#Обучение модели
model.fit(x_train_imp, y_train)

#Предсказание модели на трейне
y_train_predict = model.predict_proba(x_train_imp)[:, 1]
#Предсказание модели на тесте
y_test_predict = model.predict_proba(x_test_imp)[:, 1]

#Рассчет roc_auc_score для трейна
roc_auc_train = np.round(roc_auc_score(y_train, y_train_predict), 2)
#Рассчет roc_auc_score для теста
roc_auc_test = np.round(roc_auc_score(y_test, y_test_predict), 2)


print("Train: ", roc_auc_train)
print("Test: ", roc_auc_test)



Train:  0.89
Test:  0.88


# Домашнее задание