In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [5]:
# Location of the training CSV, relative to the notebook's working directory.
file_path = './input/train.csv'

# Load the training set, using the first CSV column as the row index.
home_data = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Preview the first five rows.
print(home_data.head(n=5))

           vendor_id      pickup_datetime     dropoff_datetime  \
id                                                               
id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   

           passenger_count  pickup_longitude  pickup_latitude  \
id                                                              
id2875421                1        -73.982155        40.767937   
id2377394                1        -73.980415        40.738564   
id3858529                1        -73.979027        40.763939   
id3504673                1        -74.010040        40.719971   
id2181028                1        -73.973053        40.793209   

           dropoff_longitude  dropoff_latitude store_and_fwd_flag  \
id          

In [112]:
# Disabled helper, kept for reference: number of seconds between paired
# start/end timestamp strings.
# NOTE(review): time.mktime interprets the parsed time as local time, so the
# difference could be skewed across a DST change — confirm before re-enabling.
# import time
# def get_during_time_list(start_time_list, end_time_list):
#     during_time_list = []
    
#     for start_str, end_str in zip(start_time_list, end_time_list):
#         # Convert the "%Y-%m-%d %H:%M:%S" strings to epoch timestamps
#         start_stamp = time.mktime(time.strptime(start_str, "%Y-%m-%d %H:%M:%S"))
#         end_stamp = time.mktime(time.strptime(end_str, "%Y-%m-%d %H:%M:%S"))
#         during_time_list.append(end_stamp - start_stamp)
        
#     return during_time_list

In [113]:
# Data preprocessing
def process_data(data):
    """Add the engineered model features to a trip DataFrame.

    The frame is modified in place (the notebook calls this function without
    using the return value) and is also returned for convenience.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude``, ``dropoff_longitude`` (converted from degrees to
        radians below) and ``store_and_fwd_flag`` holding ``'N'`` / ``'Y'``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data`` with a new ``haversine_distance`` column
        and ``store_and_fwd_flag`` recoded to the integers 0 / 1. Flag values
        other than ``'N'`` / ``'Y'`` become NaN.
    """
    # Sphere radius used by the haversine formula; 6371 gives kilometres.
    earth_radius_km = 6371

    pickup_lat = np.radians(data['pickup_latitude'])
    dropoff_lat = np.radians(data['dropoff_latitude'])
    half_dlat = np.radians(data['dropoff_latitude'] - data['pickup_latitude']) / 2
    half_dlon = np.radians(data['dropoff_longitude'] - data['pickup_longitude']) / 2

    # Haversine term; mathematically within [0, 1].
    haversine = (
        np.sin(half_dlat) ** 2
        + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(half_dlon) ** 2
    )
    # Round-off can push the term marginally outside [0, 1], which would make
    # sqrt/arcsin return NaN, so clamp it first.
    haversine = haversine.clip(lower=0.0, upper=1.0)

    # Coordinates -> great-circle distance feature.
    data['haversine_distance'] = 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

    # Encode N/Y as integer 0/1 (not the strings '0'/'1') so the column is
    # numeric when passed to the model.
    data['store_and_fwd_flag'] = data['store_and_fwd_flag'].map({'N': 0, 'Y': 1})

    return data

In [114]:
# Engineer the features on the training frame. process_data mutates its
# argument and returns that same object, so rebinding keeps home_data
# pointing at the processed frame.
home_data = process_data(home_data)

# Preview the processed rows.
print(home_data.head())

          id  vendor_id      pickup_datetime     dropoff_datetime  \
0  id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
1  id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
2  id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
3  id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
4  id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude store_and_fwd_flag  trip_duration  haversine_distance  
0         40.765602                  0            45

In [115]:
# Split the data into target and feature matrix
y = home_data['trip_duration']

features = ['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'haversine_distance']
X = home_data[features]

# Hold out a validation set (train_test_split's default split sizes);
# random_state fixes the shuffle so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

# Random forest with default hyperparameters, seeded for reproducibility
rf_model = RandomForestRegressor(random_state = 1)
rf_model.fit(train_X, train_y)

# Mean absolute error on the validation set.
# scikit-learn's signature is (y_true, y_pred): ground truth goes first.
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print(rf_val_predictions)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

[1632.83 1001.46 1094.56 ...  721.67  554.73 2284.13]
Validation MAE for Random Forest Model: 555


In [116]:
# Model fitted on the full training data (no validation hold-out).
# NOTE(review): these hyperparameters (100 trees, depth capped at 5) differ
# from the default-parameter model scored on the validation split, so that
# validation MAE does not describe this model — confirm which is intended.
rf_model_on_full_data = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)
rf_model_on_full_data.fit(X, y)

In [117]:
# Path to the test CSV
test_data_path = './input/test.csv'

# Load the test set and apply the same feature engineering as the training data
test_data = process_data(pd.read_csv(test_data_path))

# Feature matrix for prediction: same columns, same order as used for training
test_X = test_data[features]

# Predict with the model fitted on the full training data
test_preds = rf_model_on_full_data.predict(test_X)

In [163]:
# Cast every prediction to a Python int.
# NOTE(review): int() truncates toward zero rather than rounding — confirm
# that truncation is what the submission should contain.
temp_list = list(map(int, test_preds.tolist()))

# Build the output DataFrame: test ids alongside the predicted durations
output = pd.DataFrame({'id': test_data['id'], 'trip_duration': temp_list})

# Save the result without writing the DataFrame index as a column
output.to_csv(path_or_buf='submission.csv', index=False)
# Prints a confirmation message ("Submission file generated!")
print("提交文件已生成！")

提交文件已生成！
