In [2]:
import pandas as pd
import category_encoders as ce
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the raw taxi-trip pricing table from the local CSV export.
df = pd.read_csv(filepath_or_buffer=r'D:\DATASETS\taxi_trip_pricing.csv')

In [4]:
# Keep only the rows that have no missing values.
# A single vectorised dropna() replaces the per-row iterrows()/drop(inplace=True)
# loop, which rebuilt the frame once per incomplete row while iterating over it
# and removed rows by index label (so a duplicated label would have taken
# complete rows with it). Surviving rows keep their original index labels.
df = df.dropna(axis=0, how='any')

In [5]:
# Report how many distinct values each categorical feature takes.
# Keys are the printed labels, values are the columns they describe.
category_labels = {
    "Уникальные части суток:": 'Time_of_Day',
    "Выходные/будние:": 'Day_of_Week',
    "Виды загруженностей на дорогах:": 'Traffic_Conditions',
    "Типы погоды:": 'Weather',
}
for label, column in category_labels.items():
    # dropna=False counts a missing value as its own category, like len(unique()).
    print(label, df[column].nunique(dropna=False))

Уникальные части суток: 4
Выходные/будние: 2
Виды загруженностей на дорогах: 3
Типы погоды: 3


In [6]:
# One-hot encode the listed columns; every other column passes through as-is.
# Passenger_Count is in the list, so it is encoded as a category rather than
# used as a numeric feature.
categorical_columns = [
    'Time_of_Day',
    'Day_of_Week',
    'Traffic_Conditions',
    'Weather',
    'Passenger_Count',
]
encoder = ce.OneHotEncoder(cols=categorical_columns)
df_encoded = encoder.fit_transform(df)
df_encoded

Unnamed: 0,Trip_Distance_km,Time_of_Day_1,Time_of_Day_2,Time_of_Day_3,Time_of_Day_4,Day_of_Week_1,Day_of_Week_2,Passenger_Count_1,Passenger_Count_2,Passenger_Count_3,...,Traffic_Conditions_2,Traffic_Conditions_3,Weather_1,Weather_2,Weather_3,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,1,0,0,0,1,0,1,0,0,...,0,0,1,0,0,3.56,0.80,0.32,53.82,36.2624
2,36.87,0,1,0,0,0,1,0,1,0,...,1,0,1,0,0,2.70,1.21,0.15,37.27,52.9032
5,8.64,0,0,1,0,0,1,0,0,1,...,0,1,1,0,0,2.55,1.71,0.48,89.33,60.2028
12,41.79,0,0,0,1,0,1,1,0,0,...,1,0,1,0,0,4.60,1.77,0.11,86.95,88.1328
14,9.91,0,1,0,0,1,0,0,0,1,...,1,0,1,0,0,2.32,1.26,0.34,41.72,28.9914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,40.17,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,3.81,0.66,0.42,62.66,56.6394
992,14.34,0,0,1,0,1,0,0,1,0,...,0,1,1,0,0,3.23,1.01,0.29,45.07,30.7837
994,18.69,0,1,0,0,1,0,1,0,0,...,0,1,1,0,0,4.90,1.79,0.17,79.41,51.8548
995,5.49,0,0,1,0,0,1,0,0,0,...,0,1,1,0,0,2.39,0.62,0.49,58.39,34.4049


In [7]:
# Split the encoded table into features and target, then hold out 35% of the
# rows for evaluation (shuffled with a fixed seed so the split is repeatable).
target_column = 'Trip_Price'
X = df_encoded.drop(columns=[target_column])
# The target is read from the un-encoded frame, which df_encoded was built from.
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    shuffle=True,
    random_state=42,
)

In [9]:
# Ridge regression with the regularisation strength chosen by cross-validation.
# `store_cv_values` is intentionally not passed: it was only ever set to its
# default (False), and the keyword is deprecated in scikit-learn 1.5 and removed
# in 1.7 (renamed `store_cv_results`), so passing it breaks newer installs.
linear_model_ridgecv = RidgeCV(
    alphas=[1.0, 1.5, 2.0, 2.5, 3.0, 4, 5],  # candidate regularisation strengths to choose from
    fit_intercept=True,  # include the constant term b0 in the model
    scoring=None,  # model-selection metric; with cv=None the default is negative MSE (r2 only applies with an explicit cv)
    cv=None,  # CV generator or number of folds; None selects the efficient leave-one-out scheme
    gcv_mode='auto',  # leave-one-out computation strategy (auto/svd/eigen)
    alpha_per_target=False,  # whether to tune alpha separately for each target
)
linear_model_ridgecv.fit(X_train, y_train)



In [10]:
# Display the coefficients learned by the fitted RidgeCV model.
linear_model_ridgecv.coef_

array([ 1.80815511,  2.1910663 , -0.83583582, -0.71698618, -0.6382443 ,
       -0.7927679 ,  0.7927679 , -0.27800094,  0.4549061 , -0.27692223,
        0.10001707, -1.53536656,  3.75296544, -2.21759888, -0.17877915,
       -0.93856436,  1.11734351, -0.26241238, 24.81076879, 49.84893703,
        0.30077726])

In [11]:
# Predict on both partitions with the fitted model so that train and test
# errors can be compared afterwards.
predicted_y_train, predicted_y_test = (
    linear_model_ridgecv.predict(features) for features in (X_train, X_test)
)

In [12]:
# Compute MSE and R2 for each partition, then print them train-first.
mse_train, r2_train = (
    mean_squared_error(y_train, predicted_y_train),
    r2_score(y_train, predicted_y_train),
)
mse_test, r2_test = (
    mean_squared_error(y_test, predicted_y_test),
    r2_score(y_test, predicted_y_test),
)

metric_report = (
    ("Mean squared error (train): ", mse_train),
    ("R2 score (train): ", r2_train),
    ("Mean squared error (test): ", mse_test),
    ("R2 score (test): ", r2_test),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Mean squared error (train):  210.9008933390065
R2 score (train):  0.8903634407140885
Mean squared error (test):  228.84149635265348
R2 score (test):  0.8819536077653877
