In [137]:
import pandas as pd


# Load the cleaned data set.
# Day_of_Week is removed because I noticed that it does not affect the price at all;
# rows without a Weather value are discarded.
df = (
    pd.read_csv("../../data/cleaned_data.csv")
    .drop(columns="Day_of_Week")
    .dropna(subset=["Weather"])
)

# Dummy encoding: turn the two categorical columns into 0/1 integer columns.
# NOTE(review): every dummy level is kept (no drop_first), so each dummy group sums
# to 1 per row and is collinear with a model intercept; interpret the individual
# dummy coefficients with care.
df_encoded = pd.get_dummies(df, columns=["Time_of_Day", "Weather"], dtype=int)
df = df_encoded  # both names refer to the same encoded frame

df.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Weather_Clear,Weather_Rain,Weather_Snow
0,19.35,3.56,0.8,0.32,53.82,36.2624,0,0,1,0,1,0,0
1,36.87,2.7,1.21,0.15,37.27,52.9032,0,1,0,0,1,0,0
3,8.64,2.93,0.63,0.32,22.64,15.618,0,1,0,0,1,0,0
4,8.64,2.55,1.71,0.48,89.33,60.2028,1,0,0,0,1,0,0
5,3.85,3.51,1.66,0.27,5.05,11.2645,1,0,0,0,0,1,0


In [138]:
# Count the missing values in each column.
df.isnull().sum()

Trip_Distance_km         0
Base_Fare                0
Per_Km_Rate              0
Per_Minute_Rate          0
Trip_Duration_Minutes    0
Trip_Price               0
Time_of_Day_Afternoon    0
Time_of_Day_Evening      0
Time_of_Day_Morning      0
Time_of_Day_Night        0
Weather_Clear            0
Weather_Rain             0
Weather_Snow             0
dtype: int64

In [139]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Weather_Clear,Weather_Rain,Weather_Snow
count,808.0,808.0,808.0,808.0,808.0,808.0,808.0,808.0,808.0,808.0,808.0,808.0,808.0
mean,27.133094,3.475111,1.238041,0.289963,62.3775,56.860779,0.394802,0.215347,0.288366,0.101485,0.701733,0.232673,0.065594
std,19.863839,0.876148,0.428982,0.114727,31.889713,40.091755,0.489111,0.411317,0.453283,0.302157,0.457781,0.422797,0.247725
min,1.23,2.01,0.5,0.1,5.01,6.1269,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.8575,2.71,0.86,0.19,36.46,34.295725,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.895,3.485,1.22,0.28,62.015,50.3725,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,38.485,4.21,1.62,0.38,88.375,69.48505,1.0,0.0,1.0,0.0,1.0,0.0,0.0
max,146.067047,5.0,2.0,0.5,119.84,332.043689,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [140]:
# (rows, columns) of the encoded frame.
df.shape

(808, 13)

In [141]:
# Separate the target column (Trip_Price) from the feature columns.
y = df["Trip_Price"]
X = df.drop(columns="Trip_Price")
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Weather_Clear,Weather_Rain,Weather_Snow
0,19.35,3.56,0.8,0.32,53.82,0,0,1,0,1,0,0
1,36.87,2.7,1.21,0.15,37.27,0,1,0,0,1,0,0
3,8.64,2.93,0.63,0.32,22.64,0,1,0,0,1,0,0
4,8.64,2.55,1.71,0.48,89.33,1,0,0,0,1,0,0
5,3.85,3.51,1.66,0.27,5.05,1,0,0,0,0,1,0


In [142]:
# Peek at the first target values.
y.head()

0    36.2624
1    52.9032
3    15.6180
4    60.2028
5    11.2645
Name: Trip_Price, dtype: float64

## Scikit-learn steps

1. Train|test split or train|val|test split
2. Scale dataset
    - many algorithms require scaling, some don't
    - different types of scaling exist (e.g. feature standardization, min-max scaling)
    - scale training data and test data to the training data's parameters to avoid data leakage
3. Fit algorithm to train data
4. Predict on test data
5. Evaluation metrics

In [143]:
from sklearn.model_selection import train_test_split

# Hold out 33 % of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [144]:
# Report the shape of every split, one per line.
print(
    f"{X_train.shape = }",
    f"{y_train.shape = }",
    f"{X_test.shape = }",
    f"{y_test.shape = }",
    sep="\n",
)

X_train.shape = (541, 12)
y_train.shape = (541,)
X_test.shape = (267, 12)
y_test.shape = (267,)


In [145]:
# Peek at the first training feature rows.
X_train.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Weather_Clear,Weather_Rain,Weather_Snow
791,43.82,2.19,1.48,0.15,42.17,1,0,0,0,1,0,0
565,49.14,3.58,1.21,0.22,27.99,1,0,0,0,0,1,0
638,2.64,4.53,1.71,0.43,98.23,0,0,1,0,0,0,1
20,9.36,2.4,1.85,0.15,7.07,1,0,0,0,1,0,0
611,40.91,4.73,1.16,0.28,19.1,1,0,0,0,1,0,0


In [146]:
# Target values for the first training rows.
y_train.head()

791    73.3691
565    69.1972
638    51.2833
20     20.7765
611    57.5336
Name: Trip_Price, dtype: float64

In [147]:
from sklearn.preprocessing import MinMaxScaler

# Create the (not yet fitted) min-max scaler and show its class.
scaler = MinMaxScaler()
type(scaler)

sklearn.preprocessing._data.MinMaxScaler

In [148]:
# Learn each feature's min/max from the training split only and scale it;
# the test split is then transformed with those same training parameters.
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# The scaled training data should span the scaler's output range.
print(
    f"{scaled_X_train.min() = }",
    f"{scaled_X_train.max() = }",
    sep="\n",
)

scaled_X_train.min() = np.float64(0.0)
scaled_X_train.max() = np.float64(1.0)


In [149]:
# The test split was scaled with the training min/max, so its values may fall
# slightly outside the range seen for the training split.
print(
    f"{scaled_X_test.min() = }",
    f"{scaled_X_test.max() = }",
    sep="\n",
)

scaled_X_test.min() = np.float64(-0.0015212591068661761)
scaled_X_test.max() = np.float64(1.0000000000000002)


In [150]:
# Shape of the scaled training array.
scaled_X_train.shape

(541, 12)

In [151]:
# Show the first five scaled training rows.
scaled_X_train[:5]

array([[0.29298067, 0.06020067, 0.65333333, 0.125     , 0.32337312,
        1.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        ],
       [0.32976749, 0.52508361, 0.47333333, 0.3       , 0.19984319,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [0.00822863, 0.84280936, 0.80666667, 0.825     , 0.81174318,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 1.        ],
       [0.05469618, 0.13043478, 0.9       , 0.125     , 0.01759735,
        1.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        ],
       [0.27285857, 0.909699  , 0.44      , 0.45      , 0.12239742,
        1.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        ]])

In [152]:
from sklearn.linear_model import LinearRegression

# Instantiate an (unfitted) estimator from the LinearRegression class.
model = LinearRegression()
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [153]:
# Fit the linear model on the scaled training features, then show what it learned:
# one weight per feature column plus the intercept.
model.fit(scaled_X_train, y_train)
print(
    f"Parameters or weights: {model.coef_}",
    f"Intercept: {model.intercept_}",
    sep="\n",
)

Parameters or weights: [ 2.58866030e+02 -1.01362422e+00  3.94887783e+01  2.39863546e+01
  3.46227737e+01  5.72370987e-01 -9.56379135e-01  7.50676227e-01
 -3.66668079e-01  1.50549416e+00 -1.25643465e+00 -2.49059508e-01]
Intercept: -37.26744112340907


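Ju större vikt (i absolutbelopp) - desto mer påverkar den variabeln priset. Eftersom alla variabler är skalade till samma intervall går vikterna att jämföra med varandra.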

Intercept-värdet: det predikterade priset är ca -37 om alla (skalade) variabler är noll.

In [154]:
# Pair every feature name with its fitted coefficient so the weights are readable.
feature_names, coefficients = X_train.columns, model.coef_

coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coefficients})

print(coef_df)

                  Feature  Coefficient
0        Trip_Distance_km   258.866030
1               Base_Fare    -1.013624
2             Per_Km_Rate    39.488778
3         Per_Minute_Rate    23.986355
4   Trip_Duration_Minutes    34.622774
5   Time_of_Day_Afternoon     0.572371
6     Time_of_Day_Evening    -0.956379
7     Time_of_Day_Morning     0.750676
8       Time_of_Day_Night    -0.366668
9           Weather_Clear     1.505494
10           Weather_Rain    -1.256435
11           Weather_Snow    -0.249060


### Här såg jag att Day_of_Week (weekend eller weekday) inte påverkar priset alls, därför tas den kolumnen bort.

## PREDICTION

In [155]:
# First test row, shown with its original (unscaled) feature values.
X_test.iloc[0]

Trip_Distance_km          9.91
Base_Fare                 2.49
Per_Km_Rate               1.37
Per_Minute_Rate           0.42
Trip_Duration_Minutes    97.64
Time_of_Day_Afternoon     0.00
Time_of_Day_Evening       0.00
Time_of_Day_Morning       1.00
Time_of_Day_Night         0.00
Weather_Clear             1.00
Weather_Rain              0.00
Weather_Snow              0.00
Name: 725, dtype: float64

In [156]:
# Take the first scaled test row and give it a leading axis, so it has the
# 2-D (1, n_features) layout that predict() expects for a single sample.
sample_feature = scaled_X_test[0][None, :]
sample_feature

array([[0.05849933, 0.16053512, 0.58      , 0.8       , 0.80660336,
        0.        , 0.        , 1.        , 0.        , 1.        ,
        0.        , 0.        ]])

In [157]:
# Predict the price for the single scaled sample.
model.predict(sample_feature)

array([49.98891642])

In [158]:
# Actual price of the first test row, for comparison with the prediction.
y_test.iloc[0]

np.float64(57.075500000000005)

## I FÖRSTA ANBLICK SUPERKASST :)

### Nu predicta på hela test-settet

In [159]:
# Predict prices for the whole scaled test set and show the first five.
y_pred = model.predict(scaled_X_test)
y_pred[:5]

array([ 49.98891642, -26.06935003,  25.62634569,   5.9579831 ,
        16.46578904])

In [160]:
# First five actual test prices, to compare against the predictions.
y_test.iloc[:5]

725    57.0755
307     6.1269
237    33.4311
347    19.8829
558    24.6913
Name: Trip_Price, dtype: float64

### Mycket bättre resultat

## EVALUATE

- mae - mean absolute error
- mse - mean squared error
- rmse - root mean squared error

KOLLAR GENOMSNITTLIGA FEL

In [161]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Average error of the test-set predictions, in three flavours:
# MAE (mean absolute error), MSE (mean squared error) and RMSE (square root of MSE).
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"{mae = }", f"{mse = }", f"{rmse = }", sep="\n")

mae = 8.869750480367594
mse = 141.41828132084666
rmse = np.float64(11.891941865012907)


### Efter att jag lade till weather så blev det genomsnittliga felet lägre.