In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
data = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')

In [12]:
data.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [13]:
num_columns = len(data.columns)
num_columns

19

In [4]:

# Pickup ve dropoff zamanlarını datetime formatına çevir
data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])

# Süreyi hesapla (dakika cinsinden)
data['duration'] = (data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']).dt.total_seconds() / 60

# Standart sapmayı hesapla
std_duration = data['duration'].std()
std_duration

42.59435124195458

In [5]:
# Orijinal kayıt sayısı
original_count = len(data)

# 1 ile 60 dakika arasındaki süreleri filtrele
filtered_data = data[(data['duration'] >= 1) & (data['duration'] <= 60)]

# Kalan kayıt sayısı
filtered_count = len(filtered_data)

# Kalan kayıtların oranı
fraction_remaining = filtered_count / original_count
fraction_remaining

0.9812202822125979

In [6]:

# Kategorik kolonlar
categorical = ['PULocationID', 'DOLocationID']

# Pickup ve dropoff location ID'lerini string'e çevir ve eksik değerleri doldur
filtered_data[categorical] = filtered_data[categorical].fillna(-1).astype('int')
filtered_data[categorical] = filtered_data[categorical].astype('str')

# Veriyi sözlükler listesine dönüştür
train_dicts = filtered_data[categorical].to_dict(orient='records')

# Bellek kullanımı optimize etmek için DictVectorizer'da sparse=True kullan
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Elde edilen matrisin boyutunu belirle
dimensionality = X_train.shape[1]
dimensionality

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[categorical] = filtered_data[categorical].fillna(-1).astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[categorical] = filtered_data[categorical].astype('str')


515

In [7]:

# Hedef değişkeni oluştur
y_train = filtered_data['duration'].values

# Lineer regresyon modelini eğit
model = LinearRegression()
model.fit(X_train, y_train)

# Eğitim verisi üzerinde tahminlerde bulun
y_pred_train = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_train



7.649261932106969

In [8]:
!wget -P ../data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2024-05-24 19:50:52--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.190.224, 18.239.190.168, 18.239.190.192, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.190.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [application/x-www-form-urlencoded]
Saving to: ‘../data/yellow_tripdata_2023-02.parquet.1’


2024-05-24 19:50:53 (71.0 MB/s) - ‘../data/yellow_tripdata_2023-02.parquet.1’ saved [47748012/47748012]



In [None]:
# Read the validation data
data_feb = pd.read_parquet('../data/yellow_tripdata_2023-02.parquet')

# Convert pickup and dropoff times to datetime
data_feb['tpep_pickup_datetime'] = pd.to_datetime(data_feb['tpep_pickup_datetime'])
data_feb['tpep_dropoff_datetime'] = pd.to_datetime(data_feb['tpep_dropoff_datetime'])

# Calculate duration in minutes
data_feb['duration'] = (data_feb['tpep_dropoff_datetime'] - data_feb['tpep_pickup_datetime']).dt.total_seconds() / 60

# Filter trips with duration between 1 and 60 minutes
filtered_data_feb = data_feb[(data_feb['duration'] >= 1) & (data_feb['duration'] <= 60)].copy()

# Fill missing values and convert to string
filtered_data_feb[categorical] = filtered_data_feb[categorical].fillna(-1).astype('int')
filtered_data_feb[categorical] = filtered_data_feb[categorical].astype('str')

# Convert to list of dictionaries
val_dicts = filtered_data_feb[categorical].to_dict(orient='records')

# Get the feature matrix for the validation data
X_val = dv.transform(val_dicts)

# Predict and calculate RMSE on the validation data
y_pred_val = model.predict(X_val)
rmse_val = mean_squared_error(filtered_data_feb['duration'], y_pred_val, squared=False)
rmse_val