# 02 - Polars: Hiz Sampiyonu

Bu notebook'ta Polars'in Pandas'a gore ne kadar hizli oldugunu **NYC Taxi** verisi ile gorecegiz.

**Polars Ozellikleri:**
- Rust ile yazilmis (memory-safe, hizli)
- Multi-threaded execution (tum CPU core'lari)
- Lazy evaluation (query optimization)
- Apache Arrow format (%50 daha az bellek)
- 5-10x daha hizli

**Veri Seti:** NYC Yellow Taxi 2023 (12 ay, ~40M satir)

## 1. Kurulum

In [1]:
# Polars kurulumu
!pip install polars pyarrow -q

In [3]:
import polars as pl
import time
import json
import os
import psutil
import gc
import urllib.request

print(f"Polars version: {pl.__version__}")
print(f"CPU count: {os.cpu_count()}")

Polars version: 1.31.0
CPU count: 12


In [4]:
# Benchmark fonksiyonlari
results = {
    'framework': 'polars',
    'dataset': 'nyc_taxi_12_months',
    'operations': {}
}

def get_memory_mb():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

def benchmark(func, name):
    gc.collect()
    mem_before = get_memory_mb()
    start = time.time()
    result = func()
    end = time.time()
    mem_after = get_memory_mb()
    
    duration = end - start
    mem_used = mem_after - mem_before
    
    results['operations'][name] = {
        'duration_sec': round(duration, 3),
        'memory_mb': round(mem_used, 2)
    }
    
    print(f"\n{'='*50}")
    print(f"Operation: {name}")
    print(f"Sure: {duration:.3f} saniye")
    print(f"Bellek: {mem_used:.2f} MB")
    print(f"{'='*50}")
    
    return result

In [5]:
# Veri indirme - 12 ay
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)

MONTHS = ['2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06',
          '2023-07', '2023-08', '2023-09', '2023-10', '2023-11', '2023-12']
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{}.parquet"

taxi_files = []
total_size = 0

for month in MONTHS:
    filename = f"yellow_tripdata_{month}.parquet"
    filepath = os.path.join(DATA_DIR, filename)
    taxi_files.append(filepath)
    
    if not os.path.exists(filepath):
        url = BASE_URL.format(month)
        print(f"Indiriliyor: {filename}...")
        urllib.request.urlretrieve(url, filepath)
        print(f"Indirildi: {filename}")
    else:
        print(f"Mevcut: {filename}")
    
    total_size += os.path.getsize(filepath)

print(f"\nToplam dosya boyutu: {total_size / 1024**2:.1f} MB")
print(f"Dosya sayisi: {len(taxi_files)}")

Mevcut: yellow_tripdata_2023-01.parquet
Mevcut: yellow_tripdata_2023-02.parquet
Mevcut: yellow_tripdata_2023-03.parquet
Mevcut: yellow_tripdata_2023-04.parquet
Mevcut: yellow_tripdata_2023-05.parquet
Mevcut: yellow_tripdata_2023-06.parquet
Mevcut: yellow_tripdata_2023-07.parquet
Mevcut: yellow_tripdata_2023-08.parquet
Mevcut: yellow_tripdata_2023-09.parquet
Mevcut: yellow_tripdata_2023-10.parquet
Mevcut: yellow_tripdata_2023-11.parquet
Mevcut: yellow_tripdata_2023-12.parquet

Toplam dosya boyutu: 606.3 MB
Dosya sayisi: 12


## 2. Veri Yukleme (Eager Mode)

Polars, Parquet dosyalarini cok hizli yukler:
- Multi-threaded I/O
- Apache Arrow native format
- Zero-copy reading

In [9]:
# Bellek durumu
print("BELLEK DURUMU (Yukleme Oncesi)")
print("="*50)
print(f"Kullanilan: {get_memory_mb():.0f} MB")
print(f"Kullanilabilir: {psutil.virtual_memory().available / 1024**3:.1f} GB")

BELLEK DURUMU (Yukleme Oncesi)
Kullanilan: 5817 MB
Kullanilabilir: 69.1 GB


In [10]:
# 12 aylik veriyi yukle - POLARS cok daha hizli!
# Not: NYC Taxi dosyalarinda bazi aylarda farkli sutunlar ve tipler var
# diagonal_relaxed: farkli sema + farkli tipleri otomatik cast eder
def load_all_data():
    dfs = []
    for filepath in taxi_files:
        df_month = pl.read_parquet(filepath)
        dfs.append(df_month)
    return pl.concat(dfs, how='diagonal_relaxed')

df = benchmark(load_all_data, 'load_data')
print(f"\nToplam satir: {len(df):,}")
print(f"Sutun sayisi: {len(df.columns)}")
print(f"Bellek: {df.estimated_size() / 1024**3:.2f} GB")


Operation: load_data
Sure: 2.503 saniye
Bellek: 6821.56 MB

Toplam satir: 38,310,226
Sutun sayisi: 20
Bellek: 5.46 GB


In [11]:
# Veri yapisi
print("SUTUN BILGILERI")
print("="*60)
print(df.schema)
print(f"\nIlk 5 satir:")
df.head()

SUTUN BILGILERI
Schema({'VendorID': Int64, 'tpep_pickup_datetime': Datetime(time_unit='ns', time_zone=None), 'tpep_dropoff_datetime': Datetime(time_unit='ns', time_zone=None), 'passenger_count': Float64, 'trip_distance': Float64, 'RatecodeID': Float64, 'store_and_fwd_flag': String, 'PULocationID': Int64, 'DOLocationID': Int64, 'payment_type': Int64, 'fare_amount': Float64, 'extra': Float64, 'mta_tax': Float64, 'tip_amount': Float64, 'tolls_amount': Float64, 'improvement_surcharge': Float64, 'total_amount': Float64, 'congestion_surcharge': Float64, 'airport_fee': Float64, 'Airport_fee': Float64})

Ilk 5 satir:


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,"""N""",161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,
2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,"""N""",43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,
2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,"""N""",48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,
1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,"""N""",138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,
2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,"""N""",107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,


## 3. Temel Islemler

### 3.1 Filtreleme

In [12]:
def filter_trips():
    return df.filter(
        (pl.col('trip_distance') > 5) &
        (pl.col('fare_amount') > 20) &
        (pl.col('fare_amount') < 500)
    )

df_filtered = benchmark(filter_trips, 'filter_trips')
print(f"\nFiltrelenmis: {len(df_filtered):,} / {len(df):,}")
print(f"Oran: {len(df_filtered)/len(df)*100:.2f}%")


Operation: filter_trips
Sure: 0.194 saniye
Bellek: 1086.64 MB

Filtrelenmis: 6,542,088 / 38,310,226
Oran: 17.08%


### 3.2 GroupBy - Saatlik Analiz

In [13]:
def groupby_hour():
    return (
        df
        .with_columns(pl.col('tpep_pickup_datetime').dt.hour().alias('pickup_hour'))
        .group_by('pickup_hour')
        .agg([
            pl.col('fare_amount').mean().alias('avg_fare'),
            pl.col('trip_distance').mean().alias('avg_distance'),
            pl.col('tip_amount').mean().alias('avg_tip'),
            pl.count().alias('trip_count')
        ])
        .sort('pickup_hour')
    )

df_hourly = benchmark(groupby_hour, 'groupby_hour')
df_hourly


Operation: groupby_hour
Sure: 0.278 saniye
Bellek: 304.11 MB


(Deprecated in version 0.20.5)
  pl.count().alias('trip_count')


pickup_hour,avg_fare,avg_distance,avg_tip,trip_count
i8,f64,f64,f64,u32
0,19.85742,4.031465,3.496476,1088628
1,18.010486,3.631036,3.17027,731321
2,16.919905,4.401059,2.939176,483366
3,17.911575,4.760022,3.011689,319641
4,23.55361,10.257194,3.652719,217492
…,…,…,…,…
19,18.193056,3.712548,3.521175,2416756
20,18.680884,3.564202,3.560188,2153613
21,18.829774,3.82658,3.606422,2151209
22,19.636823,4.05853,3.672256,1994411


### 3.3 GroupBy - Aylik Analiz

In [14]:
def groupby_month():
    return (
        df
        .with_columns(pl.col('tpep_pickup_datetime').dt.month().alias('pickup_month'))
        .group_by('pickup_month')
        .agg([
            pl.col('fare_amount').mean().alias('avg_fare'),
            pl.col('fare_amount').sum().alias('total_fare'),
            pl.col('trip_distance').mean().alias('avg_distance'),
            pl.col('tip_amount').sum().alias('total_tips'),
            pl.count().alias('trip_count')
        ])
        .sort('pickup_month')
    )

df_monthly = benchmark(groupby_month, 'groupby_month')
df_monthly


Operation: groupby_month
Sure: 0.268 saniye
Bellek: 223.30 MB


(Deprecated in version 0.20.5)
  pl.count().alias('trip_count')


pickup_month,avg_fare,total_fare,avg_distance,total_tips,trip_count
i8,f64,f64,f64,f64,u32
1,18.367123,5.6328e7,3.847384,1.0329e7,3066759
2,18.220242,5.3094e7,3.867976,9.8632e6,2914003
3,18.908529,6.4358e7,3.903942,1.1897e7,3403660
4,19.360496,6.3662e7,4.09616,1.1548e7,3288248
5,19.876823,6.9840e7,4.345793,1.2684e7,3513664
…,…,…,…,…,…
8,19.718396,5.5689e7,4.782777,9.6323e6,2824201
9,20.671103,5.8845e7,4.274258,1.0320e7,2846741
10,20.061776,7.0663e7,3.926678,1.2796e7,3522280
11,19.651384,6.5630e7,3.632734,1.2084e7,3339732


### 3.4 Odeme Tipi Analizi

In [16]:
def payment_analysis():
    return (
        df
        .with_columns(
            pl.when(pl.col('payment_type') == 1).then(pl.lit('Credit Card'))
            .when(pl.col('payment_type') == 2).then(pl.lit('Cash'))
            .when(pl.col('payment_type') == 3).then(pl.lit('No Charge'))
            .when(pl.col('payment_type') == 4).then(pl.lit('Dispute'))
            .when(pl.col('payment_type') == 5).then(pl.lit('Unknown'))
            .when(pl.col('payment_type') == 6).then(pl.lit('Voided'))
            .otherwise(pl.lit('Other'))
            .alias('payment_name')
        )
        .group_by('payment_name')
        .agg([
            pl.col('fare_amount').mean().alias('avg_fare'),
            pl.col('tip_amount').mean().alias('avg_tip'),
            pl.col('total_amount').sum().alias('total_revenue'),
            pl.count().alias('trip_count')
        ])
        .sort('trip_count', descending=True)
    )

df_payment = benchmark(payment_analysis, 'payment_analysis')
df_payment

(Deprecated in version 0.20.5)
  pl.count().alias('trip_count')



Operation: payment_analysis
Sure: 0.393 saniye
Bellek: 497.53 MB


payment_name,avg_fare,avg_tip,total_revenue,trip_count
str,f64,f64,f64,u32
"""Credit Card""",19.814779,4.396301,890780000.0,29856932
"""Cash""",19.388154,0.001941,157050000.0,6405059
"""Other""",22.373175,2.773958,38883000.0,1309356
"""Dispute""",1.724659,0.05158,1130900.0,498015
"""No Charge""",8.127843,0.035321,2540600.0,240862
"""Unknown""",0.0,0.0,0.0,2


### 3.5 Location Analizi

In [17]:
def location_analysis():
    return (
        df
        .group_by('PULocationID')
        .agg([
            pl.count().alias('trip_count'),
            pl.col('fare_amount').mean().alias('avg_fare'),
            pl.col('trip_distance').mean().alias('avg_distance'),
            pl.col('tip_amount').mean().alias('avg_tip')
        ])
        .sort('trip_count', descending=True)
    )

df_locations = benchmark(location_analysis, 'location_analysis')
print("\nEn Populer 15 Pickup Lokasyonu:")
df_locations.head(15)

(Deprecated in version 0.20.5)
  pl.count().alias('trip_count'),



Operation: location_analysis
Sure: 0.895 saniye
Bellek: 3863.68 MB

En Populer 15 Pickup Lokasyonu:


PULocationID,trip_count,avg_fare,avg_distance,avg_tip
i64,u32,f64,f64,f64
132,1992304,61.080875,15.829447,8.688623
237,1791795,13.009022,1.92618,2.665287
161,1766041,16.473684,2.637109,3.230466
236,1596584,13.584188,2.137991,2.766769
162,1353753,15.879083,2.648804,3.152426
…,…,…,…,…
163,1110093,16.413195,2.627644,3.17447
239,1060722,14.579246,2.847327,2.945464
234,1028897,14.753209,2.331073,2.954009
48,1020565,16.118957,2.765539,2.945637


### 3.6 En Pahali Yolculuklar

In [18]:
def top_expensive_trips():
    return (
        df
        .select([
            'tpep_pickup_datetime', 'trip_distance',
            'fare_amount', 'tip_amount', 'total_amount',
            'PULocationID', 'DOLocationID'
        ])
        .sort('total_amount', descending=True)
        .head(1000)
    )

df_expensive = benchmark(top_expensive_trips, 'top_expensive_trips')
print("\nEn Pahali 10 Yolculuk:")
df_expensive.head(10)


Operation: top_expensive_trips
Sure: 1.830 saniye
Bellek: 27.79 MB

En Pahali 10 Yolculuk:


tpep_pickup_datetime,trip_distance,fare_amount,tip_amount,total_amount,PULocationID,DOLocationID
datetime[ns],f64,f64,f64,f64,i64,i64
2023-06-12 13:33:06,1.5,386983.63,0.0,386987.63,100,50
2023-09-02 15:15:39,21.3,187502.96,0.0,187513.9,239,132
2023-09-05 10:16:13,0.7,143163.45,0.0,143167.45,249,90
2023-09-11 14:54:55,0.0,19152.9,0.0,29156.9,43,264
2023-09-30 17:58:34,0.0,12015.47,0.0,12015.47,163,264
2023-10-23 20:43:13,0.0,6339.0,0.0,6339.0,48,125
2023-05-16 10:12:28,40.81,6300.9,0.0,6304.9,239,264
2023-12-12 07:51:03,0.0,95.16,4174.0,4269.16,264,264
2023-04-06 14:08:51,12.58,2449.5,0.0,2451.0,216,265
2023-12-20 18:49:49,6.7,2320.11,0.0,2372.79,233,40


### 3.7 Rolling Statistics (Hareketli Ortalama)

In [19]:
def daily_rolling_stats():
    # Gunluk toplam
    daily = (
        df
        .with_columns(pl.col('tpep_pickup_datetime').dt.date().alias('pickup_date'))
        .group_by('pickup_date')
        .agg([
            pl.col('fare_amount').sum().alias('daily_fare'),
            pl.col('trip_distance').sum().alias('daily_distance'),
            pl.count().alias('trip_count')
        ])
        .sort('pickup_date')
    )
    
    # 7 gunluk hareketli ortalama
    return daily.with_columns([
        pl.col('daily_fare').rolling_mean(window_size=7).alias('fare_7d_avg'),
        pl.col('trip_count').rolling_mean(window_size=7).alias('trips_7d_avg')
    ])

df_daily = benchmark(daily_rolling_stats, 'daily_rolling_stats')
df_daily.tail(10)

(Deprecated in version 0.20.5)
  pl.count().alias('trip_count')



Operation: daily_rolling_stats
Sure: 0.771 saniye
Bellek: 2352.81 MB


pickup_date,daily_fare,daily_distance,trip_count,fare_7d_avg,trips_7d_avg
date,f64,f64,u32,f64,f64
2023-12-24,1165500.0,213135.12,65697,1928500.0,102830.857143
2023-12-25,936450.2,232377.3,45466,1787300.0,94814.142857
2023-12-26,1427900.0,287616.43,68261,1663200.0,87131.0
2023-12-27,1709600.0,309827.57,81293,1586200.0,81787.0
2023-12-28,1770600.0,314216.55,84419,1521100.0,77119.142857
2023-12-29,1770300.0,317479.94,86171,1478400.0,73517.0
2023-12-30,1680600.0,320304.65,82501,1494400.0,73401.142857
2023-12-31,1472300.0,281323.76,76955,1538300.0,75009.428571
2024-01-01,17.2,2.41,2,1404500.0,68514.571429
2024-01-03,354.09,38.28,4,1200500.0,58763.571429


## 4. Lazy Evaluation (Query Optimization)

Polars'in en guclu ozelliklerinden biri Lazy evaluation:
- Query'ler hemen calistirilmaz
- Optimizer gereksiz islemleri eler
- Predicate/Projection pushdown

In [20]:
# Lazy mode - query optimization goster
print("LAZY EVALUATION DEMO")
print("="*50)

# LazyFrame olustur - her dosyayi ayri scan edip concat
lazy_dfs = [pl.scan_parquet(f) for f in taxi_files]
lazy_df = pl.concat(lazy_dfs, how='diagonal_relaxed')

# Lazy query - henuz calismiyor!
lazy_query = (
    lazy_df
    .filter(
        (pl.col('trip_distance') > 10) &
        (pl.col('fare_amount') > 50) &
        (pl.col('tip_amount') > 10)
    )
    .select(['tpep_pickup_datetime', 'trip_distance', 'fare_amount', 'tip_amount', 'total_amount'])
    .sort('total_amount', descending=True)
    .head(100)
)

print("\nQuery Plan (Optimized):")
print(lazy_query.explain())

LAZY EVALUATION DEMO

Query Plan (Optimized):
SORT BY [col("total_amount")]
  UNION
    PLAN 0:
      Parquet SCAN [data/yellow_tripdata_2023-01.parquet] [id: 134690868744608]
      PROJECT 5/19 COLUMNS
      SELECTION: [([([(col("trip_distance")) > (10.0)]) & ([(col("tip_amount")) > (10.0)])]) & ([(col("fare_amount")) > (50.0)])]
    PLAN 1:
      Parquet SCAN [data/yellow_tripdata_2023-02.parquet] [id: 134690868744640]
      PROJECT 5/19 COLUMNS
      SELECTION: [([([(col("tip_amount")) > (10.0)]) & ([(col("fare_amount")) > (50.0)])]) & ([(col("trip_distance")) > (10.0)])]
    PLAN 2:
      Parquet SCAN [data/yellow_tripdata_2023-03.parquet] [id: 134690868744656]
      PROJECT 5/19 COLUMNS
      SELECTION: [([([(col("trip_distance")) > (10.0)]) & ([(col("fare_amount")) > (50.0)])]) & ([(col("tip_amount")) > (10.0)])]
    PLAN 3:
      Parquet SCAN [data/yellow_tripdata_2023-04.parquet] [id: 134690868744672]
      PROJECT 5/19 COLUMNS
      SELECTION: [([([(col("tip_amount")) > (10.0)

In [21]:
# Simdi calistir
def lazy_execute():
    return lazy_query.collect()

df_lazy_result = benchmark(lazy_execute, 'lazy_top_trips')
print("\nEn Iyi 10 Uzun Mesafe + Yuksek Bahsis Yolculuklari:")
df_lazy_result.head(10)


Operation: lazy_top_trips
Sure: 0.281 saniye
Bellek: 7.91 MB

En Iyi 10 Uzun Mesafe + Yuksek Bahsis Yolculuklari:


tpep_pickup_datetime,trip_distance,fare_amount,tip_amount,total_amount
datetime[ns],f64,f64,f64,f64
2023-05-31 10:07:46,274.9,1055.5,222.22,1327.52
2023-06-27 18:47:04,349.0,999.0,206.65,1239.95
2023-06-02 11:05:49,214.5,810.1,140.0,981.6
2023-02-26 18:37:48,193.9,700.0,175.25,876.25
2023-06-01 16:11:13,130.3,480.0,180.0,687.05
2023-03-13 19:20:18,176.9,500.0,110.15,660.95
2023-09-03 18:22:16,91.3,603.6,50.0,656.85
2023-05-31 15:44:45,91.3,637.9,15.0,656.15
2023-06-11 10:39:17,63.4,513.0,107.4,644.45
2023-07-30 22:27:51,82.4,553.9,80.0,636.4


## 5. Polars vs Pandas Karsilastirma

In [22]:
# Pandas sonuclarini yukle
pandas_results = None
try:
    with open('results/pandas_benchmark.json', 'r') as f:
        pandas_results = json.load(f)
except FileNotFoundError:
    print("Pandas benchmark bulunamadi. Once 01_pandas_baseline.ipynb calistirin.")

In [23]:
if pandas_results:
    print("\n" + "="*70)
    print("POLARS vs PANDAS KARSILASTIRMA (12 Ay NYC Taxi)")
    print("="*70)
    print(f"{'Islem':<25} {'Pandas (s)':<12} {'Polars (s)':<12} {'Hizlanma':<10}")
    print("-"*70)
    
    total_pandas = 0
    total_polars = 0
    
    for op in results['operations']:
        if op in pandas_results['operations']:
            pandas_time = pandas_results['operations'][op]['duration_sec']
            polars_time = results['operations'][op]['duration_sec']
            speedup = pandas_time / polars_time if polars_time > 0 else float('inf')
            print(f"{op:<25} {pandas_time:<12.3f} {polars_time:<12.3f} {speedup:<10.1f}x")
            total_pandas += pandas_time
            total_polars += polars_time
    
    print("-"*70)
    overall_speedup = total_pandas / total_polars if total_polars > 0 else float('inf')
    print(f"{'TOPLAM':<25} {total_pandas:<12.3f} {total_polars:<12.3f} {overall_speedup:<10.1f}x")
else:
    print("Pandas benchmark verisi bulunamadi.")


POLARS vs PANDAS KARSILASTIRMA (12 Ay NYC Taxi)
Islem                     Pandas (s)   Polars (s)   Hizlanma  
----------------------------------------------------------------------
load_data                 5.228        2.503        2.1       x
filter_trips              1.232        0.194        6.4       x
groupby_hour              2.443        0.278        8.8       x
groupby_month             3.600        0.268        13.4      x
payment_analysis          4.316        0.393        11.0      x
location_analysis         1.295        0.895        1.4       x
top_expensive_trips       9.214        1.830        5.0       x
daily_rolling_stats       34.781       0.771        45.1      x
----------------------------------------------------------------------
TOPLAM                    62.109       7.132        8.7       x


## 6. Bellek Kullanimi

In [24]:
print("\nBELLEK KULLANIMI")
print("="*50)
print(f"DataFrame boyutu: {df.estimated_size() / 1024**3:.2f} GB")
print(f"Toplam process: {get_memory_mb() / 1024:.2f} GB")

if pandas_results:
    pandas_mem = pandas_results.get('total_memory_gb', 0)
    polars_mem = get_memory_mb() / 1024
    savings = (1 - polars_mem / pandas_mem) * 100 if pandas_mem > 0 else 0
    print(f"\nPandas bellek: {pandas_mem:.2f} GB")
    print(f"Polars bellek: {polars_mem:.2f} GB")
    print(f"Tasarruf: {savings:.1f}%")


BELLEK KULLANIMI
DataFrame boyutu: 5.46 GB
Toplam process: 20.51 GB

Pandas bellek: 8.98 GB
Polars bellek: 20.51 GB
Tasarruf: -128.4%


## 7. Sonuclari Kaydet

In [25]:
results['total_memory_mb'] = round(get_memory_mb(), 2)
results['total_memory_gb'] = round(get_memory_mb() / 1024, 2)
results['row_count'] = len(df)
results['dataframe_size_gb'] = round(df.estimated_size() / 1024**3, 2)

os.makedirs('results', exist_ok=True)
with open('results/polars_benchmark.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Sonuclar kaydedildi: results/polars_benchmark.json")
print("\n" + json.dumps(results, indent=2))

Sonuclar kaydedildi: results/polars_benchmark.json

{
  "framework": "polars",
  "dataset": "nyc_taxi_12_months",
  "operations": {
    "load_data": {
      "duration_sec": 2.503,
      "memory_mb": 6821.56
    },
    "filter_trips": {
      "duration_sec": 0.194,
      "memory_mb": 1086.64
    },
    "groupby_hour": {
      "duration_sec": 0.278,
      "memory_mb": 304.11
    },
    "groupby_month": {
      "duration_sec": 0.268,
      "memory_mb": 223.3
    },
    "payment_analysis": {
      "duration_sec": 0.393,
      "memory_mb": 497.53
    },
    "location_analysis": {
      "duration_sec": 0.895,
      "memory_mb": 3863.68
    },
    "top_expensive_trips": {
      "duration_sec": 1.83,
      "memory_mb": 27.79
    },
    "daily_rolling_stats": {
      "duration_sec": 0.771,
      "memory_mb": 2352.81
    },
    "lazy_top_trips": {
      "duration_sec": 0.281,
      "memory_mb": 7.91
    }
  },
  "total_memory_mb": 21005.52,
  "total_memory_gb": 20.51,
  "row_count": 38310226,
  

---

## Sonraki Adim

TB+ veri icin distributed processing: **Dask**

-> `03_dask_demo.ipynb`