## Install Modin[Ray]

In [1]:
! pip install modin[ray]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import time
import ray
import pandas
import warnings
import urllib.request
import modin.pandas as pd

os.environ["MODIN_ENGINE"] = "ray"
warnings.filterwarnings('ignore')

In [3]:
ray.init()

2023-01-16 10:20:39,959	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.16
Ray version:,2.2.0
Dashboard:,http://127.0.0.1:8265


## Get Dataset

In [4]:
file = "../dataset/taxi.csv"

In [5]:
s3_path = "https://modin-datasets.s3.amazonaws.com/testing/yellow_tripdata_2015-01.csv"
urllib.request.urlretrieve(s3_path, file)

## 200 MB Dataset

('/content/drive/MyDrive/data science cook book/_dataset/taxi.csv',
 <http.client.HTTPMessage at 0x7f0e5bd88df0>)

## Read CSV

In [6]:
start = time.time()

pandas_df = pandas.read_csv(file, parse_dates=["tpep_pickup_datetime", 
                                               "tpep_dropoff_datetime"], 
                            quoting=3)

end = time.time()
pandas_duration = end - start
print("Time to read with pandas: {} seconds".format(round(pandas_duration, 3)))

Time to read with pandas: 7.411 seconds


In [7]:
start = time.time()

modin_df = pd.read_csv(file, parse_dates=["tpep_pickup_datetime", 
                                          "tpep_dropoff_datetime"], 
                       quoting=3)

end = time.time()
modin_duration = end - start
print("Time to read with Modin: {} seconds".format(round(modin_duration, 3)))

print("Modin is {}x faster than pandas at `read_csv`!".format(round(pandas_duration / modin_duration, 2)))

Time to read with Modin: 12.482 seconds
Modin is 0.59x faster than pandas at `read_csv`!


## Concat

In [8]:
start = time.time()

big_pandas_df = pandas.concat([pandas_df for _ in range(20)])
print(big_pandas_df.shape)
end = time.time()
pandas_duration = end - start
print("Time to concat with pandas: {} seconds".format(round(pandas_duration, 3)))


(38670140, 21)
Time to concat with pandas: 7.119 seconds


In [9]:
start = time.time()

big_modin_df = pd.concat([modin_df for _ in range(20)])
print(big_modin_df.shape)
end = time.time()
modin_duration = end - start
print("Time to concat with Modin: {} seconds".format(round(modin_duration, 3)))

print("Modin is {}x faster than pandas at `concat`!".format(round(pandas_duration / modin_duration, 2)))

(38670140, 21)
Time to concat with Modin: 0.089 seconds
Modin is 79.93x faster than pandas at `concat`!
