# Installation

`pip`

```
pip install git+https://github.com/d2cml-ai/csdid-pyspark
pip install git+https://github.com/d2cml-ai/drdid
pip install pyspark findspark
```

`pipenv`

```
#csdid
pipenv install -e git+https://github.com/d2cml-ai/csdid-pyspark#egg=csdidspark
pipenv install --editable git+https://github.com/d2cml-ai/drdid#egg=drdid
pipenv install pyspark findspark
```

## Spark Configuration 


- `.config('spark.master', 'local[4]')`: Local execution with 4 CPU cores.
- `.config('spark.executor.memory', '1g')`: Amount of memory allocated to each executor process.
- `.config('spark.driver.cores', '5')`: Sets the number of CPU cores to be allocated to the driver.
- `.config('spark.rdd.compress', True)`: Enables RDD (Resilient Distributed Dataset) compression to save memory space and enhance performance.

In [14]:
import findspark
import warnings
# findspark.init() is called before pyspark is imported below.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide Spark session with the settings
# described in the "Spark Configuration" section.
spark = (
    SparkSession.builder
    .appName('pyspark-csdid')
    .config('spark.master', 'local[4]')
    .config('spark.executor.memory', '1g')
    .config('spark.driver.cores', '5')
    .config('spark.rdd.compress', True)
    .getOrCreate()
)

# Data

## Download 

In [15]:
%%time
import os, requests
# Direct-download links for the benchmark CSV files, listed in the same
# order as `data_names` so the two can be zipped together.
datasets_online = [
	'https://www.dropbox.com/scl/fi/e6xzb1nsiqixnhx7pvptn/5g10t.csv?rlkey=nalduzdksxxohgrth69fberlq&dl=1',
	'https://www.dropbox.com/scl/fi/edglr6xerr39mk8ch3ey7/5g40t.csv?rlkey=kchelnkdmy9kocl6ynu4aodsw&dl=1',
	'https://www.dropbox.com/scl/fi/twoqqb97mn6fo22dmdjbq/20g40t.csv?rlkey=imphdyf0k4nex00hdwotuvfsj&dl=1'
]

# Base names (without extension) used for the local copies.
data_names = ['5g10t', '5g40t', '20g40t']

# Local directory that holds the downloaded files, and the full local path
# of each dataset inside it.
data_path = 'data'
data_names1 = [f'{data_path}/{name}.csv' for name in data_names]

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(data_path, exist_ok=True)

def download_data(_from, _to):
	"""Download the file at URL `_from` and save it to the local path `_to`.

	Nothing is downloaded when `_to` already exists, so re-running the
	notebook does not fetch the data again.

	Parameters
	----------
	_from : str
		URL to download.
	_to : str
		Destination path of the local copy.

	Raises
	------
	requests.HTTPError
		If the server answers with a 4xx/5xx status.
	requests.RequestException
		On connection problems or when the server stops responding.
	"""
	if os.path.exists(_to):
		return
	# Stream into a sibling ".part" file and rename it only once the download
	# has finished: an interrupted run then never leaves a truncated file at
	# `_to` that the existence check above would treat as complete.
	partial_path = _to + '.part'
	# The timeout applies to connecting and to each wait for more bytes, so a
	# stalled connection cannot hang the notebook indefinitely.
	with requests.get(_from, stream=True, timeout=60) as response:
		# Fail loudly instead of saving an HTTP error page as a CSV file.
		response.raise_for_status()
		with open(partial_path, 'wb') as f:
			# Chunked writes keep memory use flat for large datasets.
			for chunk in response.iter_content(chunk_size=1024 * 1024):
				f.write(chunk)
	os.replace(partial_path, _to)

# Fetch each dataset into its local path; files already on disk are skipped
# inside download_data.
for source_url, target_file in zip(datasets_online, data_names1):
	download_data(source_url, target_file)


CPU times: total: 0 ns
Wall time: 0 ns


## CSdid with PySpark

In [20]:
from csdids.ATTgt import ATTgt
def csdid_estimate(data_path):
	"""Read the CSV at `data_path` with Spark and fit ATTgt on it.

	The file is loaded with a header row and inferred schema. The estimator
	is configured with outcome column 'Y', group column 'G', unit identifier
	'id' and time column 'period', then fitted with `bstrap=True`.

	The fitted object is not returned; the function returns None.
	"""
	panel = spark.read.csv(data_path, header=True, inferSchema=True)
	estimator = ATTgt(
		data=panel,
		tname='period',
		gname='G',
		yname='Y',
		idname='id',
	)
	estimator.fit(bstrap=True)

### 5 Groups, 1 Million

In [21]:
%%time
# Time the estimation on the first dataset path in data_names1.
csdid_estimate(data_names1[0])

100%|██████████| 9/9 [00:35<00:00,  3.97s/it]
100%|██████████| 9/9 [00:34<00:00,  3.87s/it]
100%|██████████| 9/9 [00:34<00:00,  3.85s/it]
100%|██████████| 9/9 [00:34<00:00,  3.81s/it]
100%|██████████| 9/9 [00:34<00:00,  3.82s/it]
100%|██████████| 5/5 [02:53<00:00, 34.78s/it]


CPU times: total: 1min 22s
Wall time: 3min 15s


### 5 Groups, 4 Million

In [None]:
%%time
# Time the estimation on the second dataset path in data_names1.
csdid_estimate(data_names1[1])

### 20 Groups, 4 Million

In [None]:
%%time
# Time the estimation on the third dataset path in data_names1.
csdid_estimate(data_names1[2])

In [3]:
# Stop the Spark session once the benchmarks are done.
spark.stop()