In [1]:
# coding: utf-8

### Import Packages
import pandas as pd
import numpy as np
import elasticsearch
import re
import json
from datetime import datetime
from elasticsearch import helpers
from time import perf_counter
import concurrent
import multiprocessing

#from numba import jit

# Create the Elasticsearch client instance shared by the notebook-level cells.
# No connection arguments are passed, so the client library's defaults apply.
es = elasticsearch.Elasticsearch()



In [2]:
### Helper Functions
# convert np.int64 into int. json.dumps does not work with int64
class SetEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values into built-in Python types.

    ``json.dumps`` cannot serialize NumPy scalars or arrays directly, so this
    encoder maps them onto their plain Python equivalents first.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: the value the standard encoder could not serialize.

        Returns:
            An ``int``, ``float``, ``bool`` or ``list`` for NumPy inputs.

        Raises:
            TypeError: from the base class, for any other unsupported type.
        """
        if isinstance(obj, np.integer):
            # The np.int alias was removed in NumPy 1.24; built-in int is the
            # documented replacement and works on every NumPy version.
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Anything else: let the base class raise its usual TypeError.
        return json.JSONEncoder.default(self, obj)

# Convert datestamp into ISO format
def str_to_iso(text):
    """Convert a date/time string into its ISO 8601 representation.

    Accepted layouts are ``YYYY-MM-DD HH:MM:SS.ffffff``, ``YYYY-MM-DD HH:MM:SS``
    and ``YYYY-MM-DD``, tried in that order.

    Returns:
        The ISO-formatted string, or None when ``text`` is the empty string.

    Raises:
        ValueError: when ``text`` matches none of the accepted layouts.
    """
    if text == '':
        return None

    known_formats = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    for pattern in known_formats:
        try:
            parsed = datetime.strptime(text, pattern)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
        return parsed.isoformat()

    raise ValueError('no valid date format found')

# Custom groupby function
def concatdf(x):
    """Collapse a group of values for the per-project aggregation.

    Returns a plain list when the group holds more than one value, otherwise
    the group's first (only) element, fetched positionally via ``.iloc``.
    """
    return x.iloc[0] if len(x) <= 1 else list(x)

In [3]:
### Import Data
# Load projects, resources & donations data
print("Loading datasets")
start = perf_counter()

# The files are read without a header row, so every column name is supplied here.
PROJECT_COLUMNS = [
    'projectid', 'teacher_acctid', 'schoolid', 'school_ncesid',
    'school_latitude', 'school_longitude', 'school_city', 'school_state',
    'school_zip', 'school_metro', 'school_district', 'school_county',
    'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
    'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
    'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
    'primary_focus_subject', 'primary_focus_area',
    'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
    'poverty_level', 'grade_level', 'vendor_shipping_charges', 'sales_tax',
    'payment_processing_charges', 'fulfillment_labor_materials',
    'total_price_excluding_optional_support',
    'total_price_including_optional_support', 'students_reached',
    'total_donations', 'num_donors', 'eligible_double_your_impact_match',
    'eligible_almost_home_match', 'funding_status', 'date_posted',
    'date_completed', 'date_thank_you_packet_mailed', 'date_expiration',
]
DONATION_COLUMNS = [
    'donationid', 'projectid', 'donor_acctid', 'cartid', 'donor_city',
    'donor_state', 'donor_zip', 'is_teacher_acct', 'donation_timestamp',
    'donation_to_project', 'donation_optional_support', 'donation_total',
    'donation_included_optional_support', 'payment_method',
    'payment_included_acct_credit', 'payment_included_campaign_gift_card',
    'payment_included_web_purchased_gift_card', 'payment_was_promo_matched',
    'is_teacher_referred', 'giving_page_id', 'giving_page_type',
    'for_honoree', 'thank_you_packet_mailed',
]
RESOURCE_COLUMNS = [
    'resourceid', 'projectid', 'vendorid', 'vendor_name', 'item_name',
    'item_number', 'item_unit_price', 'item_quantity',
]

# All three files use a backslash as the escape character.
projects = pd.read_csv(
    './data/opendata_projects000.gz', escapechar='\\', names=PROJECT_COLUMNS
)
donations = pd.read_csv(
    './data/opendata_donations000.gz', escapechar='\\', names=DONATION_COLUMNS
)
resources = pd.read_csv(
    './data/opendata_resources000.gz', escapechar='\\', names=RESOURCE_COLUMNS
)

end = perf_counter()
print(end - start)

Loading datasets
127.89902660000371


In [4]:
### Data Cleanup
# replace nan with ''
print("Cleaning Data")
start = perf_counter()

# Missing values become empty strings in all three frames.
projects, donations, resources = (
    frame.fillna('') for frame in (projects, donations, resources)
)

# Clean up column names: strip one leading space (donations/resources only)
# and then one leading underscore.
donations.columns = [
    re.sub('^_', '', re.sub('^ ', '', name)) for name in donations.columns
]
projects.columns = [re.sub('^_', '', name) for name in projects.columns]
resources.columns = [
    re.sub('^_', '', re.sub('^ ', '', name)) for name in resources.columns
]

# Wrap each resources projectid in double quotes.
# NOTE(review): this is meant to match the projectid format of the other two
# frames; if those ids are not quoted after loading, the later merge on
# projectid will find no resource matches -- confirm against the data.
resources['projectid'] = '"' + resources['projectid'] + '"'

# Give the resource detail columns a "resource_" prefix.
prefixed_names = {
    name: 'resource_' + name
    for name in ('vendorid', 'vendor_name', 'item_name', 'item_number',
                 'item_unit_price', 'item_quantity')
}
resources = resources.rename(columns=prefixed_names)

end = perf_counter()
print(end - start)


Cleaning Data
47.48128190002171


In [5]:
### Merge multiple resource row per projectid into a single row
# NOTE: section may take a few minutes to execute
print("Grouping Data by ProjectId")
start = perf_counter()

# a DataFrameGroupBy keyed on projectid
resources_grouped_by_projectid = resources.groupby('projectid')

# try and set this up for parallel operations later
def do_concat(one_group_by):
    """Apply concatdf to every group of one SeriesGroupBy and return the result."""
    return one_group_by.apply(concatdf)

# Aggregate one column (one SeriesGroupBy) at a time. The grouping key is
# skipped: it already becomes the index of every aggregated column, so
# aggregating it only to overwrite and drop it again is wasted work.
aggregated_columns = {}
for a in resources.columns.values:
    if a == 'projectid':
        continue
    print("column "+ a)
    aggregated_columns[a] = do_concat(resources_grouped_by_projectid[a])

# Build the frame once; it ends up indexed by projectid with one column per
# remaining resources column, which is what the later merge expects.
concat_resource = pd.DataFrame(aggregated_columns)
concat_resource.index.name = 'projectid'

end = perf_counter()
print(end - start)

Grouping Data by ProjectId
column resourceid
column projectid
column resource_vendorid
column resource_vendor_name
column resource_item_name
column resource_item_number
column resource_item_unit_price
column resource_item_quantity
1035.1818780999747


In [6]:
### Rename Project columns
print("Renaming project columns")
start = perf_counter()

# Prefix every column except the join key with "project_".
projects.columns = [
    name if name == 'projectid' else "project_" + name
    for name in projects.columns
]

# Make projectid the index (the column itself is dropped from the frame).
projects.set_index('projectid', inplace=True, drop=True)

end = perf_counter()
print(end - start)

Renaming project columns
0.05398359999526292


In [7]:
#### Merge data into single frame
print("Merging datasets")
start = perf_counter()

# Left-join the grouped resources onto projects, then left-join that result
# onto every donation; any value still missing becomes an empty string.
data = projects.merge(concat_resource, how='left', on='projectid')
data = donations.merge(data, how='left', on='projectid').fillna('')

end = perf_counter()
print(end - start)

Merging datasets
91.23014629998943


In [8]:
#### Process columns
# Modify date formats
print("Modifying Date Formats")
start = perf_counter()

# Rewrite every date/timestamp column through str_to_iso.
date_columns = (
    'project_date_expiration',
    'project_date_posted',
    'project_date_thank_you_packet_mailed',
    'project_date_completed',
    'donation_timestamp',
)
for column in date_columns:
    data[column] = data[column].map(str_to_iso)

# Create location field that combines lon/lat information, longitude first,
# then drop the two source columns.
lon_lat = data[['project_school_longitude','project_school_latitude']]
data['project_location'] = lon_lat.values.tolist()
for coordinate in ('project_school_latitude', 'project_school_longitude'):
    del data[coordinate]

end = perf_counter()
print(end - start)

Modifying Date Formats
1603.7444493999938


In [10]:
### Create and configure Elasticsearch index
print("Preparing to Index to ES")
start = perf_counter()

index_name = 'donorschoose'  # name of the index
doc_name = 'donation'        # document type

# Start from a clean slate: drop the index if it is already there, then
# create it again.
index_already_there = es.indices.exists(index=index_name)
if index_already_there:
    es.indices.delete(index=index_name)
es.indices.create(index=index_name)

# Load the mapping definition from disk and attach it to the new index.
with open('donorschoose_mapping.json') as json_mapping:
    d = json.loads(json_mapping.read())

es.indices.put_mapping(
    index=index_name,
    doc_type=doc_name,
    body=d,
    include_type_name=True,
)

end = perf_counter()
print(end - start)

Preparing to Index to ES
0.4676792999962345




In [11]:
### function used by all below
def read_data(df, index=None, doc_type=None):
    """Yield one Elasticsearch bulk action per row of ``df``.

    Progress is reported by row position rather than by index label, so the
    generator also works on frames whose index is not 0-based integers
    (for example the chunks used by the pool approach).

    Args:
        df: DataFrame whose rows are the documents. It must contain a
            'donationid' column, which is used as the document ``_id``.
        index: target index name; defaults to the notebook-level ``index_name``.
        doc_type: document type; defaults to the notebook-level ``doc_name``.

    Yields:
        dict with the keys ``_index``, ``_id``, ``_type`` and ``_source``
        (``_source`` is the row converted with ``to_dict()``).
    """
    # The notebook globals are only looked up when no override is given.
    target_index = index_name if index is None else index
    target_type = doc_name if doc_type is None else doc_type
    total_rows = len(df.index)

    for position, (_, thisDonation) in enumerate(df.iterrows()):
        # print every 10000 iteration
        if position % 10000 == 0:
            print('{} / {}'.format(position, total_rows))
        doc = {
            "_index": target_index,
            "_id": thisDonation['donationid'],
            "_type": target_type,
            "_source": thisDonation.to_dict(),
        }
        # dump a full sample document every 100000 rows
        if position % 100000 == 0:
            print('doc: {}'.format(doc))
        yield doc

## Execution blocks
Run only one of the following three options:
* es parallel-bulk
* es bulk
* Pool es bulk

### parallel bulk - sometimes exits early on last chunk

In [12]:
### Index Data into Elasticsearch - parallel bulk - default parallel_bulk thread_count = 4
print("Indexing parallel_bulk")
start = perf_counter()

# parallel_bulk returns a generator which must be consumed for any indexing
# to happen: https://elasticsearch-py.readthedocs.io/en/master/helpers.html
# default request_timeout=10
# 1000 may have timeout
bulk_results = helpers.parallel_bulk(
    es,
    read_data(data),
    thread_count=8,
    request_timeout=20.0,
    chunk_size=500,
    index=index_name,
    doc_type=doc_name,
)
for success, info in bulk_results:
    if success:
        continue
    print('A document failed:', info)

end = perf_counter()
print(end - start)

Indexing parallel_bulk
0 / 6211956
doc: {'_index': 'donorschoose', '_id': '0000023f507999464aa2b78875b7e5d6', '_type': 'donation', '_source': {'donationid': '0000023f507999464aa2b78875b7e5d6', 'projectid': '69bf3a609bb4673818e0eebd004ea504', 'donor_acctid': '22c50856b0824db76daf527da6af9abf', 'cartid': '39e6f19c1b8e7652bf6d1a2c7c7b8819', 'donor_city': '', 'donor_state': '', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2011-02-13T11:07:19.349000', 'donation_to_project': 8.5, 'donation_optional_support': 1.5, 'donation_total': 10.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'giving_page_type': '', 'for_honoree': 'f', 'thank_you_packet_mailed': 'f', 'project_teacher_acctid': 'b023fd4d537d99f7cf728306b5377cea', 'project_schoolid



10000 / 6211956
20000 / 6211956
30000 / 6211956
40000 / 6211956
50000 / 6211956
60000 / 6211956
70000 / 6211956
80000 / 6211956
90000 / 6211956
100000 / 6211956
doc: {'_index': 'donorschoose', '_id': '0422b88431fd0691499ebd3a8133cdbf', '_type': 'donation', '_source': {'donationid': '0422b88431fd0691499ebd3a8133cdbf', 'projectid': '27903f4543c537e3ae04b04e56c9d493', 'donor_acctid': 'af50cb46f3d253b2f65e07e4ad901453', 'cartid': '', 'donor_city': '', 'donor_state': '  ', 'donor_zip': 100.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2015-10-29T12:38:05.671000', 'donation_to_project': 85.0, 'donation_optional_support': 15.0, 'donation_total': 100.0, 'donation_included_optional_support': 't', 'payment_method': 'promo_code_match', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 't', 'giving_page_id': 'd3b6bcb548e3155d9ea2717ddc025216', 'giving_page_ty

310000 / 6211956
320000 / 6211956
330000 / 6211956
340000 / 6211956
350000 / 6211956
360000 / 6211956
370000 / 6211956
380000 / 6211956
390000 / 6211956
400000 / 6211956
doc: {'_index': 'donorschoose', '_id': '10793364ad4c13b0c67ca8ac49489944', '_type': 'donation', '_source': {'donationid': '10793364ad4c13b0c67ca8ac49489944', 'projectid': 'c208f9ca9a8d36f04e93c00c000c3a04', 'donor_acctid': 'aa3482a596405f3e10382a213ffbbf90', 'cartid': 'c513c91eefe96e9e419d436fc2f97d04', 'donor_city': '', 'donor_state': 'NJ', 'donor_zip': 80.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2014-03-22T15:27:25.065000', 'donation_to_project': 8.5, 'donation_optional_support': 1.5, 'donation_total': 10.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'giving_page_typ

610000 / 6211956
620000 / 6211956
630000 / 6211956
640000 / 6211956
650000 / 6211956
660000 / 6211956
670000 / 6211956
680000 / 6211956
690000 / 6211956
700000 / 6211956
doc: {'_index': 'donorschoose', '_id': '1cd6bb7a4e42bbeee03061dba8732798', '_type': 'donation', '_source': {'donationid': '1cd6bb7a4e42bbeee03061dba8732798', 'projectid': 'e9a83e27ee7aee0dbba772e58e3d976f', 'donor_acctid': '5ad8fea799715ea37bb5a50366be6394', 'cartid': 'a74b0ce6fb2d1074eb92ba9543bfe881', 'donor_city': '', 'donor_state': '', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2010-12-05T17:54:41.957000', 'donation_to_project': 12.75, 'donation_optional_support': 2.25, 'donation_total': 15.0, 'donation_included_optional_support': 't', 'payment_method': 'no_cash_received', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 't', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'giving_pag

910000 / 6211956
920000 / 6211956
930000 / 6211956
940000 / 6211956
950000 / 6211956
960000 / 6211956
970000 / 6211956
980000 / 6211956
990000 / 6211956
1000000 / 6211956
doc: {'_index': 'donorschoose', '_id': '292b4376fd79143b012da75d0a24acbb', '_type': 'donation', '_source': {'donationid': '292b4376fd79143b012da75d0a24acbb', 'projectid': 'b9f505902b6d854a0515d93c21448fcb', 'donor_acctid': 'b83ea08f81ad5abeeaf37df806f1925a', 'cartid': '', 'donor_city': '', 'donor_state': 'CA', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2014-01-13T16:41:01.118000', 'donation_to_project': 42.5, 'donation_optional_support': 7.5, 'donation_total': 50.0, 'donation_included_optional_support': 't', 'payment_method': 'promo_code_match', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 't', 'giving_page_id': 'a52977c4796b761e1d361cad585d89c1', 'giving_pa

1210000 / 6211956
1220000 / 6211956
1230000 / 6211956
1240000 / 6211956
1250000 / 6211956
1260000 / 6211956
1270000 / 6211956
1280000 / 6211956
1290000 / 6211956
1300000 / 6211956
doc: {'_index': 'donorschoose', '_id': '35935cee85f724883e2e5622db7cbfd2', '_type': 'donation', '_source': {'donationid': '35935cee85f724883e2e5622db7cbfd2', 'projectid': '14ac77393b578431a9816adbd517bf80', 'donor_acctid': '58cb8004888d0271221a465a9ea87686', 'cartid': '9f02c69e8aebf3f9298cc01faf152bea', 'donor_city': '', 'donor_state': 'TX', 'donor_zip': 773.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2012-09-17T09:02:13.749000', 'donation_to_project': 27.39, 'donation_optional_support': 4.83, 'donation_total': 32.22, 'donation_included_optional_support': 't', 'payment_method': 'paypal', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 't', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'givi

1510000 / 6211956
1520000 / 6211956
1530000 / 6211956
1540000 / 6211956
1550000 / 6211956
1560000 / 6211956
1570000 / 6211956
1580000 / 6211956
1590000 / 6211956
1600000 / 6211956
doc: {'_index': 'donorschoose', '_id': '41e32600834831fdc51834273c26b834', '_type': 'donation', '_source': {'donationid': '41e32600834831fdc51834273c26b834', 'projectid': 'ddacd961721d6382bddedc0da3c0facf', 'donor_acctid': 'f0fdc20cfc6bcef698da4cd1f66f74bd', 'cartid': '146dd6c1fa9ed75cffe8ef74fcda6074', 'donor_city': '', 'donor_state': 'MA', 'donor_zip': 27.0, 'is_teacher_acct': 't', 'donation_timestamp': '2015-01-30T07:48:23.523000', 'donation_to_project': 8.5, 'donation_optional_support': 1.5, 'donation_total': 10.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'givin

1810000 / 6211956
1820000 / 6211956
1830000 / 6211956
1840000 / 6211956
1850000 / 6211956
1860000 / 6211956
1870000 / 6211956
1880000 / 6211956
1890000 / 6211956
1900000 / 6211956
doc: {'_index': 'donorschoose', '_id': '4e4e5e12000ed6a9d17e8498920993d1', '_type': 'donation', '_source': {'donationid': '4e4e5e12000ed6a9d17e8498920993d1', 'projectid': '2a9c3b6d0ee57adc910a12bd4751b8b6', 'donor_acctid': '67641350d3f37062290f509011c37957', 'cartid': '', 'donor_city': '', 'donor_state': '  ', 'donor_zip': 100.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2016-06-30T18:17:10.274000', 'donation_to_project': 21.25, 'donation_optional_support': 3.75, 'donation_total': 25.0, 'donation_included_optional_support': 't', 'payment_method': 'promo_code_match', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 't', 'giving_page_id': '1dc44726032850e9b1df0fb2353876a

2110000 / 6211956
2120000 / 6211956
2130000 / 6211956
2140000 / 6211956
2150000 / 6211956
2160000 / 6211956
2170000 / 6211956
2180000 / 6211956
2190000 / 6211956
2200000 / 6211956
doc: {'_index': 'donorschoose', '_id': '5aa37b86142f7872796b83e23fb4284a', '_type': 'donation', '_source': {'donationid': '5aa37b86142f7872796b83e23fb4284a', 'projectid': 'a9dd517931efc43756c9fa6d71ca5a43', 'donor_acctid': '406518edf1c30d599de90e2055a697fc', 'cartid': 'f1af8ff7569a0bf6721087cb73f3b892', 'donor_city': 'Columbus', 'donor_state': 'GA', 'donor_zip': 319.0, 'is_teacher_acct': 't', 'donation_timestamp': '2016-06-19T10:17:38.222000', 'donation_to_project': 0.85, 'donation_optional_support': 0.15, 'donation_total': 1.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': 

2410000 / 6211956
2420000 / 6211956
2430000 / 6211956
2440000 / 6211956
2450000 / 6211956
2460000 / 6211956
2470000 / 6211956
2480000 / 6211956
2490000 / 6211956
2500000 / 6211956
doc: {'_index': 'donorschoose', '_id': '6704b06c5160695103ed5aef60eb330c', '_type': 'donation', '_source': {'donationid': '6704b06c5160695103ed5aef60eb330c', 'projectid': 'e49094a309bdfe4bef2a57c2de227f9b', 'donor_acctid': '747d8aba0bcd86508f5ee21e65bd225f', 'cartid': '65e24809cdec5f4e3bfa45631dabfd50', 'donor_city': '', 'donor_state': 'PA', 'donor_zip': 190.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2015-01-20T15:11:07.294000', 'donation_to_project': 63.75, 'donation_optional_support': 11.25, 'donation_total': 75.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 't', 'is_teacher_referred': 't', 'giving_page_id': '8a36

2710000 / 6211956
2720000 / 6211956
2730000 / 6211956
2740000 / 6211956
2750000 / 6211956
2760000 / 6211956
2770000 / 6211956
2780000 / 6211956
2790000 / 6211956
2800000 / 6211956
doc: {'_index': 'donorschoose', '_id': '7361c911f295c0942ec9fc521afa6de2', '_type': 'donation', '_source': {'donationid': '7361c911f295c0942ec9fc521afa6de2', 'projectid': '75b4ca721c1728738d9867a2fa4925c2', 'donor_acctid': 'f016d149d1910994193078c4cce1c9eb', 'cartid': '', 'donor_city': '', 'donor_state': 'NY', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2013-06-11T00:30:25.236000', 'donation_to_project': 11.75, 'donation_optional_support': 2.07, 'donation_total': 13.82, 'donation_included_optional_support': 't', 'payment_method': 'promo_code_match', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 't', 'giving_page_id': '14f86102ea2e96eac30f6d77d79e6363'

3010000 / 6211956
3020000 / 6211956
3030000 / 6211956
3040000 / 6211956
3050000 / 6211956
3060000 / 6211956
3070000 / 6211956
3080000 / 6211956
3090000 / 6211956
3100000 / 6211956
doc: {'_index': 'donorschoose', '_id': '7fc148e410f88813615fdf5ef117b438', '_type': 'donation', '_source': {'donationid': '7fc148e410f88813615fdf5ef117b438', 'projectid': '2a50ef5891fa36ccd142f5313cb7ff0f', 'donor_acctid': '461f3ea4daeb41d07daff40ccd74b680', 'cartid': '5f86269f508a91c7533bb2f7814d2976', 'donor_city': '', 'donor_state': 'NY', 'donor_zip': 100.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2015-04-28T10:04:38.666000', 'donation_to_project': 30.0, 'donation_optional_support': 0.0, 'donation_total': 30.0, 'donation_included_optional_support': 'f', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 't', 'giving_page_id': '869be79

3310000 / 6211956
3320000 / 6211956
3330000 / 6211956
3340000 / 6211956
3350000 / 6211956
3360000 / 6211956
3370000 / 6211956
3380000 / 6211956
3390000 / 6211956
3400000 / 6211956
doc: {'_index': 'donorschoose', '_id': '8c22a7740236573f76b30c22fd59a6d4', '_type': 'donation', '_source': {'donationid': '8c22a7740236573f76b30c22fd59a6d4', 'projectid': '56dc9892a7e834fb18e4ae2df53338e1', 'donor_acctid': 'de254bb1f6d52d4f5f0659210005f52b', 'cartid': 'd16359597b1755265ff08c068589922c', 'donor_city': 'Pendleton', 'donor_state': 'IN', 'donor_zip': 460.0, 'is_teacher_acct': 't', 'donation_timestamp': '2014-06-30T15:16:51.807000', 'donation_to_project': 0.85, 'donation_optional_support': 0.15, 'donation_total': 1.0, 'donation_included_optional_support': 't', 'payment_method': 'paypal', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': 'ee

3610000 / 6211956
3620000 / 6211956
3630000 / 6211956
3640000 / 6211956
3650000 / 6211956
3660000 / 6211956
3670000 / 6211956
3680000 / 6211956
3690000 / 6211956
3700000 / 6211956
doc: {'_index': 'donorschoose', '_id': '987b693e3af451933661f964ba6c863c', '_type': 'donation', '_source': {'donationid': '987b693e3af451933661f964ba6c863c', 'projectid': '5f1a32caa6eca6d4ddf32113c463fffa', 'donor_acctid': 'f3d3ad3181723c84fd1df825ca7f995e', 'cartid': '924110c006119c6f74b77d90bd07abcc', 'donor_city': '', 'donor_state': 'FL', 'donor_zip': 327.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2015-12-28T23:15:59.316000', 'donation_to_project': 88.0, 'donation_optional_support': 12.0, 'donation_total': 100.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 't', 'is_teacher_referred': 't', 'giving_page_id': '5d4fc

3910000 / 6211956
3920000 / 6211956
3930000 / 6211956
3940000 / 6211956
3950000 / 6211956
3960000 / 6211956
3970000 / 6211956
3980000 / 6211956
3990000 / 6211956
4000000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'a4d82024db059bbb2f8ff5116e4ccad6', '_type': 'donation', '_source': {'donationid': 'a4d82024db059bbb2f8ff5116e4ccad6', 'projectid': '4b92ea7cf5f015dbd652827e2b3910dc', 'donor_acctid': 'f3213e76cf6162b8094d9aaabb531fc3', 'cartid': '2f186b9a390fe4bec6e43c251eb1141c', 'donor_city': '', 'donor_state': '', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2015-09-15T16:57:24.454000', 'donation_to_project': 8.5, 'donation_optional_support': 1.5, 'donation_total': 10.0, 'donation_included_optional_support': 't', 'payment_method': 'no_cash_received', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 't', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'giv

4210000 / 6211956
4220000 / 6211956
4230000 / 6211956
4240000 / 6211956
4250000 / 6211956
4260000 / 6211956
4270000 / 6211956
4280000 / 6211956
4290000 / 6211956
4300000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'b1344a2b5013c9f7f278a84cf54a6a3d', '_type': 'donation', '_source': {'donationid': 'b1344a2b5013c9f7f278a84cf54a6a3d', 'projectid': 'e34e0f2ecc7e6a07e085a43274fd6e73', 'donor_acctid': '32b31324b896249836a291aef2bb5ca2', 'cartid': 'ae3bacf7a66329a238090cccf4de8952', 'donor_city': 'Gainesville', 'donor_state': 'FL', 'donor_zip': 326.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2008-09-10T22:26:37.669000', 'donation_to_project': 85.0, 'donation_optional_support': 15.0, 'donation_total': 100.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_

4510000 / 6211956
4520000 / 6211956
4530000 / 6211956
4540000 / 6211956
4550000 / 6211956
4560000 / 6211956
4570000 / 6211956
4580000 / 6211956
4590000 / 6211956
4600000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'bd99253b95f64014d8b69476c2f7ac8a', '_type': 'donation', '_source': {'donationid': 'bd99253b95f64014d8b69476c2f7ac8a', 'projectid': 'ee859fabb7df24dd62dd54b4dae85cbf', 'donor_acctid': '84b75adba707c922c84c716eccac61ad', 'cartid': 'eb643cbfb5ff80907b24c37c3d2fa910', 'donor_city': 'Madison', 'donor_state': 'WI', 'donor_zip': 537.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2014-08-10T23:13:43.191000', 'donation_to_project': 17.0, 'donation_optional_support': 3.0, 'donation_total': 20.0, 'donation_included_optional_support': 't', 'payment_method': 'paypal', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', '

4810000 / 6211956
4820000 / 6211956
4830000 / 6211956
4840000 / 6211956
4850000 / 6211956
4860000 / 6211956
4870000 / 6211956
4880000 / 6211956
4890000 / 6211956
4900000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'c9f2aad0c611a50441c477d276708c23', '_type': 'donation', '_source': {'donationid': 'c9f2aad0c611a50441c477d276708c23', 'projectid': 'f7b95050160c4704bc668915697f7e4f', 'donor_acctid': 'ffb35f055e5a9187f83ca934fee4bc83', 'cartid': 'e8133821f97bb1159ef1159b6699e19d', 'donor_city': '', 'donor_state': '', 'donor_zip': '', 'is_teacher_acct': 't', 'donation_timestamp': '2016-01-31T19:57:51.675000', 'donation_to_project': 25.0, 'donation_optional_support': 0.0, 'donation_total': 25.0, 'donation_included_optional_support': 'f', 'payment_method': 'no_cash_received', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 't', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 't', 'giving_page_id': '5aae76

5110000 / 6211956
5120000 / 6211956
5130000 / 6211956
5140000 / 6211956
5150000 / 6211956
5160000 / 6211956
5170000 / 6211956
5180000 / 6211956
5190000 / 6211956
5200000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'd64bd2eca28895aabc3505836a9ffeaf', '_type': 'donation', '_source': {'donationid': 'd64bd2eca28895aabc3505836a9ffeaf', 'projectid': '3ad0fd632049ac494afb9ddfc1fa13a9', 'donor_acctid': '254ca33fd4f784b0b8d96e78702f9442', 'cartid': 'e72bd62211bbacb9dde973bec9894ea0', 'donor_city': '', 'donor_state': 'NC', 'donor_zip': 278.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2016-03-08T22:41:22.673000', 'donation_to_project': 42.5, 'donation_optional_support': 7.5, 'donation_total': 50.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'giv

5410000 / 6211956
5420000 / 6211956
5430000 / 6211956
5440000 / 6211956
5450000 / 6211956
5460000 / 6211956
5470000 / 6211956
5480000 / 6211956
5490000 / 6211956
5500000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'e2ab60a3813149051b7529765c89daab', '_type': 'donation', '_source': {'donationid': 'e2ab60a3813149051b7529765c89daab', 'projectid': '9c3dab1de4b67895aad996211c2dd795', 'donor_acctid': 'bbbc66bb1236ed7ad271936f060e06e3', 'cartid': 'b927fa804df12add59a8e4e3e4b7b99e', 'donor_city': '', 'donor_state': 'CA', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2012-01-11T17:42:30.997000', 'donation_to_project': 12.75, 'donation_optional_support': 2.25, 'donation_total': 15.0, 'donation_included_optional_support': 't', 'payment_method': 'no_cash_received', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 't', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '',

5710000 / 6211956
5720000 / 6211956
5730000 / 6211956
5740000 / 6211956
5750000 / 6211956
5760000 / 6211956
5770000 / 6211956
5780000 / 6211956
5790000 / 6211956
5800000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'ef054743b1f25d5aeab9a76e22df2116', '_type': 'donation', '_source': {'donationid': 'ef054743b1f25d5aeab9a76e22df2116', 'projectid': 'c1e56106d41d0cbf740b88a7e6f52c16', 'donor_acctid': 'ddf05927ed30b1076e2254235796ba9c', 'cartid': '', 'donor_city': 'West Berlin', 'donor_state': 'NJ', 'donor_zip': 80.0, 'is_teacher_acct': 'f', 'donation_timestamp': '2012-04-04T22:06:58.097000', 'donation_to_project': 158.47, 'donation_optional_support': 27.97, 'donation_total': 186.44, 'donation_included_optional_support': 't', 'payment_method': 'double_your_impact_match', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'givin

6010000 / 6211956
6020000 / 6211956
6030000 / 6211956
6040000 / 6211956
6050000 / 6211956
6060000 / 6211956
6070000 / 6211956
6080000 / 6211956
6090000 / 6211956
6100000 / 6211956
doc: {'_index': 'donorschoose', '_id': 'fb62e8f93553201539b25ad7bdcf77cb', '_type': 'donation', '_source': {'donationid': 'fb62e8f93553201539b25ad7bdcf77cb', 'projectid': 'a6e2c63824fa46c60886440ca6e5192b', 'donor_acctid': '7c5b55499e42127ee1f950c5d133622c', 'cartid': 'f077fc3352e1880e17ace3a73a320ef2', 'donor_city': '', 'donor_state': '', 'donor_zip': '', 'is_teacher_acct': 'f', 'donation_timestamp': '2008-12-14T15:42:32.306000', 'donation_to_project': 42.5, 'donation_optional_support': 7.5, 'donation_total': 50.0, 'donation_included_optional_support': 't', 'payment_method': 'creditcard', 'payment_included_acct_credit': 'f', 'payment_included_campaign_gift_card': 'f', 'payment_included_web_purchased_gift_card': 'f', 'payment_was_promo_matched': 'f', 'is_teacher_referred': 'f', 'giving_page_id': '', 'giving_p

### standard bulk 

In [None]:
### Index Data into Elasticsearch
print("Indexing bulk")
start = perf_counter()

# Stream every row of the merged frame through the standard bulk helper.
bulk_actions = read_data(data)
helpers.bulk(es, bulk_actions, index=index_name, doc_type=doc_name)

end = perf_counter()
print(end - start)

### Pool Execution
Run both of the following cells, in order.

In [None]:
# only run this if using the pool
print("Chunking for pool")
start = perf_counter()

# Number of chunks, which is also the number of worker processes used by the
# pool cell. This is a fixed value: the earlier
# `multiprocessing.cpu_count() - 1` assignment was always overwritten by it,
# so that dead store is gone. To size by CPU count instead, use
# `max(1, multiprocessing.cpu_count() - 1)` here.
num_partitions = 8

# Split the merged frame into num_partitions pieces.
chunks = np.array_split(data, num_partitions)
print('chunk count {}'.format(len(chunks)))

end = perf_counter()
print(end - start)


In [None]:
### Index Data into Elasticsearch - pool approach with `num_partitions` processes
from multiprocessing import Pool
from elasticsearch import Elasticsearch

def es_pool_func(aChunk):
    """Bulk-index one chunk of the data from inside a worker process.

    A new Elasticsearch client is created on every call instead of reusing
    the notebook-level one.
    """
    print('chunked es bulk : {}'.format(len(aChunk)))
    es = Elasticsearch()
    helpers.bulk(es,read_data(aChunk), index=index_name,doc_type=doc_name)

print("Indexing Chunked")
start = perf_counter()

# Exactly one pool is created, and only after es_pool_func has been defined.
# The `with` block ensures the worker processes are cleaned up even if
# something below raises.
with Pool(processes=num_partitions) as pool:
    # submit one task per chunk
    pending = [
        pool.apply_async(es_pool_func, args=(aChunk,)) for aChunk in chunks
    ]
    pool.close()
    pool.join()
    # apply_async hides worker exceptions until the result is read, so read
    # every result to surface any indexing failure instead of losing it.
    for result in pending:
        result.get()

end = perf_counter()
print(end - start)

# Playground cells used to try out various cluster APIs

In [None]:
# Issue an update-by-query against the index/type with no request body and a
# 30 second request timeout; the response is kept in `res` for inspection.
res = es.update_by_query(
    index=index_name,
    doc_type=doc_name,
    request_timeout=30.0,
)