In [71]:
import sys
from os.path import abspath, join, dirname, exists
# Smart progress bar reader...
import tqdm
import pandas as pd
import json
from sodapy import Socrata
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

In [72]:
DATASET_PATH = 'C:/Users/Dean Truong/Jupyter/Elasticsearch/contractors.json'

In [63]:
def download_dataset():
# Downloads the public dataset if not locally downloaded 
# and returns the number of rows are in the .json file.

    if not exists(DATASET_PATH):
        print("JSON file does not exist...")
        print("Downloading dataset from data.lacity.org...")
        
        client = Socrata("data.lacity.org","8YzE0NUTN2pl4gdOcY5fAquLi",username="dtruong8@toromail.csudh.edu",password="Helloworld123.")
        
        results = client.get("yv23-pmwf",select="distinct contractors_business_name", where="applicant_relationship = 'Contractor'", order="contractors_business_name")
        
        # convert to data frame
        df = pd.DataFrame.from_records(results)
        
        df.to_json(r'C:/Users/Dean Truong/Jupyter/Elasticsearch/contractors.json',orient='records')
        print("Dataset downloaded...")
        
        return df.shape[0]

In [58]:
def create_index(client):
    """Creates an index in Elasticsearch if one isn't already there."""
    client.indices.create(
        index="contractors",
        body={
            "settings": {"number_of_shards": 1},
            "mappings": {
                "properties": {
                    "names": {"type": "text"},
                }
            },
        },
        ignore=400,
    )
        
        

In [96]:
def generate_actions():
# Reads the file through csv.DictReader() and for each row
# yields a single document. This function is passed into the bulk()
# helper to create many documents in sequence.
    with open(DATASET_PATH, mode="r") as f:
        data = json.load(f)
        f.close()
        i = 0
        for names in data:
            doc = {
                "_id": i + 1,
                "name": names
            }
            i = i + 1
            print(doc)

In [59]:
def main():
    es = Elasticsearch(['localhost:9200'])
    # print(es.info()) # verification
    
    number_of_docs = download_dataset()
    
    print("Creating index an index...")
    create_index(es)
    
    print("Indexing documents...")
    progress = tqdm.tqdm(unit="docs", total=number_of_docs)
    sucesses = 0
    for ok, action in streaming_bulk(
        es = es, index="contractors", actions=generate_actions(),
    ):
    

In [97]:
generate_actions()

{'_id': 2, 'name': {'contractors_business_name': '007 HEATING & AIR'}}
{'_id': 3, 'name': {'contractors_business_name': '100 PERCENT ELECTRIC'}}
{'_id': 4, 'name': {'contractors_business_name': '101 CONSTRUCTION'}}
{'_id': 5, 'name': {'contractors_business_name': '101 PLUMBING REPAIR SERVICE'}}
{'_id': 6, 'name': {'contractors_business_name': '101 ROOFING & CONSTRUCTION'}}
{'_id': 7, 'name': {'contractors_business_name': '101 SHEET METAL'}}
{'_id': 8, 'name': {'contractors_business_name': '1030 N MOUNTAIN AVENUE #174'}}
{'_id': 9, 'name': {'contractors_business_name': '110 PRO HVAC INC'}}
{'_id': 10, 'name': {'contractors_business_name': '12178 VAN NUYS BLVD'}}
{'_id': 11, 'name': {'contractors_business_name': '1 - 2 - 3 PLUMBING & ROOTER'}}
{'_id': 12, 'name': {'contractors_business_name': '123 RESTORATION SPECIALISTS INC'}}
{'_id': 13, 'name': {'contractors_business_name': '126 NEW CONSTRUCTION INC'}}
{'_id': 14, 'name': {'contractors_business_name': '129 BUILDERS INC'}}
{'_id': 15, 