In [0]:
%pip install Pdfplumber langchain
# Restart the Python process so the packages installed by the line above are
# the ones picked up by the imports in the following cells.
dbutils.library.restartPython()

In [0]:
import os
from pyspark.sql.functions import substring_index
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, StringType
import pandas as pd
from pyspark.sql.functions import col

In [0]:
%sql
-- Target table for the extracted text chunks.
-- `id` is an identity column (generated unless a value is supplied) and the
-- Delta change data feed is switched on for the table.
CREATE TABLE IF NOT EXISTS datascience_dev.default.docs_text (
  id   BIGINT GENERATED BY DEFAULT AS IDENTITY,
  text STRING
)
TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [0]:
%sql
-- Bookkeeping table: one row per file name that has been recorded as handled.
-- The Delta change data feed is switched on for the table.
CREATE TABLE IF NOT EXISTS datascience_dev.default.docs_track (
  file_name STRING
)
TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [0]:
# Volume directory that is listed below
directory_path = "/Volumes/datascience_dev/default/raw-data-hackathon"

# Gather the full path of every entry returned by the directory listing
file_paths = []
for entry in dbutils.fs.ls(directory_path):
    file_paths.append(entry.path)

# Single-column ("value") DataFrame of paths, reduced to the part after the
# last "/" and exposed as `file_name`
paths_df = spark.createDataFrame(file_paths, "string")
file_name_column = substring_index("value", "/", -1).alias("file_name")
df = paths_df.select(file_name_column)

# Print the resulting file names
df.show()

+---------------+
|      file_name|
+---------------+
|       ebs2.pdf|
|      homey.pdf|
|immigration.pdf|
|     ocwage.pdf|
|     tenure.pdf|
+---------------+



In [0]:
pdf_volume_path = "/Volumes/datascience_dev/default/raw-data-hackathon"  # Specify the path to the PDF volume directory

# File names already recorded in the tracking table; these are skipped below.
processed_files = {
    row["file_name"]
    for row in spark.sql(
        "SELECT DISTINCT file_name FROM datascience_dev.default.docs_track"
    ).collect()
}

# Process only new PDF files.
# - Entries without a .pdf suffix (other file types, sub-directories) are
#   skipped because pdfplumber.open() cannot read them.
# - The names are sorted so the order of the extracted text, and therefore of
#   the chunks, does not depend on the arbitrary order of os.listdir().
new_files = sorted(
    file
    for file in os.listdir(pdf_volume_path)
    if file.lower().endswith(".pdf") and file not in processed_files
)

# Collect the text of every page in a list and join once at the end instead of
# growing one string page by page.
page_texts = []

for file_name in new_files:
    pdf_path = os.path.join(pdf_volume_path, file_name)

    with pdfplumber.open(pdf_path) as pdf:
        for pdf_page in pdf.pages:
            # Depending on the pdfplumber version, extract_text() can return
            # None for a page without extractable text; treat that as empty
            # text rather than failing on the string concatenation.
            page_texts.append(pdf_page.extract_text() or "")

# Every page's text is preceded by a newline (same layout as before).
all_text = "".join("\n" + page_text for page_text in page_texts)

# Split the combined text into overlapping chunks. Lengths are measured in
# characters because length_function is the built-in len.
length_function = len

splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=length_function,
)
chunks = splitter.split_text(all_text)

In [0]:

@pandas_udf("array<string>")
def get_chunks(dummy: pd.Series) -> pd.Series:
    """Return the notebook-level ``chunks`` list once for every input row.

    Args:
        dummy: Placeholder input column; its values are ignored and only its
            length is used.

    Returns:
        A Series with the same number of rows as ``dummy`` in which every
        element is the full ``chunks`` list.
    """
    # A Series-to-Series pandas UDF has to return exactly as many rows as it
    # receives, so the list is repeated per input row instead of being
    # returned as a fixed one-row Series.
    return pd.Series([chunks] * len(dummy))

# Register the UDF under the name used from SQL
spark.udf.register("get_chunks_udf", get_chunks)

<function __main__.get_chunks(dummy)>

In [0]:
%sql
-- Turn the array returned by get_chunks_udf into one row per element and
-- append those rows to the `text` column of docs_text.
INSERT INTO datascience_dev.default.docs_text (text)
SELECT
  explode(get_chunks_udf('dummy')) AS text;

num_affected_rows,num_inserted_rows
514,514


In [0]:
# Record exactly the files that were selected for processing (`new_files`)
# rather than a separate, earlier directory listing: a file that shows up
# between the two listings would otherwise be chunked but never tracked, and
# its chunks would be inserted again on the next run.
# An explicit schema is given so an empty `new_files` still yields a valid
# (empty) DataFrame.
new_files_df = spark.createDataFrame(
    [(name,) for name in new_files],
    "file_name string",
)
new_files_df.createOrReplaceTempView("temp_table")  # Expose the names to SQL

# Insert only the rows that do not exist in the target table, so re-running
# this cell does not create duplicate tracking rows.
spark.sql("""
    INSERT INTO datascience_dev.default.docs_track
    SELECT file_name FROM temp_table
    WHERE NOT EXISTS (
        SELECT 1 FROM datascience_dev.default.docs_track
        WHERE temp_table.file_name = datascience_dev.default.docs_track.file_name
    )
""")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Read the chunk table back and render it in the notebook
docs_text_table = "datascience_dev.default.docs_text"
df_docs_text = spark.table(docs_text_table)
display(df_docs_text)

id,text
1,"For release 10:00 a.m. (ET) Thursday, September 19, 2024 USDL-24-1921 Technical information: (202) 691-6199 • ncsinfo@bls.gov • www.bls.gov/ebs Media contact: (202) 691-5902 • pressoffice@bls.gov EMPLOYEE BENEFITS IN THE UNITED STATES – MARCH 2024 Medical care benefits were available to 72 percent of private industry workers and 89 percent of state and local government workers in March 2024, the U.S. Bureau of Labor Statistics reported today. Forty- three percent of private industry workers had access to dental benefits and 60 percent of state and local government workers had access. Twenty-eight percent of private industry workers had access to vision benefits while 39 percent of state and local government workers had access. (See chart 1 and table 2.) For private industry workers participating in medical plans with single coverage, the employer share of premiums was 80 percent and the employee share was 20 percent. State and local government workers"
2,"premiums was 80 percent and the employee share was 20 percent. State and local government workers participating in medical plans with single coverage saw 86 percent of premiums covered by employers and 14 percent by employees. For family coverage, employers paid 68 percent of premiums for private industry workers and 71 percent for state and local government workers. Thirty-two percent of premiums were funded by the employee for family coverage among private industry workers. For state and local government workers, employees covered 29 percent of such plans. (See chart 2 and tables 3 and 4.) Chart 1. Percent of workers with access to Chart 2. Share of medical care premiums, March healthcare benefits, March 2024 2024 100% 100% 80% 80% 60% 60% 40% 40% 20% 20% 0% Private State and Private State and 0% industry local industry local workers government workers government workers workers Single coverage Family coverage Private industry workers"
3,"40% 40% 20% 20% 0% Private State and Private State and 0% industry local industry local workers government workers government workers workers Single coverage Family coverage Private industry workers State and local government workers Employer Employee Seventy-nine percent of private industry workers and 92 percent of state and local government workers had access to paid sick leave. Eighty-one percent of private industry workers and 68 percent of state and 1 local government workers had access to paid holidays. Paid personal leave is defined as a leave benefit, used for reasons important to the individual employee, but not otherwise provided by other forms of leave (for example, sick leave, vacations, and holidays). This benefit was available to 49 percent of private industry workers and 62 percent of state and local government workers. (See chart 3 and table 6.) For private industry workers in establishments with 1 to 99 employees, the average number of annual"
4,"For private industry workers in establishments with 1 to 99 employees, the average number of annual paid vacation days for workers was 10 days after 1 year of employment, 14 days after 5 years, 16 days after 10 years, and 17 days after 20 years. For private industry workers in establishments with 500 or more employees, the average number of annual paid vacation days for workers was 14 days after 1 year of employment, 18 days after 5 years, 21 days after 10 years, and 24 days after 20 years. (See chart 4.) Chart 3. Percent of workers with access to select Chart 4. Average number of annual paid vacation paid leave benefits by ownership group, March days by service requirement and establishment 2024 size, March 2024 30 100% 80% 20 60% 10 40% 20% 0 0% 1 to 99 500 or more 1 to 99 500 or more Paid sick Leave Paid holidays Paid personal workers workers workers workers leave Private industry workers State and local Private industry workers government workers After 1 year After 5 years"
5,"Paid sick Leave Paid holidays Paid personal workers workers workers workers leave Private industry workers State and local Private industry workers government workers After 1 year After 5 years State and local government workers In the West Census region, 53 percent of private industry workers and 81 percent of state and local government workers had access to life insurance. Long-term disability insurance was available to 34 percent of private industry workers and 48 percent of state and local government workers in this region. Thirty-six percent of private industry workers and 30 percent of state and local government workers had access to short-term disability insurance in the West Census region. (See chart 5 and table 5.) Chart 5. Percent of workers with access to insurance benefits by census region, March 2024 Long-term disability Short-term disability Life Long-term disability Short-term disability Life Long-term disability Short-term disability Life Long-term disability"
6,"Long-term disability Short-term disability Life Long-term disability Short-term disability Life Long-term disability Short-term disability Life Long-term disability Short-term disability Life 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% tseW tsewdiM htuoS tsaehtroN Private industry workers State and local government workers 2 Highlights of employer-sponsored benefits Occupational group • Access to medical care benefits ranged from 52 percent for service occupations to 94 percent for management, business, and financial occupations among civilian workers. • Eighty-three percent of teachers had access to medical care benefits and 60 percent participated among civilian workers. • Among state and local government workers, 92 percent of teachers had access to medical care benefits and the take-up rate for these benefits was 74 percent. Work status • Eighty-nine percent of full-time civilian workers had access to medical care benefits and 26"
7,"benefits and the take-up rate for these benefits was 74 percent. Work status • Eighty-nine percent of full-time civilian workers had access to medical care benefits and 26 percent of part-time workers had access to medical care benefits. The take-up rate was 66 percent for full-time workers and 46 percent for part-time workers. • Ninety-nine percent of full-time state and local government workers had access to retirement benefits and 89 percent participated, while 43 percent of part-time workers had access to retirement benefits and 37 percent participated. • In the private industry, 87 percent of full-time workers had access to medical care benefits and the take-up rate was 65 percent. Twenty-six percent of part-time workers had access to these benefits and the take-up rate was 45 percent. Industry group • Eighty-five percent of workers in goods-producing industries had access to medical care benefits"
8,"benefits and the take-up rate was 45 percent. Industry group • Eighty-five percent of workers in goods-producing industries had access to medical care benefits and 70 percent of workers in service-providing industries had access among private industry workers. • Within education and health services, 75 percent of workers had access to medical care benefits and 44 percent of workers participated in such benefits for private industry workers. • Eighty-nine percent of state and local government workers in service-providing industries had access to medical care benefits and the take-up rate was 75 percent. Establishment size • Access to medical care benefits among all civilian workers ranged from 56 percent for establishments with less than 50 workers to 91 percent for establishments with 500 workers or more. • Take-up rates among all civilian workers were 60 percent for establishments with less than 50"
9,"establishments with less than 50 workers to 91 percent for establishments with 500 workers or more. • Take-up rates among all civilian workers were 60 percent for establishments with less than 50 workers, 60 percent for establishments with 50 to 99 workers, 68 percent for establishments with 100 or more workers, 66 percent for establishments with 100 to 499 workers, and 70 percent for establishments with 500 workers or more. Census area • Medical care benefits access among private industry workers by Census region was 73 percent in the Northeast, 71 percent in the Midwest, 71 percent in the South, and 74 percent in the West. Take-up rates for medical care benefits ranged from 61 percent in the South to 66 percent in the Midwest. • Breaking these numbers down further by Census division, access to medical care benefits in the Pacific division was 76 percent for private industry workers. The access rate was 68 percent for workers in the East South Central division. 3 TECHNICAL NOTE"
10,"Pacific division was 76 percent for private industry workers. The access rate was 68 percent for workers in the East South Central division. 3 TECHNICAL NOTE Estimates in this release are from the National Compensation Survey (NCS), conducted by the U.S. Department of Labor, Bureau of Labor Statistics (BLS). The NCS provides comprehensive measures of compensation cost levels and trends and also provides benefits incidence estimates on the percentage of workers with access to and participating in employer-provided benefit plans. Employee Benefits data: The Employee Benefits in the United States, March 2024 includes additional details on the coverage, costs, and provisions of employer-sponsored benefits, and will be published shortly after this news release. See www.bls.gov/ebs/publications/annual-benefits-summary.htm for the latest benefits publications. Historical estimates are also accessible in Excel format at"
