In [None]:
import pandas as pd
import os
import openai
import concurrent.futures
import json

In [None]:
from openai import OpenAI

# Shared API client; the key comes from the OPENAI_API_KEY environment
# variable (the lookup yields None when the variable is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
def _scalar(json_type):
    """Return a fresh ``{"type": ...}`` node for a primitive JSON type."""
    return {"type": json_type}


def _nullable_string():
    """Return a fresh node that accepts either a string or null."""
    return {"type": ["string", "null"]}


def _unit(*allowed_units):
    """Return a fresh string node restricted to the given unit names."""
    return {"type": "string", "enum": list(allowed_units)}


def _record(fields):
    """Return an object node whose properties are ``fields`` (order preserved)."""
    return {"type": "object", "properties": fields}


def _array_of(item_node, description=None):
    """Return an array node over ``item_node``; the description key is only added when given."""
    node = {"type": "array", "items": item_node}
    if description is not None:
        node["description"] = description
    return node


# Value passed as ``response_format`` to the chat completions call: every
# property is an array, either of primitives or of small records.
json_schema = {
    "type": "json_schema",
    "json_schema": {
        "name": "pollution_data",
        "description": "Extracts relevant data from the 1871 Rivers Pollution Commission Report",
        "schema": _record({
            # --- location of the business ---
            "situated_on_watercourse": _array_of(
                _scalar("string"),
                "main watercourse that the business is situated on",
            ),
            "watercourse_is_tributary_of": _array_of(
                _nullable_string(),
                "main watercourse is a tributary of",
            ),
            "tributary_is_affluent_of": _array_of(
                _nullable_string(),
                "tributary is an affluent of",
            ),
            # --- size of the business ---
            "hands": _array_of(
                _scalar("integer"),
                "number of hands employed by business",
            ),
            "rateable_values": _array_of(
                _scalar("integer"),
                "rateable value of business",
            ),
            # --- state of the river ---
            "river_silted": _array_of(
                _scalar("boolean"),
                "whether the river has silted up",
            ),
            "status_of_river_comments": _array_of(
                _scalar("string"),
                "comments on the state of the river past and future",
            ),
            # --- water and fuel ---
            "water_supplies": _array_of(_record({
                "source_of_water": _scalar("string"),
                "quantity_from_source": _scalar("integer"),
            })),
            "water_used_for_condensing": _array_of(
                _scalar("boolean"),
                "whether water is used for condensing or not",
            ),
            "total_yearly_water_consumption": _array_of(
                _scalar("integer"),
                "yearly water consumption reported by business",
            ),
            "coal_consumption": _array_of(
                _scalar("integer"),
                "yearly coal consumption in tons",
            ),
            # --- tenancy ---
            "landlord": _array_of(_record({
                "business_has_landlord": _scalar("boolean"),
                "name_of_landlord": _nullable_string(),
                "power_rented_from_landlord": _scalar("boolean"),
                "water_supplied_by_landlord": _scalar("boolean"),
            })),
            "affected_by_floods": _array_of(
                _scalar("boolean"),
                "whether the business is affected by floods",
            ),
            # --- dyes and bleaching ---
            "uses_dyes": _array_of(
                _scalar("boolean"),
                "whether the business uses dyes",
            ),
            "uses_bleaching_materials": _array_of(
                _scalar("boolean"),
                "whether the business uses bleaching materials",
            ),
            "dye_wares": _array_of(_record({
                "dye_ware": _scalar("string"),
                "value": _scalar("integer"),
                "weight_or_quantity": _scalar("integer"),
                "unit of measurement": _scalar("string"),
            })),
            "dye_wares_total_weight": _array_of(_scalar("integer")),
            "ashes_use": _array_of(
                _scalar("string"),
                "what is done with the ashes produced by the business",
            ),
            # --- output and refuse ---
            "yearly_goods_produced": _array_of(_record({
                "type_of_good_produced": _scalar("string"),
                "goods_value": _scalar("integer"),
                "goods_weight": _scalar("integer"),
                "weight_measured_in": _unit("lbs", "tons", "sacks"),
            })),
            "soap_suds": _array_of(_record({
                "treated_for_recovery_of_grease": _scalar("boolean"),
                "money_gained": _scalar("integer"),
            })),
            "liquid_refuse": _array_of(_record({
                "as_soapsuds": _scalar("boolean"),
                "liquid_refuse_quantity": _scalar("integer"),
                "quantity_measured_in": _unit("lbs", "tons", "sacks"),
            })),
            "depositing_tanks": _array_of(_scalar("boolean")),
            "waste_wool_weight": _array_of(_record({
                "weight": _scalar("integer"),
                "quantity_measured_in": _unit("lbs", "tons", "sacks", "cwt"),
            })),
            # --- materials consumed ---
            "bleaching_materials": _array_of(_record({
                "bleaching_material_type": _scalar("string"),
                "quantity": _scalar("integer"),
                "quantity_measured_in": _unit("lbs", "tons", "sacks", "cwt"),
                "value": _scalar("integer"),
            })),
            "oils": _array_of(_record({
                "oil_type": _scalar("string"),
                "quantity": _scalar("integer"),
                "quantity_measured_in": _unit("lbs", "tons", "sacks", "cwt"),
                "value": _scalar("integer"),
            })),
            "other chemicals": _array_of(_record({
                "chemical": _scalar("string"),
                "quantity": _scalar("integer"),
                "quantity_measured_in": _unit("lbs", "tons", "sacks", "cwt"),
                "value": _scalar("integer"),
            })),
            # --- miscellaneous ---
            "nominal_horse_power": _array_of(_scalar("integer")),
            "excrements": _array_of(_scalar("string")),
            "excrements_used_for_manure": _array_of(_scalar("boolean")),
            "suggestions": _array_of(_scalar("string")),
        }),
    },
}


In [None]:
def _is_blank_cell(cell_text):
    """Return True when a cell holds nothing to extract: None, NaN, or a blank string."""
    if cell_text is None:
        return True
    # NaN is the only float that does not compare equal to itself; pandas
    # uses it for missing CSV cells.
    if isinstance(cell_text, float) and cell_text != cell_text:
        return True
    return isinstance(cell_text, str) and not cell_text.strip()


def extract_information(cell_text):
    """Extract structured data from one reply text via the chat completions API.

    Args:
        cell_text: Text of a single reply. ``None``, NaN and empty or
            whitespace-only strings are treated as "nothing to extract".

    Returns:
        The parsed JSON response from the model, or ``{}`` when the cell is
        blank or when the request or the JSON parsing raises.
    """
    # Skip the API round-trip for empty cells: otherwise the prompt would be
    # sent with "Text: nan" / "Text: None" and the model asked to extract
    # data from nothing.
    if _is_blank_cell(cell_text):
        return {}

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
            {"role": "system", "content": "You are a helpful assistant with excellent skills in parsing structured data."},
            {"role": "user", "content": f"""From the text, extract the information in JSON format according to the schema provided. For some of these properties, there will be no information available, and you can return a null value. For some of the materials mentioned, weights or values are grouped together, e.g. 'soda, ropes and sacking, 750 tons'. In cases like this, treat 'soda, ropes and sacking' as a single item with its corresponding weight or value. For monetary values, "l." at the end of a number stands for "£".
             Text: {cell_text}"""}
            ],
            max_tokens=2048,
            response_format=json_schema)

        response_data = completion.choices[0].message.content
        # json.loads raises on malformed content; that is handled by the
        # best-effort except below like any other failure.
        json_data = json.loads(response_data)

        return json_data

    except Exception as e:
        # Deliberate best-effort: one failing row must not abort the batch,
        # so the error is reported and an empty record is returned.
        print(f"Error processing cell: {e}")
        return {}


# Load the source table; its 'reply' column is what gets sent for extraction.
df = pd.read_csv("input.csv", encoding="utf-8")

def process_row(cell_text):
    """Run the extraction for a single cell's text and return its result unchanged."""
    extracted = extract_information(cell_text)
    return extracted

# Fan the 'reply' column out over a thread pool; Executor.map hands results
# back in the same order as the inputs, so results[i] belongs to row i.
with concurrent.futures.ThreadPoolExecutor() as pool:
    results = [row_result for row_result in pool.map(process_row, df['reply'])]

output_file = 'output.json'

# Write every extracted record as one pretty-printed JSON array, keeping
# non-ASCII characters as-is.
with open(output_file, mode='w', encoding='utf-8') as json_out:
    json.dump(results, json_out, indent=4, ensure_ascii=False)

print("Data successfully extracted and saved as JSON.")