# Extraction Using OpenAI functions

In this notebook I process and structure a WhatsApp conversation from the Morningside Community WhatsApp group from 22 November 2023.

In [None]:
# Report the version of the interpreter this kernel is running.
# (`!python` is resolved by the shell and may be a different installation.)
import sys
sys.version

In [None]:
!pip show openai

In [1]:
# Set up the environment and load the required modules
import os
import openai
import re
import json

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment;
# the return value is assigned to `_` so the cell does not echo it.
_ = load_dotenv(find_dotenv()) # read local .env file
# Indexing os.environ (rather than .get) raises KeyError straight away if the key is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser, JsonKeyOutputFunctionsParser

## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information. 
In this notebook, extraction can be considered as a preprocessing step as we don't interrogate the complete chat text, but rather a filtered version of it.

In [3]:
# Set up the Person and Information classes. Both inherit from pydantic's BaseModel.
# NOTE: the class docstring and every Field description below are copied into the
# OpenAI function schema (see the converted output further down), so they are part of
# what the model is told - edit them deliberately.
class Person(BaseModel):
    """Information about a person."""
    # Sender of the message
    name: str = Field(description="person's name")
    # Short summary of what the sender wrote
    message: str = Field(description="a brief summary of the person's message") 
    # Declared as a plain string: the three labels are requested in the description
    # but are not enforced by the schema.
    sentiment: str = Field(description="sentiment of message, should be `pos`, `neg`, or `neutral`")
 

In [4]:
# Top-level schema handed to the model: a single `people` list of Person records.
# The docstring below becomes the function's description in the OpenAI function schema.
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [5]:
# Convert the pydantic model to an OpenAI function definition.
# This cell only displays the resulting schema to illustrate what happens under the hood;
# the value is not stored here.
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'title': 'Information',
  'description': 'Information to extract.',
  'type': 'object',
  'properties': {'people': {'title': 'People',
    'description': 'List of info about people',
    'type': 'array',
    'items': {'title': 'Person',
     'description': 'Information about a person.',
     'type': 'object',
     'properties': {'name': {'title': 'Name',
       'description': "person's name",
       'type': 'string'},
      'message': {'title': 'Message',
       'description': "a brief summary of the person's message",
       'type': 'string'},
      'sentiment': {'title': 'Sentiment',
       'description': 'sentiment of message, should be `pos`, `neg`, or `neutral`',
       'type': 'string'}},
     'required': ['name', 'message', 'sentiment']}}},
  'required': ['people']}}

In [6]:
# Declare the language model used by the extraction chain.
# temperature=0 asks for the least random sampling so that repeated runs stay comparable.
model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")

In [7]:
# Set up the extraction functions and the extraction model.
# The Information schema is the only function offered, and function_call names it
# explicitly so the model is asked to answer through that function.
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [8]:
# Define the extraction prompt: a fixed system instruction plus the batch text as {input}.
# Note: the backslash continuation keeps the next line's leading indentation inside the
# system message, so the two sentences are separated by several spaces.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. \
    You can extract partial info."),
    ("human", "{input}")
])

In [9]:
# Set up the extraction chain: prompt -> function-calling model -> parser.
# The parser is keyed on "people", so the chain yields the list of person records
# rather than the whole Information object.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [10]:
# Number of chat messages sent to the model per extraction call
batch_size = 10  # Adjust batch size as needed

Process the chat in batches

The script also checks each line to determine whether it's the start of a new message (using the regular expression pattern). If a line doesn't match the pattern, it's considered a continuation of the previous message. This way, the script can handle messages that span multiple lines, preventing them from being truncated or split incorrectly between batches.

In [None]:
def parse_line(line):
    """Split one raw chat message into its date, time, sender and message text.

    Args:
        line: A raw message in the form "[YYYY/MM/DD, HH:MM:SS] sender: text".
            The text may span several lines when continuation lines have been
            appended to the message.

    Returns:
        A dict with the keys "date", "time", "sender" and "message", or None
        when the input does not start with the expected header.
    """
    pattern = r"\[(\d{4}/\d{2}/\d{2}), (\d{2}:\d{2}:\d{2})\] ([^:]+): (.*)"
    # re.DOTALL lets the final group run across newlines; without it everything
    # after the first line of a multi-line message would be dropped.
    match = re.match(pattern, line, re.DOTALL)
    if match is None:
        return None
    return {
        "date": match.group(1),
        "time": match.group(2),
        "sender": match.group(3).strip(),
        "message": match.group(4).strip()
    }

def process_batch(batch):
    """Parse every non-blank entry of a batch, keeping only those that parse.

    Args:
        batch: Iterable of raw message strings.

    Returns:
        List of the dicts returned by parse_line, in input order; entries that
        are blank or that parse_line rejects are left out.
    """
    parsed_messages = []
    for raw_message in batch:
        if not raw_message.strip():
            continue  # whitespace-only entry
        parsed = parse_line(raw_message)
        if parsed:
            parsed_messages.append(parsed)
    return parsed_messages

aggregated_results = []

# Header that marks the first line of a message, e.g. "[2023/11/22, 08:15:00]".
# Built once here instead of on every line of the file.
message_start = re.compile(r"\[\d{4}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]")


def _extract_batch(raw_messages):
    """Parse one batch of raw messages and append the extraction output to aggregated_results."""
    parsed_batch = process_batch(raw_messages)
    if not parsed_batch:
        # Nothing in this batch could be parsed, so there is nothing to send to the model.
        return
    aggregated_results.extend(extraction_chain.invoke({"input": parsed_batch}))


with open('community_chat.txt', 'r', encoding='utf-8') as file:
    batch = []
    previous_line = ""  # the message currently being assembled
    for line in file:
        if message_start.match(line):
            # Start of a new message: the message assembled so far is complete
            if previous_line:
                batch.append(previous_line)
            previous_line = line
        else:
            # Continuation of the previous message
            previous_line += line

        if len(batch) >= batch_size:
            _extract_batch(batch)
            batch = []

    if previous_line:  # The last message is still pending when the file ends
        batch.append(previous_line)

    if batch:  # Process the final, possibly short, batch
        _extract_batch(batch)

# Convert the aggregated_results to JSON
json_string = json.dumps(aggregated_results, indent=4)

# Save the JSON string output to a file
with open('chat.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_string)


In [14]:
# Optional: in case of a timeout, save whatever has been collected so far.
# aggregated_results is filled batch by batch, so it holds the results of every
# batch that completed before the interruption.
# Convert the aggregated_results to JSON 
json_string = json.dumps(aggregated_results, indent=4)

# Save the JSON string output to a file
# (note: this cell writes 'community_chat.json', whereas the batch cell writes 'chat.json')
with open('community_chat.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_string)

In [13]:
# Display the results as structured data.
# json_string is parsed back into Python objects (a list of dicts) which the
# analysis cells below iterate over.
community_json_data=json.loads(json_string)
community_json_data

[{'name': 'Jenny Wertheimer',
  'message': 'created this group',
  'sentiment': 'neutral'},
 {'name': 'Scott', 'message': 'added you', 'sentiment': 'neutral'},
 {'name': 'Scott', 'message': '👍', 'sentiment': 'pos'},
 {'name': 'Michelle & Giuseppe',
  'message': 'Scott added Michelle & Giuseppe',
  'sentiment': 'neutral'},
 {'name': 'Giuseppe',
  'message': 'Hi Scott please add Mish 0824565294',
  'sentiment': 'neutral'},
 {'name': 'John S',
  'message': 'Ref the "noise disturbances" from the Park and surrounds, I will 100% support on behalf of Morningside Villas - it is not only the Park, but also "inconsiderate" neighboring properties as flagged the past while......',
  'sentiment': 'neg'},
 {'name': 'Scott',
  'message': "Thanks All, I will let you know when I've set up the meeting if anyone else wants to attend..",
  'sentiment': 'pos'},
 {'name': 'John S',
  'message': 'Thanks Scott - based on complaints as raised lately, it is not so much the Park as Private Properties that do not

In [16]:
# Calculate sentiment percentages

# Count the messages in each sentiment category, then express each count as a
# percentage of the total number of messages.

# Counting the sentiments. The three expected labels are pre-seeded so that they
# always appear in the result; any other label the model returned (the schema only
# requires a string) gets its own bucket instead of raising a KeyError.
sentiment_counts = {'pos': 0, 'neutral': 0, 'neg': 0}
for item in community_json_data:
    label = item.get('sentiment', 'unknown')
    sentiment_counts[label] = sentiment_counts.get(label, 0) + 1

# Total number of messages
total_messages = len(community_json_data)

# Calculating the percentages (all zero when there are no messages at all)
sentiment_percentages = {
    sentiment: (count / total_messages) * 100 if total_messages else 0.0
    for sentiment, count in sentiment_counts.items()
}

sentiment_percentages



{'pos': 27.384324834749762,
 'neutral': 52.40793201133145,
 'neg': 20.20774315391879}

In [28]:
# Who posted the most often?
# Tally the number of messages per sender, then pick the sender with the largest tally.

# Counting the number of messages each person sent
message_counts = {}
for item in community_json_data:
    message_counts[item['name']] = message_counts.get(item['name'], 0) + 1

# Identifying the person who posted the most often.
# default=None and .get(..., 0) match the positive/negative tallies and give
# (None, 0) instead of an exception when there are no messages.
most_frequent_poster = max(message_counts, key=message_counts.get, default=None)
most_frequent_count = message_counts.get(most_frequent_poster, 0)

most_frequent_poster, most_frequent_count



('Scott', 60)

In [29]:
# Who posted the most positive messages, by number?
# Tally the messages labelled 'pos' per sender, then pick the sender with the largest tally.

# Counting the number of positive messages each person sent
positive_message_counts = {}
for entry in community_json_data:
    if entry['sentiment'] != 'pos':
        continue
    sender = entry['name']
    positive_message_counts[sender] = positive_message_counts.get(sender, 0) + 1

# Identifying the person who posted the most positive messages
# (None with a count of 0 when nobody posted a positive message)
most_positive_poster = max(positive_message_counts, key=positive_message_counts.get, default=None)
most_positive_count = positive_message_counts.get(most_positive_poster, 0)

most_positive_poster, most_positive_count



('Michelle & Giuseppe', 18)

In [30]:
# Who posted the most negative messages, by number?
# Tally the messages labelled 'neg' per sender, then pick the sender with the largest tally.

# Counting the number of negative messages each person sent
negative_message_counts = {}
for entry in community_json_data:
    if entry['sentiment'] != 'neg':
        continue
    sender = entry['name']
    negative_message_counts[sender] = negative_message_counts.get(sender, 0) + 1

# Identifying the person who posted the most negative messages
# (None with a count of 0 when nobody posted a negative message)
most_negative_poster = max(negative_message_counts, key=negative_message_counts.get, default=None)
most_negative_count = negative_message_counts.get(most_negative_poster, 0)

most_negative_poster, most_negative_count



('Michelle & Giuseppe', 17)

In [None]:
# Optional: Print page by page

def paginate(iterable, page_size):
    """Yield the items of `iterable` as consecutive lists of `page_size` items.

    The final list may be shorter than `page_size`; nothing is yielded for an
    empty iterable.
    """
    current = []
    for entry in iterable:
        current += [entry]
        if len(current) != page_size:
            continue
        yield current
        current = []  # fresh list, so pages already handed out are not altered
    if len(current) > 0:
        yield current

# Example usage: print the extracted records ten at a time.
# This cell is interactive - input() blocks until Enter is pressed, so it will stall
# an unattended "Run All".
for page in paginate(community_json_data, 10):  # Adjust page size as needed
    print(json.dumps(page, indent=4))
    input("Press Enter to continue to the next page...")  # Pause between pages


Now ask questions of the data

In [17]:
# Helper that sends a single prompt to the chat model and returns the reply text
def get_completion(prompt,model="gpt-4-1106-preview"):
    """Send `prompt` as one user message and return the content of the first choice.

    Args:
        prompt: Text placed in the single user message.
        model: Name of the chat model to call.

    Returns:
        The "content" of the first choice in the response.
    """
    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 interface of the
    # openai package - confirm against the installed version.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # degree of randomness
    )
    first_choice = response.choices[0]
    return first_choice.message["content"]

In [25]:
# Ask the user to input a question.
# Interactive: input() blocks until something is typed, so the answer below depends
# on what was entered in this run.
question = input("Please ask your question: ")

Please ask your question: who posted positive messages the nost often?


In [26]:
# Define the question-answering prompt.
# - The backslash continuation is preceded by a space so the joined words do not run together.
# - The data is serialised with json.dumps so the model receives real JSON (double quotes),
#   and it is wrapped in triple backticks, which is what the instruction text announces.
# - ensure_ascii=False keeps emoji and accented characters readable in the prompt.
# NOTE: this rebinds `prompt`, which earlier held the extraction ChatPromptTemplate;
# re-run the extraction prompt cell before rebuilding the extraction chain.
prompt = f"""
The JSON data delimited with triple backticks is taken from a WhatsApp channel for \
a neighbourhood community group.
Answer the question delimited with triple quotes by referring to the JSON data.

Question: '''{question}'''

Community responses: ```{json.dumps(community_json_data, ensure_ascii=False)}```
"""

In [None]:
# Get the completion: send the prompt built above with the helper's default model
# and print the model's reply.
response = get_completion(prompt)
print(response)

In [None]:
# Optional: Define a function to print all the available info for a given name
def print_info_for_name(name_to_find, json_data):
    """Print every field of each record whose 'name' equals `name_to_find`.

    Args:
        name_to_find: Exact name to look for.
        json_data: Iterable of record dicts, each with a 'name' key.

    Returns:
        None. Matching records are printed, one header block per record;
        nothing is printed when no record matches.
    """
    # Iterate over the argument, not the notebook-level list, so the function
    # works on whatever data the caller passes in.
    for record in json_data:
        if record['name'] == name_to_find:
            print(f"Information for {name_to_find}:")
            print("----------------------------------------")
            for key, value in record.items():
                print(f"{key}: {value}")


In [None]:
# Example usage: look one person up in the extracted records.
# The records live in community_json_data; no variable named json_data exists in this notebook.
name_to_search = "James Drage"
print_info_for_name(name_to_search, community_json_data)
