# Libraries / Configurations

In [1]:
import os
import openai
import json
import pandas as pd

from dotenv import load_dotenv, find_dotenv
from tqdm.auto import tqdm

# Load .env variables
load_dotenv(find_dotenv())

openai.organization  = os.getenv('OPENAI_ORGANIZATION_ID')
openai.api_key = os.getenv('OPENAI_API_KEY')

# Execution for Book Summaries

Books to extract features:

In [2]:
book_summaries = [
    "In 'Pride and Prejudice' by Jane Austen, Elizabeth Bennet and Mr. Darcy navigate societal pressures and find love.",
    "J.K. Rowling's 'Harry Potter and the Sorcerer's Stone' introduces us to a young wizard discovering his identity.",
    "In 'To Kill a Mockingbird' by Harper Lee, Scout Finch witnesses racial injustice in her hometown.",
    "Bilbo Baggins embarks on an unexpected adventure in J.R.R. Tolkien's 'The Hobbit'.",
    "'Brave New World' by Aldous Huxley depicts a future society driven by technological advancements."
]

Extract features with GPT:

In [3]:
prompt = """
You are provided with a text in triple backticks. Extract the following details from the text along with their specified data types:

- "Title" (string)
- "Author" (string)
- "Main Character" (string)
- "Plot Summary" (string)

Ensure the response is in JSON format and only includes information explicitly mentioned within the text.


Text:
```
{text}
```

Output:
"""

In [17]:
results = []
for text in tqdm(book_summaries):
    response = openai.Completion.create(
      model='gpt-3.5-turbo-instruct',
      prompt=prompt.format(text=text),
      max_tokens=128,
      temperature=0
    )

    r = json.loads(response.choices[0].text.strip('```'))
    results.append(r)

results

[{'Title': 'Pride and Prejudice',
  'Author': 'Jane Austen',
  'Main Character': 'Elizabeth Bennet and Mr. Darcy',
  'Plot Summary': 'Elizabeth Bennet and Mr. Darcy navigate societal pressures and find love.'},
 {'Title': "Harry Potter and the Sorcerer's Stone",
  'Author': 'J.K. Rowling',
  'Main Character': 'Harry Potter',
  'Plot Summary': 'Introduces us to a young wizard discovering his identity.'},
 {'Title': 'To Kill a Mockingbird',
  'Author': 'Harper Lee',
  'Main Character': 'Scout Finch',
  'Plot Summary': 'Scout Finch witnesses racial injustice in her hometown.'},
 {'Title': 'The Hobbit',
  'Author': 'J.R.R. Tolkien',
  'Main Character': 'Bilbo Baggins',
  'Plot Summary': 'Bilbo Baggins embarks on an unexpected adventure'},
 {'Title': 'Brave New World',
  'Author': 'Aldous Huxley',
  'Main Character': 'Not specified',
  'Plot Summary': 'The novel depicts a future society driven by technological advancements.'}]

Display results as dataframe:

In [18]:
df = pd.DataFrame(results)
df['Document'] = book_summaries
df

Unnamed: 0,Title,Author,Main Character,Plot Summary
0,Pride and Prejudice,Jane Austen,Elizabeth Bennet and Mr. Darcy,Elizabeth Bennet and Mr. Darcy navigate societ...
1,Harry Potter and the Sorcerer's Stone,J.K. Rowling,Harry Potter,Introduces us to a young wizard discovering hi...
2,To Kill a Mockingbird,Harper Lee,Scout Finch,Scout Finch witnesses racial injustice in her ...
3,The Hobbit,J.R.R. Tolkien,Bilbo Baggins,Bilbo Baggins embarks on an unexpected adventure
4,Brave New World,Aldous Huxley,Not specified,The novel depicts a future society driven by t...


Save results:

In [10]:
df.to_csv('../data/book_summaries.csv', index=False)

# Execution for Resumes/CVs

Resumes/CVs to extract features:

In [11]:
documents = [
    "Emma Smith worked at FinBank as an Analyst from 2012-2014.",
    "Robert Brown has been a Senior Designer at ArtHouse since 2019.",
    "Olivia Johnson was a Research Fellow at BioLab between 2015 and 2017.",
    "Michael Williams was employed by HealthCare Inc. as a Nurse from 2009 to 2013.",
    "Sophia Taylor has experience as a Sales Manager at AutoCorp from 2016-2020."
]

Extract features with GPT:

In [12]:
prompt = """
You are provided with a text in triple backticks. Extract the following details from the text along with their specified data types:

- "Name" (string)
- "Company" (string)
- "Start Year" (integer)
- "End Year" (integer)
- "Position" (string)

Ensure the response is in JSON format and only includes information explicitly mentioned within the text.


Text:
```
{text}
```

Output:
"""

In [13]:
results = []
for text in tqdm(documents):
    response = openai.Completion.create(
      model='gpt-3.5-turbo-instruct',
      prompt=prompt.format(text=text),
      max_tokens=128,
      temperature=0
    )

    r = json.loads(response.choices[0].text.strip('```'))
    results.append(r)

results

  0%|          | 0/5 [00:00<?, ?it/s]

[{'Name': 'Emma Smith',
  'Company': 'FinBank',
  'Start Year': 2012,
  'End Year': 2014,
  'Position': 'Analyst'},
 {'Name': 'Robert Brown',
  'Company': 'ArtHouse',
  'Start Year': 2019,
  'End Year': None,
  'Position': 'Senior Designer'},
 {'Name': 'Olivia Johnson',
  'Company': 'BioLab',
  'Start Year': 2015,
  'End Year': 2017,
  'Position': 'Research Fellow'},
 {'Name': 'Michael Williams',
  'Company': 'HealthCare Inc.',
  'Start Year': 2009,
  'End Year': 2013,
  'Position': 'Nurse'},
 {'Name': 'Sophia Taylor',
  'Company': 'AutoCorp',
  'Start Year': 2016,
  'End Year': 2020,
  'Position': 'Sales Manager'}]

Save results in CSV file:

In [15]:
df = pd.DataFrame(results)
df['Document'] = documents
df

Unnamed: 0,Name,Company,Start Year,End Year,Position,Document
0,Emma Smith,FinBank,2012,2014.0,Analyst,Emma Smith worked at FinBank as an Analyst fro...
1,Robert Brown,ArtHouse,2019,,Senior Designer,Robert Brown has been a Senior Designer at Art...
2,Olivia Johnson,BioLab,2015,2017.0,Research Fellow,Olivia Johnson was a Research Fellow at BioLab...
3,Michael Williams,HealthCare Inc.,2009,2013.0,Nurse,Michael Williams was employed by HealthCare In...
4,Sophia Taylor,AutoCorp,2016,2020.0,Sales Manager,Sophia Taylor has experience as a Sales Manage...


In [16]:
df.to_csv('../data/resumes.csv', index=False)

# Execution for News Articles

News articles to extract features:

In [17]:
documents = [
    "A fire broke out at a warehouse in New York on June 10th.",
    "On August 3rd, a tornado struck Oklahoma, leaving 5 dead and 20 injured.",
    "Queen Elizabeth celebrated her 95th birthday on April 21st.",
    "A breakthrough in cancer research was announced on September 15th.",
    "A new species of frog was discovered in the Amazon rainforest on January 23rd."
]

Extract features with GPT:

In [18]:
prompt = """
You are provided with a text in triple backticks. Extract the following details from the text along with their specified data types:

- "Date" (date)
- "Location" (string)
- "Event" (string)
- "Impact" (string)

Ensure the response is in JSON format and only includes information explicitly mentioned within the text.


Text:
```
{text}
```

Output:
"""

In [19]:
results = []
for text in tqdm(documents):
    response = openai.Completion.create(
      model='gpt-3.5-turbo-instruct',
      prompt=prompt.format(text=text),
      max_tokens=128,
      temperature=0
    )

    r = json.loads(response.choices[0].text.strip('```'))
    results.append(r)

results

  0%|          | 0/5 [00:00<?, ?it/s]

[{'Date': 'June 10th',
  'Location': 'New York',
  'Event': 'fire',
  'Impact': 'warehouse'},
 {'Date': 'August 3rd',
  'Location': 'Oklahoma',
  'Event': 'tornado',
  'Impact': '5 dead and 20 injured'},
 {'Date': 'April 21st',
  'Location': 'N/A',
  'Event': "Queen Elizabeth's 95th birthday",
  'Impact': 'N/A'},
 {'Date': 'September 15th',
  'Location': 'Not mentioned',
  'Event': 'Breakthrough in cancer research',
  'Impact': 'Not mentioned'},
 {'Date': 'January 23rd',
  'Location': 'Amazon rainforest',
  'Event': 'discovered',
  'Impact': 'new species of frog'}]

Save results in CSV file:

In [20]:
df = pd.DataFrame(results)
df['Document'] = documents
df

Unnamed: 0,Date,Location,Event,Impact,Document
0,June 10th,New York,fire,warehouse,A fire broke out at a warehouse in New York on...
1,August 3rd,Oklahoma,tornado,5 dead and 20 injured,"On August 3rd, a tornado struck Oklahoma, leav..."
2,April 21st,,Queen Elizabeth's 95th birthday,,Queen Elizabeth celebrated her 95th birthday o...
3,September 15th,Not mentioned,Breakthrough in cancer research,Not mentioned,A breakthrough in cancer research was announce...
4,January 23rd,Amazon rainforest,discovered,new species of frog,A new species of frog was discovered in the Am...


In [21]:
df.to_csv('../data/news.csv', index=False)