# Environment

In [1]:
import os
import pandas as pd
import json
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

from openai import OpenAI


# Get the data

In [2]:
# Load the tweet archive from the project's data folder and preview the first rows
tweets_csv = '../data/trump.csv'
df = pd.read_csv(tweets_csv)
df.head()

Unnamed: 0,text,created_at,id_str
0,.@FoxNews is no longer the same. We miss the g...,05-19-2020 01:59:49,1262563582086184970
1,So the so-called HHS Whistleblower was against...,05-18-2020 14:44:21,1262393595560067073
2,.....mixed about even wanting us to get out. T...,05-18-2020 14:39:40,1262392415513690112
3,Wow! The Front Page @washingtonpost Headline r...,05-18-2020 12:47:40,1262364231288197123
4,MAGA crowds are bigger than ever! https://t.co...,05-18-2020 12:26:37,1262358931982123008


In [3]:
# Work on a small slice only, so that each run issues just a handful of API requests
sample_size = 10
sample_df = df.head(sample_size)

# OpenAI / Prompt Engineering

In [4]:
# Create the OpenAI API client used for the chat-completion requests below.
# No credentials are passed explicitly -- presumably the API key is picked up
# from the environment populated by load_dotenv() above; confirm against the
# openai package documentation.
client = OpenAI()

In [5]:
# Pull the tweet bodies out of the sample as a plain Python list
text_column = sample_df['text']
tweets = text_column.tolist()

# Show the extracted values, then the container type, one per line
print(tweets, type(tweets), sep='\n')

['.@FoxNews is no longer the same. We miss the great Roger Ailes. You have more anti-Trump people by far than ever before. Looking for a new outlet! https://t.co/jXxsF0flUM', 'So the so-called HHS Whistleblower was against HYDROXYCHLOROQUINE. Then why did he make and sign an emergency use authorization? @NorahODonnell said “He shared his concerns with a reporter.” In other words he LEAKED. A dumb @60Minutes hit job on a grandstanding Never Trumper!', '.....mixed about even wanting us to get out. They make a fortune $$$ by having us stay and except at the beginning we never really fought to win. We are more of a police force than the mighty military that we are especially now as rebuilt. No I am not acting impulsively!', 'Wow! The Front Page @washingtonpost Headline reads “A BOOST IN TESTS BUT LACK OF TAKERS.” We have done a great job on Ventilators Testing and everything else. Were left little by Obama. Over 11 million tests and going up fast. More than all countries in the world combi

In [6]:
# The instruction sent with every request; only the user message changes per tweet.
system_message = {
    "role": "system",
    "content": 'You are a helpful assistant that extracts names, twitter handles, locations from tweets. For every tweet, list the names, twitter handles, locations, and the tweet. Use the json format in the output.',
}

# One chat-completion request per tweet; the raw reply text is collected in order.
list_responses = []
for tweet in tweets:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
        messages=[
            system_message,
            {"role": "user", "content": tweet},
        ],
    )
    # Keep only the message text of the first choice (a JSON document as a string)
    list_responses.append(response.choices[0].message.content)

In [7]:
# Display the raw JSON strings collected from the model, one entry per tweet
list_responses

['{\n  "names": ["Roger Ailes"],\n  "twitter_handles": ["@FoxNews"],\n  "locations": [],\n  "tweet": ".@FoxNews is no longer the same. We miss the great Roger Ailes. You have more anti-Trump people by far than ever before. Looking for a new outlet! https://t.co/jXxsF0flUM"\n}',
 '{\n  "names": ["Norah O\'Donnell"],\n  "twitter_handles": ["@NorahODonnell"],\n  "locations": [],\n  "tweet": "So the so-called HHS Whistleblower was against HYDROXYCHLOROQUINE. Then why did he make and sign an emergency use authorization? @NorahODonnell said \\n“He shared his concerns with a reporter.” In other words he LEAKED. A dumb @60Minutes hit job on a grandstanding Never Trumper!"\n}',
 '{\n  "names": [],\n  "twitter_handles": [],\n  "locations": [],\n  "tweet": ".....mixed about even wanting us to get out. They make a fortune $$$ by having us stay and except at the beginning we never really fought to win. We are more of a police force than the mighty military that we are especially now as rebuilt. No 

# Extracting the values

In [8]:
# Empty accumulators, one per entity type extracted from the model responses
list_of_names, list_of_twitter_handles, list_of_locations = [], [], []

In [9]:
# Gather every extracted name, twitter handle and location across all responses.
#
# The accumulators are (re)created here so that re-running this cell starts from
# empty lists instead of appending a second copy of every entity to the results
# of a previous run.
list_of_names = []
list_of_twitter_handles = []
list_of_locations = []

for json_str in list_responses:
    # Each response is a JSON object serialized as a string; parse it to a dict.
    # json.loads raises json.JSONDecodeError on a malformed response, which is
    # left to propagate so a bad reply is noticed rather than silently skipped.
    json_dict = json.loads(json_str)

    # A key may be missing or null in a given response; both are treated as
    # "no entries". Extending with an empty list is a no-op, so no length
    # check is needed.
    list_of_names.extend(json_dict.get('names') or [])
    list_of_twitter_handles.extend(json_dict.get('twitter_handles') or [])
    list_of_locations.extend(json_dict.get('locations') or [])


In [10]:
# Report everything that was collected, duplicates included
print(f"Names: {list_of_names}")
print(f"Twitter Handles: {list_of_twitter_handles}")
print(f"Locations: {list_of_locations}")

Names: ['Roger Ailes', "Norah O'Donnell", "Norah O'Donnell", 'Joe Biden']
Twitter Handles: ['@FoxNews', '@NorahODonnell', '@washingtonpost', '@60Minutes', '@SecAzar', '@NorahODonnell']
Locations: []


In [11]:
# De-duplicate each collection by turning it into a set, then show the result
for collected in (list_of_names, list_of_twitter_handles, list_of_locations):
    print(set(collected))

{'Joe Biden', "Norah O'Donnell", 'Roger Ailes'}
{'@SecAzar', '@60Minutes', '@FoxNews', '@NorahODonnell', '@washingtonpost'}
set()
