# Environment

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

from langchain_openai import ChatOpenAI
from langchain.chains import create_extraction_chain

# Get the data

In [2]:
# Read the tweet export into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/trump.csv')
df.head(n=5)

Unnamed: 0,text,created_at,id_str
0,.@FoxNews is no longer the same. We miss the g...,05-19-2020 01:59:49,1262563582086184970
1,So the so-called HHS Whistleblower was against...,05-18-2020 14:44:21,1262393595560067073
2,.....mixed about even wanting us to get out. T...,05-18-2020 14:39:40,1262392415513690112
3,Wow! The Front Page @washingtonpost Headline r...,05-18-2020 12:47:40,1262364231288197123
4,MAGA crowds are bigger than ever! https://t.co...,05-18-2020 12:26:37,1262358931982123008


# OpenAI

In [3]:
# Extraction schema: every extracted record may carry a person's name and a
# Twitter handle, both declared as plain strings.
schema = {
    "properties": {
        "person": {"type": "string"},
        "twitter_handle": {"type": "string"},
    }
}

# Build the extraction chain; it is only invoked in later cells.
# temperature=0 asks the model for its least random sampling so that re-running
# the notebook over the same tweets gives results that are as repeatable as the
# API allows.
llm = ChatOpenAI(temperature=0)
chain = create_extraction_chain(schema, llm)

# Single

In [4]:
# Work on a small sample: the first five tweets.
dfx = df.head(n=5)

In [5]:
# Call the extraction chain once per sampled tweet and show the records it
# returns under the 'text' key of the result.
for tweet in dfx['text']:
    result = chain.invoke(tweet)
    response = result['text']
    print(response)

[{'person': 'Roger Ailes', 'twitter_handle': ''}]
[{'person': 'HHS Whistleblower', 'twitter_handle': '@NorahODonnell'}]
[{'person': 'us', 'twitter_handle': 'N/A'}, {'person': 'we', 'twitter_handle': 'N/A'}]
[{'person': 'Obama', 'twitter_handle': ''}, {'person': 'Front Page', 'twitter_handle': '@washingtonpost'}]
[{'person': 'MAGA', 'twitter_handle': 'zKxoTBw02v'}]


# Batch

In [6]:
# Re-sample for the batch run: only the first four tweets this time.
dfx = df.head(n=4)

In [7]:
# Accumulators filled by the batch loop: extracted person names and handles.
list_of_names, list_of_twitter_handles = [], []

In [8]:
# Walk the sampled tweets in slices of `batch_size` and collect every person
# name and Twitter handle reported by the extraction chain.
#
# Note: the chain is still invoked once per tweet; the slicing only groups the
# work and does not send several tweets in a single request.
batch_size = 2

# Values that stand for "nothing extracted". "" and "N/A" both show up in the
# chain output printed in the "Single" section; "None" was already filtered.
placeholder_values = ("", "None", "N/A")

for batch_start in range(0, len(dfx['text']), batch_size):
    batch_tweets = dfx['text'].iloc[batch_start:batch_start + batch_size]

    for tweets in batch_tweets:
        response = chain.invoke(tweets)['text']

        # The result is expected to be a list of dicts; skip anything else.
        if not isinstance(response, list):
            continue

        # One tweet can yield several records, so visit all of them instead of
        # keeping only the first and silently dropping the rest.
        for response_dict in response:
            if not isinstance(response_dict, dict):
                continue

            # .get() yields None when the key is absent from the record.
            person_name = response_dict.get('person')
            twitter_handle = response_dict.get('twitter_handle')

            if person_name is not None and person_name not in placeholder_values:
                list_of_names.append(person_name)
            if twitter_handle is not None and twitter_handle not in placeholder_values:
                list_of_twitter_handles.append(twitter_handle)

In [9]:
# Show what the batch loop gathered: names on the first line, handles on the second.
print(list_of_names, list_of_twitter_handles, sep='\n')

['Roger Ailes', 'HHS Whistleblower', 'us', 'Front Page']
['@NorahODonnell', '@washingtonpost']
