In [None]:
from eventclf.v2.models.event_llm.event_llm import EventLLM, LLMModels
from eventclf.v2.models.event_llm.prompts import EVENT_ATTENTENCE_TEMPLATE
from eventclf.v2.data_processing.data_processing import _clean_text

In [None]:
import pandas as pd 
from sklearn.metrics import f1_score, accuracy_score, precision_score,recall_score

In [None]:
import os 
from dotenv import load_dotenv
import numpy as np
import torch

In [None]:
# Load variables from a .env file into the process environment; HF_ACCESS_TOKEN
# is read from os.environ later in this notebook when the model is created.
load_dotenv()

# Load Data

In [None]:
# Read the processed train/test CSVs of every event into a nested mapping:
# data_dict[event]["train_data"] and data_dict[event]["test_data"] hold DataFrames.
data_lists = ["farnborough", "rugby"]
data_dict = {
    event_name: {
        f"{split}_data": pd.read_csv(
            f"../../../../data/processed_data/{event_name}_{split}.csv"
        )
        for split in ("train", "test")
    }
    for event_name in data_lists
}

# Prompts

In [None]:
# Prompt metadata for the Farnborough event: its display name, a one-sentence
# description, and four worked examples (two "Yes", two "No") that are injected
# into the prompt for the few-shot runs.
data_dict['farnborough']['event'] = "Farnborough International Airshow 2024"
data_dict['farnborough']['event_description'] = "An industry convention for those in the Aerospace, aviation and defence industry to showcase advancements and innovations in those areas"
data_dict['farnborough']['examples'] = """
learning from theses examples classify the post:

EXAMPLE 1:
Post: Excellent flying seen today, always a thrill seeing the new E-156, very loud and very fast
Response: Yes 

EXAMPLE 2: 
Post:Succesful first day of @FIAFarnborough, signed a deal with @UserHandle on fuel emmision standards
Response: Yes 

EXAMPLE 3: 
Post: Cant believe how amazing it was seeing the F-14 aicrafts #putneyairshow
Response: No

EXAMPLE 4: 
Post: Farnborough Airshow facing a wave of protests around support for Qatar
Response: No
"""

In [None]:
# Prompt metadata for the rugby event: its display name, a one-sentence
# description, and four worked examples (two "Yes", two "No") that are injected
# into the prompt for the few-shot runs.
# NOTE(review): unlike the Farnborough examples, this block ends with a
# "_________" separator line and has no blank line after the intro sentence —
# confirm whether the two prompts are meant to differ in layout.
data_dict['rugby']['event'] = "Scottish Rugby Autumn Internationals Scotland Vs New Zealand"
data_dict['rugby']['event_description'] = "A rugby match played by Scotland against New Zealand at Murrayfield stadium"
data_dict['rugby']['examples'] = """
learning from theses examples classify the post:
EXAMPLE 1:
Post: Cannot believe the queues at murrayfield, going to miss the rugby at this rate
Response: Yes 

EXAMPLE 2: 
Post:Such a heartwarming dedication to to Doddy in the stadium
Response: Yes 

EXAMPLE 3: 
Post: We dont need to be giving away fouls like that @Scottishrugby, PLAY BETTER!!!! 
Response: No

EXAMPLE 4: 
Post: Love to see it!! The all blacks smashing england. DISTRUCTION
Response: No
_________
"""

In [None]:
# For every event, derive the cleaned post texts (X_*) and the label arrays (y_*)
# from the loaded train/test frames and store them next to the raw data.
for event_dict in data_dict.values():
    # Texts first, then labels, so the keys are inserted in the order
    # X_train, X_test, y_train, y_test.
    for split in ("train", "test"):
        raw_posts = event_dict[f"{split}_data"]['text']
        event_dict[f"X_{split}"] = [_clean_text(text, []) for text in raw_posts]
    for split in ("train", "test"):
        event_dict[f"y_{split}"] = np.array(event_dict[f"{split}_data"]['y'])

## Set up model

In [None]:
# Prompt template with {event}, {event_description}, {examples} and {post}
# placeholders. The first three are supplied through the prompt_inputs dict built
# later in this notebook; {post} is presumably filled per post by
# EventLLM.predict — TODO confirm.
# NOTE: this assignment rebinds EVENT_ATTENTENCE_TEMPLATE, replacing the object
# imported from eventclf.v2.models.event_llm.prompts at the top of the notebook.
EVENT_ATTENTENCE_TEMPLATE = """
Your task is to determine whether a social media posts suggests that the author or another person attended a particular event.

{event} is {event_description}

{examples}

Instructions:
   - Yes: Consider the event description provided. Classify as "Yes" if it includes mention or implication that the author participated in or will participate in, or was physically present at the event.
   - No: Classify as "No" if the social media post does not suggest the author attended the event, it is about the event but does not indicate past or future attendance.
   
Ensure the post references the specific event and indicates the poster being physically present
Return only either "Yes" or "No"


Post: {post}

Response:
"""

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Build the LLM wrapper. The access token is taken from the HF_ACCESS_TOKEN
# environment variable rather than being hard-coded in the notebook.
event_llm = EventLLM(
    access_token=os.environ['HF_ACCESS_TOKEN'],
    batch_size=5,
    model=LLMModels.llama_3_1_8b_instruct.value,
    device=device,
)

In [None]:
# Evaluate every event with and without worked examples, on both splits, and
# collect one metrics record per (event, strategy, split) combination.
res_list = []
for event, event_dict in data_dict.items():
    # First pass uses the event's worked examples (few shot); the second passes
    # an empty string (zero shot).
    for examples in (event_dict['examples'], ""):
        prompt_dict = dict(
            event=event_dict['event'],
            event_description=event_dict['event_description'],
            examples=examples,
        )
        for run in ('train', 'test'):
            res = event_llm.predict(
                prompt_template=EVENT_ATTENTENCE_TEMPLATE,
                prompt_inputs=prompt_dict,
                posts=event_dict[f"X_{run}"],
            )
            y_true = event_dict[f"y_{run}"]
            # A NaN prediction is replaced by the opposite of the true label, so
            # it is always scored as an error by the metrics below.
            res_clean = np.array([
                abs(1 - y_true[pos]) if np.isnan(pred) else pred
                for pos, pred in enumerate(res)
            ])
            res_dict = {
                "event": event,
                "stratagy": "few shot" if examples else "zero shot",
                "run": run,
                # Raw predictions are stored as returned, NaN entries included.
                "res": res,
                "f1": f1_score(y_true, res_clean),
                "precision": precision_score(y_true, res_clean),
                "recall": recall_score(y_true, res_clean),
                "accuracy": accuracy_score(y_true, res_clean),
            }
            res_list.append(res_dict)

In [None]:
import json

In [None]:
# Persist the collected metric records next to the notebook.
# NOTE(review): if any stored prediction is NaN, json.dump emits a bare NaN
# token, which strict JSON parsers reject — confirm downstream readers cope.
with open("llm_results.json", mode='w') as results_file:
    json.dump(res_list, results_file)

In [None]:
# Display every collected record (each one includes the full raw prediction list).
res_list

In [None]:
# NOTE: `event_dict` is not assigned in this cell; it is the variable left bound
# by an earlier loop over data_dict, so this cell inspects whichever event that
# loop visited last.
examples = event_dict['examples']

# Zero-shot prompt inputs: the examples slot is deliberately left empty.
prompt_dict = dict(
    event=event_dict['event'],
    event_description=event_dict['event_description'],
    examples="",
)

In [None]:
# Split to use in the following ad-hoc cells (selects the "X_test" / "y_test" entries).
run = "test"

In [None]:
# One-off prediction pass on the selected split, this time passing explicit
# generation settings (max_new_tokens, temperature) to predict.
res = event_llm.predict(
    prompt_template=EVENT_ATTENTENCE_TEMPLATE,
    prompt_inputs=prompt_dict,
    posts=event_dict[f"X_{run}"],
    max_new_tokens=20,
    temperature=0.1,
)

In [None]:
# Inspect the wrapper's private `_out` attribute — presumably the raw model
# generations from the most recent predict call; TODO confirm.
event_llm._out

In [None]:
# Accuracy on the selected split. NaN predictions are mapped to 2 so that they
# can never equal a label (assuming labels are 0/1) and therefore count as errors.
accuracy_score(event_dict[f"y_{run}"],np.nan_to_num(res,nan=2))

In [None]:
# False positives: entries of `event_llm._out` (presumably the raw generations of
# the last predict call — confirm) where the prediction is 1 but the label is 0.
# The labels must come from the same split (`run`) that produced `res`; using
# y_train against predictions made on another split gives a mask of the wrong
# length and compares unrelated posts.
pd.Series(event_llm._out)[(np.array(res) == 1) & (event_dict[f"y_{run}"] == 0)].tolist()

In [None]:
# False negatives: cleaned posts where the prediction is 0 but the label is 1.
# Both the posts and the labels are taken from the split named by `run`, i.e. the
# split `res` was predicted on, so the boolean mask lines up with the posts.
pd.Series(event_dict[f"X_{run}"])[(np.array(res) == 0) & (event_dict[f"y_{run}"] == 1)].tolist()

In [None]:
# Positions of the training labels that equal 0.
np.where(event_dict['y_train']==0)

In [None]:
# Display the full list of cleaned training posts for the current event_dict.
event_dict['X_train']

In [None]:
np.is