In [None]:
import pandas as pd 
import re
import openai 
from transformers import pipeline 
from sklearn.metrics.pairwise import cosine_similarity 
from sentence_transformers import SentenceTransformer 
import hashlib 
import text_lloom.workbench as wb
import os
import yake 
import pypandoc 
from pathlib import Path 

from utils import turn_level_annotation, referee_report_annotation

# Set TOKENIZERS_PARALLELISM to "false" for this process before any tokenizer
# is used. NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning -- confirm against the tokenizers documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Paths of the conversation-log CSV files to process; fill in before running.
files = []

# Each CSV must provide the `time`, `session_id`, `role` and `content` columns,
# which are combined into the message identifier below.
dataframes = []
for csv_path in files:
    conversation = pd.read_csv(csv_path)
    # Keep each row's position within its own file as an explicit column.
    conversation = conversation.reset_index(names='conversation_order')
    # Per-message identifier: SHA-256 hex digest of the concatenated fields.
    # Assumes all four fields are non-null strings -- a NaN in any of them
    # makes the concatenation NaN and `.encode()` fail.
    conversation['content_id'] = (
        conversation['time']
        + conversation['session_id']
        + conversation['role']
        + conversation['content']
    ).apply(lambda x: hashlib.sha256(x.encode()).hexdigest())
    dataframes.append(conversation)

if not dataframes:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which does not tell the reader what to fix.
    raise ValueError(
        "`files` is empty: list at least one conversation CSV before running this cell."
    )

# ignore_index=True gives the combined frame unique row labels; without it
# every file contributes its own 0..n-1 labels and the index contains duplicates.
df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Paths of the referee reports to annotate; fill in before running.
# The report-annotation cell reads each file as docx via pypandoc and uses the
# second '+'-separated part of the file name as the username.
report_files = []

In [None]:
# Read the OpenAI key from the environment rather than hard-coding it in the
# notebook, so the secret is never saved or committed with the file.
openai_api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not openai_api_key:
    # Fail here instead of on the first API request with an authentication error.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting the kernel."
    )
# Write the stripped value back so anything that reads the key from the
# environment sees exactly what the client below uses.
os.environ["OPENAI_API_KEY"] = openai_api_key
client = openai.OpenAI(api_key=openai_api_key)
# Model identifier handed to the annotation helpers together with `client`.
model = 'gpt-4o-mini-2024-07-18'

In [None]:
# Sentiment classifier: model and tokenizer are loaded from the same checkpoint,
# and truncation=True / max_length=512 are passed through to the pipeline.
model_path = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model=model_path,
    tokenizer=model_path,
    truncation=True,
    max_length=512,
)

# Sentence encoder used later to embed interviewer and user messages.
embedding_model = SentenceTransformer(
    'all-MiniLM-L6-v2'
)

### Word Count 

In [None]:
# A "word" is a maximal run of non-whitespace characters.
_non_whitespace_run = re.compile(r'\S+')


def _count_words(text):
    """Return how many non-whitespace runs ``text`` contains."""
    return len(_non_whitespace_run.findall(text))


df['word_count'] = df['content'].apply(_count_words)

### Time Between Responses 

In [None]:
# Parse the timestamps into a temporary `datetime` column.
df = df.assign(datetime=pd.to_datetime(df['time']))
# Gap to the previous row of the same session (NaT for a session's first row).
gap_since_previous = df.groupby('session_id')['datetime'].diff()
# Store the gap in minutes and discard the temporary column again.
df = df.assign(
    time_spent=gap_since_previous.dt.total_seconds() / 60
).drop(columns=['datetime'])

### Annotation 

In [None]:
annotated_data = []
# Conversation history is kept per session. A single shared list would pass
# turns from earlier sessions (and other files) as context for later sessions.
history_by_session = {}
for row in df.to_dict('records'):
    past_messages = history_by_session.setdefault(row['session_id'], [])
    if row['role'] == 'user':
        # Annotate the user turn against the turns seen so far in its session.
        annotation = turn_level_annotation(client, model, past_messages, row['content'])
        row.update(annotation)
    # The history stores the same dict object that goes into the output, so
    # earlier user turns carry their annotation fields when reused as context.
    past_messages.append(row)
    annotated_data.append(row)

# Rows that were not annotated lack the annotation keys and get NaN there.
df = pd.DataFrame(annotated_data)

### Sentiment Analysis

In [None]:
# Numeric encoding of the classifier's labels; any other label maps to NaN.
sentiment_scores = {'positive': 1, 'neutral': 0, 'negative': -1}
# Classify every message on its own and keep the label of the first result.
predicted_labels = df['content'].apply(lambda text: sentiment_analyzer(text)[0]['label'])
df['sentiment'] = predicted_labels.map(sentiment_scores)

### Topic Coverage 

Use LLooM to induce discussion topics (concepts) from the user messages and score each message against them.

In [None]:
# Only the user-authored turns are handed to LLooM.
user_turns = df.loc[df['role'] == 'user']
# `content` holds the text to analyse; `content_id` identifies each document.
l = wb.lloom(df=user_turns, text_col="content", id_col='content_id')

In [None]:
score_df = await l.gen_auto(
    max_concepts=5, 
    seed="Discussion points for a peer review of an academic finance article", 
    n_synth=1, 
    debug=False
)

### Keywords

In [None]:
def extract_keywords(text):
    """Return the keywords YAKE extracts from ``text`` (extractor built with ``top=10``).

    Only the keyword strings are returned; the score paired with each keyword
    is discarded.
    """
    scored_keywords = yake.KeywordExtractor(top=10).extract_keywords(text)
    return [keyword for keyword, _score in scored_keywords]

# One keyword list per message.
df['keywords'] = df['content'].apply(extract_keywords)

### Consecutive Semantic Similarity 

Measure the semantic similarity between each user response and the interviewer message that most recently precedes it.

In [None]:
annotated_data = []
# Most recent interviewer message per session. Tracking it per session keeps a
# user turn from being compared with a message from a different conversation.
last_interviewer_msg_by_session = {}
for row in df.to_dict('records'):
    session_id = row['session_id']
    if row['role'] == 'user':
        last_interviewer_msg = last_interviewer_msg_by_session.get(session_id)
        if last_interviewer_msg is None:
            # No interviewer message precedes this turn in its session, so
            # there is nothing to compare with; record a missing value
            # instead of passing None to the encoder.
            row['semantic_similarity'] = float('nan')
        else:
            interviewer = embedding_model.encode(last_interviewer_msg)
            user = embedding_model.encode(row['content'])
            # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here.
            row['semantic_similarity'] = cosine_similarity(interviewer.reshape(1, -1), user.reshape(1, -1))[0][0]
    else:
        last_interviewer_msg_by_session[session_id] = row['content']
    annotated_data.append(row)

In [None]:
# Rebuild the turn-level frame from the records in `annotated_data`.
# NOTE: `annotated_data` is reused by both the annotation and the semantic
# similarity cells, so this cell must run directly after the similarity loop.
# Records without a 'semantic_similarity' key get NaN in that column.
df = pd.DataFrame(annotated_data)

### Final Report Annotation 

In [None]:
data = []
for fpath in report_files:
    fpath = Path(fpath)
    fname = fpath.name
    # The username is the second '+'-separated part of the file name.
    name_parts = fname.split('+')
    if len(name_parts) < 2:
        # Name the offending file instead of raising a bare IndexError.
        raise ValueError(
            f"Cannot extract a username from {fname!r}: expected a '+' in the file name."
        )
    username = name_parts[1]
    # Pass a plain string path: pypandoc releases without pathlib support
    # reject Path objects.
    report = pypandoc.convert_file(
        str(fpath),
        'markdown',
        format='docx'
    )
    annotation = referee_report_annotation(client, model, report)
    # Tag the annotation with the report's author so rows can be told apart.
    annotation['user'] = username
    data.append(annotation)
# One row per report.
report_df = pd.DataFrame(data)

### Save Data 

In [None]:
# Write the turn-level table to the working directory without the row index.
df.to_csv(path_or_buf='./turn_level_data.csv', index=False)

In [None]:
# Rename LLooM's `doc_id` so it lines up with the turn-level `content_id`.
topic_scores = score_df.rename(columns={'doc_id': 'content_id'})
# Left join: every score row is kept and gains the columns of its turn.
topic_with_turns = topic_scores.merge(df, on='content_id', how='left')
# NOTE(review): `user` is not created anywhere in this notebook, so it
# presumably comes from the input CSVs -- confirm the column exists.
topic_df = topic_with_turns.sort_values(['user', 'time'])

In [None]:
# Write the topic-level table to the working directory without the row index.
topic_df.to_csv(path_or_buf='./topic_level_data.csv', index=False)

In [None]:
# Write the report-level table to the working directory without the row index.
report_df.to_csv(path_or_buf='./report_level_data.csv', index=False)