In [None]:
from pypdf import PdfReader 
import pypandoc 
from pathlib import Path 
import text_lloom.workbench as wb
import pandas as pd 
import os 

In [None]:
# Paste a key here only for a throwaway local run; never commit a real key.
# Leave it empty to use the OPENAI_API_KEY already exported in the environment.
openai_api_key = ""
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key
else:
    # Do not clobber an already-exported key with an empty string: setdefault
    # only writes "" when the variable is missing, and returns the value in effect.
    openai_api_key = os.environ.setdefault("OPENAI_API_KEY", "")

In [None]:
# Data directory, resolved relative to the notebook's working directory.
data_fpath = Path('.') / 'data'

In [None]:
def _extract_report_text(fpath):
    """Return the text of one report file, or None when the type is unsupported.

    Args:
        fpath: ``pathlib.Path`` of the report file.

    Returns:
        For ``.pdf`` files, the text of every page (as returned by pypdf's
        ``extract_text``) concatenated with no separator. For ``.docx`` files,
        the markdown produced by ``pypandoc.convert_file``. ``None`` for any
        other extension. The extension check is case-insensitive.
    """
    suffix = fpath.suffix.lower()
    if suffix == '.pdf':
        reader = PdfReader(fpath)
        return "".join(page.extract_text() for page in reader.pages)
    if suffix == '.docx':
        return pypandoc.convert_file(fpath, 'markdown', format='docx')
    return None


def _load_round_texts(round_label, data_dir=None):
    """Collect ``{'doc_id', 'content'}`` records for one review round.

    Args:
        round_label: File stem to look for, e.g. ``'R1'``; files are matched
            with the glob ``*/<round_label>.*`` under the data directory.
        data_dir: Directory to search. Defaults to the notebook-level
            ``data_fpath``.

    Returns:
        A list of dicts with keys ``doc_id`` (``"<parent folder>_<round_label>"``)
        and ``content`` (the extracted text), in sorted path order.
    """
    data_dir = data_fpath if data_dir is None else Path(data_dir)
    records = []
    # sorted(): glob order is filesystem-dependent; sorting keeps row order stable.
    for fpath in sorted(data_dir.glob(f'*/{round_label}.*')):
        full_text = _extract_report_text(fpath)
        if full_text is None:
            # Previously such a file silently reused the text of the preceding
            # document (or raised NameError on the first iteration).
            print(f"Skipping unsupported file type: {fpath}")
            continue
        records.append({
            # parent.name is separator-independent, unlike splitting str(path) on '/'.
            'doc_id': f"{fpath.parent.name}_{round_label}",
            'content': full_text,
        })
    return records


all_r1_texts = _load_round_texts('R1')
all_r2_texts = _load_round_texts('R2')
all_r3_texts = _load_round_texts('R3')

In [None]:
# One frame per review round, built from the record lists collected above the
# frames, plus a combined frame holding the R1, R2 and R3 records in that order.
r1_df, r2_df, r3_df = (
    pd.DataFrame(records)
    for records in (all_r1_texts, all_r2_texts, all_r3_texts)
)
all_df = pd.DataFrame([*all_r1_texts, *all_r2_texts, *all_r3_texts])

## R1 Topics

In [None]:
l = wb.lloom(
    df=r1_df,
    text_col="content", 
    id_col='doc_id' 
)
r1_score_df = await l.gen_auto(
    max_concepts=10, 
    seed="Discussion points in a referee report for an academic finance article based on the paper's content", 
    n_synth=1, 
    debug=False
)

In [None]:
# Left-join the report text onto the R1 scores by doc_id and persist the result.
# The path is derived from data_fpath so the output follows the configured data
# directory instead of a second, hard-coded './data'.
(
    r1_score_df
    .merge(r1_df, on='doc_id', how='left')
    .to_parquet(data_fpath / 'r1_dataset.parquet', index=False)
)

## R2 Topics

In [None]:
l = wb.lloom(
    df=r2_df,
    text_col="content", 
    id_col='doc_id' 
)
r2_score_df = await l.gen_auto(
    max_concepts=10, 
    seed="Discussion points in a referee report for an academic finance article based on the paper's content", 
    n_synth=1, 
    debug=False
)

In [None]:
# Left-join the report text onto the R2 scores by doc_id and persist the result.
# The path is derived from data_fpath so the output follows the configured data
# directory instead of a second, hard-coded './data'.
(
    r2_score_df
    .merge(r2_df, on='doc_id', how='left')
    .to_parquet(data_fpath / 'r2_dataset.parquet', index=False)
)

## R3 Topics

In [None]:
l = wb.lloom(
    df=r3_df,
    text_col="content", 
    id_col='doc_id' 
)
r3_score_df = await l.gen_auto(
    max_concepts=10, 
    seed="Specific discussion points in a referee report for an academic finance article based solely on the paper's content. Do not include the editorial recommendation", 
    n_synth=1, 
    debug=False
)

In [None]:
# Hand-written concept names, keyed by the exact concept prompt whose rows they
# relabel. Rows whose prompt matches neither key are left untouched.
_r3_concept_renames = {
    "Does the text address the intergenerational consequences of technological shocks on workers' children?":
        'Effect of Technological Shocks on Children',
    "Does the text example examine the intergenerational effects of economic changes or disruptions?":
        'Intergenerational Effects of Economic Changes or Disruptions',
}
for _prompt, _new_name in _r3_concept_renames.items():
    _matches_prompt = r3_score_df['concept_prompt'] == _prompt
    r3_score_df.loc[_matches_prompt, 'concept_name'] = _new_name

In [None]:
# Left-join the report text onto the R3 scores by doc_id and persist the result.
# The path is derived from data_fpath so the output follows the configured data
# directory instead of a second, hard-coded './data'.
(
    r3_score_df
    .merge(r3_df, on='doc_id', how='left')
    .to_parquet(data_fpath / 'r3_dataset.parquet', index=False)
)

## R1 + R2 + R3 Topics

In [None]:
l = wb.lloom(
    df=all_df,
    text_col="content", 
    id_col='doc_id' 
)
all_score_df = await l.gen_auto(
    max_concepts=8, 
    seed="General and high-level discussion points in a referee report for an academic finance article such as theory, methodology, data, results, policy implications, limitations, and directions for future research.", 
    n_synth=1, 
    debug=False
)

In [None]:
# Left-join the report text onto the pooled scores by doc_id and persist the result.
# The path is derived from data_fpath so the output follows the configured data
# directory instead of a second, hard-coded './data'.
(
    all_score_df
    .merge(all_df, on='doc_id', how='left')
    .to_parquet(data_fpath / 'all_dataset.parquet', index=False)
)

## NLP Scores 

In [None]:
from wordtangible import avg_text_concreteness 
from textblob import TextBlob
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without storing a token in the notebook.
# HF_TOKEN from the environment is used when it is set; otherwise token=None is
# passed so that login() prompts for the token interactively. The previous
# hard-coded empty string did neither.
login(token=os.environ.get('HF_TOKEN') or None)

In [None]:
# Tokenizer, classification model and config are all loaded from the same Hub
# checkpoint, then wired into a PyTorch text-classification pipeline.
specificity_tokenizer = AutoTokenizer.from_pretrained(
    "gtfintechlab/SubjECTiveQA-SPECIFIC",
    do_lower_case=True,
    do_basic_tokenize=True,
)
specificity_model = AutoModelForSequenceClassification.from_pretrained(
    "gtfintechlab/SubjECTiveQA-SPECIFIC",
    num_labels=3,
)
specificity_config = AutoConfig.from_pretrained("gtfintechlab/SubjECTiveQA-SPECIFIC")
specificity_classifier = pipeline(
    'text-classification',
    model=specificity_model,
    tokenizer=specificity_tokenizer,
    config=specificity_config,
    framework="pt",
)

In [None]:
# doc_id has the form "<folder>_R<round>". Split once from the right so that a
# folder name containing underscores cannot shift which token is read as the
# round; then drop the leading "R" and convert the remainder to int.
all_df['round'] = (
    all_df['doc_id']
    .str.rsplit('_', n=1)
    .str[-1]
    .str[1:]
    .astype(int)
)

In [None]:
def _subjectivity(text):
    """Return TextBlob's subjectivity value for ``text``."""
    return TextBlob(text).sentiment.subjectivity


def _first_result_score(text):
    """Return the ``score`` of the first result the specificity pipeline gives for ``text``.

    The input is truncated to 512 tokens, so only the start of a long report is scored.
    NOTE(review): this is the score attached to the first returned label, which
    may not be the probability of one fixed class — confirm this is the
    intended specificity measure.
    """
    results = specificity_classifier(text, batch_size=128, truncation='only_first', max_length=512)
    return results[0]['score']


# Each score is computed per document from the raw report text.
_report_texts = all_df['content']
all_df['concreteness'] = _report_texts.apply(avg_text_concreteness)
all_df['subjectiveness'] = _report_texts.apply(_subjectivity)
all_df['specificity'] = _report_texts.apply(_first_result_score)

In [None]:
# Persist the combined frame (report text plus the columns added to it so far).
# The path is derived from data_fpath so the output follows the configured data
# directory instead of a second, hard-coded './data'.
all_df.to_parquet(data_fpath / 'nlp_scores.parquet', index=False)