# Summarise corpus

This notebook summarises the corpus used to train the models and the model predictions.

In [2]:
import pandas as pd

# Root folder of the project data; later cells read from its `data/` and
# `outputs/predictions-compiled/` subfolders.
# NOTE(review): this is an absolute path — adjust it when running on another machine.
dataFolder = '/homedata/dveytia/Product_1_data'

## Training data

In [9]:
## Summarise training data
# Read the two tab-separated exports of coded article answers.
codedVariables1 = pd.read_csv(
    f'{dataFolder}/data/articleAnswers_formatted_2025-04-26.txt',
    sep='\t',
)
codedVariables2 = pd.read_csv(
    f'{dataFolder}/data/articleAnswers_notMRE_formatted_2025-05-26.txt',
    sep='\t',
)
# Stack both tables row-wise. The row labels of each input are kept as-is,
# so index values may repeat in the combined frame.
codedVariables = pd.concat([codedVariables1, codedVariables2])
# Number of rows for every (random_sample, relevant) combination.
codedVariables.groupby(['random_sample', 'relevant']).size()

random_sample  relevant
0              0            564
               1            638
1              0            842
               1           1044
dtype: int64

In [10]:
## Total number of included articles
# `relevant` takes the values 0/1 (see the groupby table above), so its column
# total is the number of included articles.
# Series.sum() is vectorised, whereas the builtin sum() iterates over the
# Series one element at a time. Note that Series.sum() skips missing values.
codedVariables['relevant'].sum()

1682

In [11]:
# Column labels of the combined training table, as a plain Python list.
codedVariables.columns.tolist()

['id',
 'language',
 'population',
 'relevant',
 'primary_research',
 'implemented',
 'outcome_quantitative',
 'policy_link',
 'outcome_codis_benefit',
 'outcome_effectiveness',
 'oro_type.CCS',
 'oro_type.Efficiency',
 'oro_type.MRE-Bio',
 'oro_type.MRE-Located',
 'oro_type.MRE-Ocean',
 'oro_type.CDR-BioPump',
 'oro_type.CDR-Cult',
 'oro_type.CDR-OAE',
 'oro_type.CDR-BC',
 'oro_type.CDR-Other',
 'ecosystem.Microalgae',
 'ecosystem.Macroalgae',
 'ecosystem.Seagrass',
 'ecosystem.Mangrove',
 'ecosystem.Salt marsh',
 'title',
 'abstract',
 'keywords',
 'random_sample',
 'intervention',
 'uncertain_screen',
 'relevant_allResearch']

In [19]:
# Every column whose name contains 'oro_type'.
oroTypeCols = [col for col in codedVariables.columns if 'oro_type' in col]
# Column totals over the included (relevant == 1) articles only.
# `.loc` picks rows and columns in one step instead of chaining two [] lookups,
# and DataFrame.sum() replaces `.apply(sum)`, which ran the builtin sum() as a
# Python-level loop over each column.
codedVariables.loc[codedVariables['relevant'] == 1, oroTypeCols].sum()

oro_type.CCS            256
oro_type.Efficiency     158
oro_type.MRE-Bio        110
oro_type.MRE-Located    343
oro_type.MRE-Ocean      349
oro_type.CDR-BioPump    160
oro_type.CDR-Cult        53
oro_type.CDR-OAE        172
oro_type.CDR-BC         195
oro_type.CDR-Other       22
dtype: int64

# Number of articles relevant to the mitigation branch (updated)

In [4]:
# Location of the tab-separated file of unseen documents.
unseenTxt = f'{dataFolder}/data/all_unseen_mitigation_oros.txt'
# Load it into a DataFrame.
unseen_mit = pd.read_csv(unseenTxt, sep='\t')

In [6]:
# Build one shuffled table of unseen documents: the original unseen set plus
# the updated references, each filtered by its prediction boundaries.

# Minimum mean prediction for a document to be kept.
pred_threshold = 0.5
# Columns retained from both unseen tables when they are stacked.
keep_cols = ["id", "title", "abstract", "keywords", "seen"]

# Load unseen documents & apply prediction boundaries
# (this file is used with its `id` column as-is; no rename is applied)
unseen_df = pd.read_csv(f'{dataFolder}/data/all_unseen_mitigation_oros.txt', sep='\t')
unseen_df = unseen_df.dropna(subset=['abstract']).reset_index(drop=True)
# Choose which prediction boundaries to apply:
# keep a document when ANY of the three listed predictions reaches the threshold.
keep_unseen = (
    (unseen_df['oro_any.M_Renewables - mean_prediction'] >= pred_threshold)
    | (unseen_df['oro_any.M_Increase_efficiency - mean_prediction'] >= pred_threshold)
    | (unseen_df['oro_any.M_CO2_removal_or_storage - mean_prediction'] >= pred_threshold)
)
# .assign() returns a new frame, so the `seen` flag is never written onto a
# filtered slice of another frame (which triggers SettingWithCopyWarning).
unseen_df = unseen_df[keep_unseen].assign(seen=0)

# Load unseen updated documents & apply prediction boundaries to just Mitigation branch
# NOTE(review): the saved output of this cell echoes this read_csv line, which
# looks like a pandas warning (possibly a DtypeWarning for mixed-type columns) —
# confirm, and if so consider passing an explicit `dtype=`.
unseen_df2 = pd.read_csv(f'{dataFolder}/data/unique_references_UPDATE_13-05-2025.txt', sep='\t')
unseen_df2 = unseen_df2.rename(columns={'analysis_id': 'id'})
unseen_df2 = unseen_df2.dropna(subset=['abstract']).reset_index(drop=True)
pred_screen = pd.read_csv(f'{dataFolder}/outputs/predictions-compiled/oro_screen_update_predictions.csv')
pred_branch = pd.read_csv(f'{dataFolder}/outputs/predictions-compiled/oro_branch_update_predictions.csv')
# NOTE(review): no `on=` is given, so each merge joins on every column name the
# two frames have in common — presumably just `id`; verify against the files.
unseen_df2 = unseen_df2.merge(pred_screen, how="left").merge(pred_branch, how="left")
# Choose which prediction boundaries to apply:
# a document must pass BOTH the relevance screen and the Mitigation branch prediction.
passes_screen = unseen_df2['0 - relevance - mean_prediction'] >= pred_threshold
passes_branch = unseen_df2['oro_branch.Mitigation - mean_prediction'] >= pred_threshold
unseen_df2 = unseen_df2[passes_screen & passes_branch].assign(seen=0)

# Merge two unseen data frames together
# Sorting by id first fixes the row order, so the seeded shuffle that follows
# gives the same permutation on every run.
unseen_df = (
    pd.concat([unseen_df[keep_cols], unseen_df2[keep_cols]])
    .sort_values('id')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

  unseen_df2 = pd.read_csv(f'{dataFolder}/data/unique_references_UPDATE_13-05-2025.txt', delimiter='\t')


In [10]:
# (number of rows, number of columns) of the combined unseen table
unseen_df.shape

(58952, 5)