In [2]:
import numpy as np

In [3]:
import pandas as pd

# Load both labelled datasets from their Excel workbooks into DataFrames
sheet_name = 'Sheet1'  # Replace with the actual sheet name if needed
CHS = pd.read_excel('CHS_files.xlsx', sheet_name=sheet_name)
CHE = pd.read_excel('CHE_files.xlsx', sheet_name=sheet_name)

# Column whose missing values are counted
column_to_check = 'abstract'  # Replace with the actual column name

# Count the records whose value in that column is missing (NaN)
empty_records_count_che = CHE[column_to_check].isna().sum()
empty_records_count_chs = CHS[column_to_check].isna().sum()

# Report against the total number of rows. len() is used instead of
# Label.count() because count() skips rows whose Label is missing, which
# would understate the total.
print(f'The number of empty records in the column "{column_to_check}" for CHE file is: {empty_records_count_che} out of {len(CHE)}')
print(f'The number of empty records in the column "{column_to_check}" for CHS file is: {empty_records_count_chs} out of {len(CHS)}')


The number of empty records in the column "abstract" for CHE file is: 376 out of 9788
The number of empty records in the column "abstract" for CHS file is: 1036 out of 10153


In [4]:
# CHS rows that have no abstract; the original row labels are kept as an 'index' column
non_abstract_CHS = CHS.loc[CHS['abstract'].isna()].reset_index(drop=False)

In [5]:
# Number of abstract-less CHS records that also have no URL
without_url = len(non_abstract_CHS.loc[non_abstract_CHS['url'].isna()])

In [6]:
# Record counts per label for each dataset (rows: label buckets, columns: dataset)
data_summary = pd.DataFrame(index=['Relevant', 'No Relevant', 'Total'], columns=['CHE', 'CHS'])
for column_name, frame in (('CHE', CHE), ('CHS', CHS)):
    # Everything whose Label is not exactly 'Relevant' falls into 'No Relevant'
    data_summary.loc['Relevant', column_name] = len(frame[frame['Label'].eq('Relevant')])
    data_summary.loc['No Relevant', column_name] = len(frame[frame['Label'].ne('Relevant')])
    data_summary.loc['Total', column_name] = len(frame)

In [7]:
# Same breakdown as the label summary, restricted to records with a missing abstract
non_abs_data_summary = pd.DataFrame(index=['Relevant', 'No Relevant', 'Total'], columns=['CHE_nabs', 'CHS_nabs'])
for column_name, frame in (('CHE_nabs', CHE), ('CHS_nabs', CHS)):
    no_abstract = frame['abstract'].isna()
    non_abs_data_summary.loc['Relevant', column_name] = len(frame[frame['Label'].eq('Relevant') & no_abstract])
    non_abs_data_summary.loc['No Relevant', column_name] = len(frame[frame['Label'].ne('Relevant') & no_abstract])
    non_abs_data_summary.loc['Total', column_name] = len(frame[no_abstract])

In [8]:
# Combine the per-label totals with the missing-abstract counts
overview = data_summary.join(non_abs_data_summary)

# Share of records without an abstract, formatted as a percentage string
# with two decimals (e.g. "3.84%").
for prefix in ("CHE", "CHS"):
    nabs_share = overview[f"{prefix}_nabs"] / overview[prefix] * 100
    overview[f"{prefix}_nabs%"] = nabs_share.round(2).apply(lambda x: f"{x:.2f}%")

# Records that have neither an abstract nor a URL, per label bucket.
# Each column is assigned in one go as an integer Series: creating it cell by
# cell with .loc first fills the new column with NaN, which turns the counts
# into floats (46.0 instead of 46).
for prefix, frame in (("CHE", CHE), ("CHS", CHS)):
    no_abstract_no_url = frame["abstract"].isna() & frame["url"].isna()
    overview[f"{prefix}_nourl_noabs"] = pd.Series(
        {
            "Relevant": len(frame[no_abstract_no_url & (frame["Label"] == "Relevant")]),
            "No Relevant": len(frame[no_abstract_no_url & (frame["Label"] != "Relevant")]),
            "Total": len(frame[no_abstract_no_url]),
        }
    )


In [9]:
# Show the current column order of the overview table
overview.columns

Index(['CHE', 'CHS', 'CHE_nabs', 'CHS_nabs', 'CHE_nabs%', 'CHS_nabs%',
       'CHE_nourl_noabs', 'CHS_nourl_noabs'],
      dtype='object')

In [10]:
# Desired column layout: all CHE columns first, then the matching CHS columns
new_order = [
    'CHE', 'CHE_nabs', 'CHE_nabs%', 'CHE_nourl_noabs',
    'CHS', 'CHS_nabs', 'CHS_nabs%', 'CHS_nourl_noabs',
]

# Select the columns in that order (a missing column raises KeyError)
overview = overview.loc[:, new_order]


In [11]:
# Display the overview table
overview

Unnamed: 0,CHE,CHE_nabs,CHE_nabs%,CHE_nourl_noabs,CHS,CHS_nabs,CHS_nabs%,CHS_nourl_noabs
Relevant,2663,64,2.40%,46.0,761,59,7.75%,34.0
No Relevant,7125,312,4.38%,247.0,9392,977,10.40%,539.0
Total,9788,376,3.84%,293.0,10153,1036,10.20%,573.0


In [12]:
# Spot-check the size of one abstract (row label 0) against the model's input limit.
# NOTE(review): the limit was noted as "up to 1024 words", but len() on a string
# counts characters, not words — confirm which unit the model limit uses.
len(CHE.at[0, 'abstract'])

649

In [13]:
# Character length of each CHE abstract. Any value that is not a string
# (missing abstracts read as NaN, None, stray numeric cells) counts as
# length 0, so len() is never called on a value that has no length.
CHE['abstract_length'] = CHE['abstract'].apply(lambda text: len(text) if isinstance(text, str) else 0)

# Row label of the longest abstract
longest_abstract_index = CHE['abstract_length'].idxmax()

# Length of that abstract. The variable keeps its original name, but it
# holds the length, not the abstract text.
longest_abstract = CHE.loc[longest_abstract_index, 'abstract_length']

print("The longest abstract length is:", longest_abstract)

The longest abstract is: 17500


In [22]:
# Character length of each CHS abstract. Any value that is not a string
# (missing abstracts read as NaN, None, stray numeric cells) counts as
# length 0, so len() is never called on a value that has no length.
CHS['abstract_length'] = CHS['abstract'].apply(lambda text: len(text) if isinstance(text, str) else 0)

# Row label of the longest abstract
longest_abstract_index = CHS['abstract_length'].idxmax()

# Text of the longest abstract
longest_abstract = CHS.loc[longest_abstract_index, 'abstract']

print("The longest abstract is:", longest_abstract)

The longest abstract is: Toxicology of disinfectants and disinfectant by-products. Disinfectants. Chlorine gas, chloramine and chlorine dioxide are strong respiratory irritants. Sodium hypochlorite (NaOC1) is also used as bleach and is frequently involved in human poisoning. These exposures, however, are not relevant to exposures in drinking-water. There have been relatively few evaluations of the toxic effects of these disinfectants in drinking-water in experimental animals or humans. Evidence from these animal and human studies suggests that chlorine, hypochlorite solutions, chloramine and chlorine dioxide themselves probably do not contribute to the development of cancer or any toxic effects. Attention has focused on the wide variety of by-products that result from reactions of chlorine and other disinfectants with NOM, which is found in virtually all water sources. Trihalomethanes. THMs induce cytotoxicity in the liver and kidneys of rodents exposed to doses of about 0.5 mmol/kg of

In [15]:
# Number of CHE records whose abstract_length is above 1500
len(CHE.loc[CHE['abstract_length'] > 1500])

3331

In [16]:
# Number of CHS records whose abstract_length is above 1500
len(CHS.loc[CHS['abstract_length'] > 1500])

3465

In [21]:
# The model only needs the abstract text and its label, so keep just those two columns.
# NOTE(review): `input` shadows Python's built-in input(); the name is left
# unchanged because cells outside this view may reference it.
model_columns = ['abstract', 'Label']
input = CHE[model_columns]