In [41]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')


In [42]:
# Read the three dataset splits from CSV files in the working directory
train, test, dev = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'dev.csv')
)


In [43]:
def process_paragraphs(train):
    """Collapse sentence-level rows into one row per paragraph with 0/1 label columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Sentence-level frame with the columns ``index`` (string whose part
        before the first underscore is the paragraph id), ``sentence`` (text)
        and ``labels`` (comma-separated label names, or NaN for no labels).
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per paragraph, ordered by paragraph id, with the columns
        ``paragraph_id``, ``sentence`` (the paragraph's sentences joined by a
        single space in row order) and one integer column per distinct label
        (in order of first appearance) holding 1 if any sentence of the
        paragraph carries that label and 0 otherwise.
    """
    # Work on positionally indexed copies so the caller's frame is never
    # touched and a non-unique row index cannot confuse the lookups below.
    ids_by_row = train['index'].str.split('_').str[0].reset_index(drop=True)
    sentences_by_row = train['sentence'].reset_index(drop=True)
    labels_by_row = train['labels'].reset_index(drop=True)

    # Combine sentences into paragraphs (groupby sorts by paragraph id)
    paragraphs = (
        sentences_by_row
        .groupby(ids_by_row.rename('paragraph_id'))
        .apply(' '.join)
        .reset_index()
    )

    # One entry per (row, label) pair. astype(str) keeps the .str accessor
    # usable even when every label is missing and the column is float NaN.
    tokens = (
        labels_by_row
        .dropna()
        .astype(str)
        .str.split(',')
        .explode()
        .str.strip()
    )
    # A trailing or doubled comma yields an empty token, which is not a label
    tokens = tokens[tokens != '']
    label_columns = pd.Index(tokens.unique())

    # Translate every (paragraph, label) pair into matrix coordinates and set
    # them all at once instead of looping over rows.
    paragraph_index = pd.Index(paragraphs['paragraph_id'])
    row_pos = paragraph_index.get_indexer(ids_by_row.iloc[tokens.index.to_numpy()])
    col_pos = label_columns.get_indexer(tokens)
    # get_indexer reports -1 for ids that formed no paragraph (e.g. missing id)
    known = row_pos >= 0
    indicator = np.zeros((len(paragraph_index), len(label_columns)), dtype='int64')
    indicator[row_pos[known], col_pos[known]] = 1

    paragraph_labels = pd.DataFrame(indicator, columns=label_columns)
    paragraph_labels.insert(0, 'paragraph_id', paragraphs['paragraph_id'])

    # Merge the label indicators onto the paragraph text
    return paragraphs.merge(paragraph_labels, on='paragraph_id')

In [44]:
# Collapse every split from sentence rows to paragraph rows
train, test, dev = map(process_paragraphs, (train, test, dev))
# Report how many paragraphs each split contains, one count per line
print(*(len(frame) for frame in (train, test, dev)), sep='\n')

1108
315
157


In [45]:
def data_reconstruction(df, label_name=None):
    """Turn a paragraph/label-indicator frame into (doc, claim, label) rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sentence`` column and, from the third column onwards,
        one numeric indicator column per label. The frame is not modified.
    label_name : str, optional
        Label column to use as the claim. When omitted, the label with the
        largest column sum in ``df`` is used (the left-most one on ties).
        Pass it explicitly to use the same claim for several frames.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` with the columns ``doc`` (the ``sentence``
        text), ``claim`` (``label_name`` repeated on every row) and ``label``
        (1 where the chosen label column is greater than 0, otherwise 0).

    Raises
    ------
    IndexError
        If ``label_name`` is omitted and ``df`` has no columns after the
        second one.
    KeyError
        If ``label_name`` is given but is not a column of ``df``.
    """
    if label_name is None:
        # A stable sort makes the choice deterministic when counts are tied
        label_counts = df.iloc[:, 2:].sum().sort_values(ascending=False, kind='mergesort')
        label_name = label_counts.index[0]

    # Build a fresh frame instead of writing a 'label' column into the input,
    # so calling this again on the same frame gives the same answer.
    return pd.DataFrame({
        'doc': df['sentence'],
        'claim': label_name,
        'label': (df[label_name] > 0).astype('int64'),
    })


In [46]:
# Reduce every split to (doc, claim, label) rows. Each split is processed
# independently, so its claim is the most frequent label within that split.
train, test, dev = [data_reconstruction(split) for split in (train, test, dev)]

In [47]:
# Store the three reconstructed splits in the "reconstruction" folder.
# to_csv does not create missing parent directories, so make sure the folder
# exists before writing (otherwise a fresh checkout fails here).
os.makedirs('reconstruction', exist_ok=True)
train.to_csv('reconstruction/train.csv', index=False)
test.to_csv('reconstruction/test.csv', index=False)
dev.to_csv('reconstruction/dev.csv', index=False)

In [3]:
# Reload the saved training split from disk and display it as a sanity check.
# This rebinds `train` to the frame read back from the CSV file.
# NOTE(review): this cell's execution count (In [3]) is out of sequence with
# the cells above, so it was presumably run in a different kernel session —
# re-run the notebook top to bottom to confirm the displayed output is current.
import pandas as pd
train = pd.read_csv('reconstruction/train.csv')
train

Unnamed: 0,doc,claim,label
0,Ghrelin was identified in the stomach as an en...,sustaining proliferative signaling,1
1,PURPOSE The epidermal growth factor receptor (...,sustaining proliferative signaling,1
2,Adoptive transfer of immunity against hepatiti...,sustaining proliferative signaling,0
3,The secretion of immunosuppressive factors lik...,sustaining proliferative signaling,1
4,To characterize the impact of increased produc...,sustaining proliferative signaling,1
...,...,...,...
1103,Tumour cells primarily utilize aerobic glycoly...,sustaining proliferative signaling,0
1104,Our previous study demonstrated that 5-aminole...,sustaining proliferative signaling,0
1105,Ceramide is a sphingolipid metabolite that ind...,sustaining proliferative signaling,0
1106,High-throughput screening of a small-molecule ...,sustaining proliferative signaling,0
