# EDA of IMDB Data

In [None]:
import pandas as pd
import numpy as np 
import datasets

def load_imdb_data():

# EDA of Longform Text Alignment Datasets

In [1]:
import pandas as pd 
import numpy as np
import os

In [None]:
CAUSALSENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))
ACL_DATA_DIR = 'acl_data'

In [None]:
TOP_DIR = CAUSALSENT_DIR
df_cit = pd.read_parquet(os.path.join(TOP_DIR, ACL_DATA_DIR, 'acl_full_citations.parquet'))
df_pub = pd.read_parquet(os.path.join(TOP_DIR, ACL_DATA_DIR, 'acl-publication-info.74k.v2.parquet'))

In [4]:
df_cit.head()

Unnamed: 0,id,citingpaperid,citedpaperid,is_citedpaperid_acl,is_citingpaperid_acl
0,868703600,52046747,16228715,True,False
1,3216252045,229723601,1373518,True,False
2,437616166,16841192,629094,True,False
3,3349792001,235196103,19121210,True,False
4,3313665988,233365331,218974137,True,True


In [5]:
df_pub.head()

Unnamed: 0,acl_id,abstract,full_text,corpus_paper_id,pdf_hash,numcitedby,url,publisher,address,year,...,doi,number,volume,journal,editor,isbn,ENTRYTYPE,ID,language,note
0,O02-2002,There is a need to measure word similarity whe...,There is a need to measure word similarity whe...,18022704,0b09178ac8d17a92f16140365363d8df88c757d0,14,https://aclanthology.org/O02-2002,,,2002,...,,,,,,,inproceedings,chen-you-2002-study,,
1,L02-1310,,,8220988,8d5e31610bc82c2abc86bc20ceba684c97e66024,93,http://www.lrec-conf.org/proceedings/lrec2002/...,European Language Resources Association (ELRA),"Las Palmas, Canary Islands - Spain",2002,...,,,,,,,inproceedings,mihalcea-2002-bootstrapping,,
2,R13-1042,Thread disentanglement is the task of separati...,Thread disentanglement is the task of separati...,16703040,3eb736b17a5acb583b9a9bd99837427753632cdb,10,https://aclanthology.org/R13-1042,"INCOMA Ltd. Shoumen, BULGARIA","Hissar, Bulgaria",2013,...,,,,,,,inproceedings,jamison-gurevych-2013-headerless,,
3,W05-0819,"In this paper, we describe a word alignment al...","In this paper, we describe a word alignment al...",1215281,b20450f67116e59d1348fc472cfc09f96e348f55,15,https://aclanthology.org/W05-0819,Association for Computational Linguistics,"Ann Arbor, Michigan",2005,...,,,,,,,inproceedings,aswani-gaizauskas-2005-aligning,,
4,L02-1309,,,18078432,011e943b64a78dadc3440674419821ee080f0de3,12,http://www.lrec-conf.org/proceedings/lrec2002/...,European Language Resources Association (ELRA),"Las Palmas, Canary Islands - Spain",2002,...,,,,,,,inproceedings,suyaga-etal-2002-proposal,,


In [6]:
def get_positive_examples(df_cit: pd.DataFrame, 
                          df_pub: pd.DataFrame) -> pd.DataFrame:
    """ 
    Get positive examples for citation prediction task. Merges  the citations with the 
    paper details to get abstracts for the citing and cited papers.
    """
    
    # only keep data for ACL papers (otherwise merge will fail)
    df_cit_acl = df_cit[(df_cit['is_citedpaperid_acl'] == True) & (df_cit['is_citingpaperid_acl'] == True)]
    
    # get the citing abstract
    matched_abstracts = df_cit_acl = df_cit_acl[['citingpaperid', 'citedpaperid']].merge(
        df_pub[['corpus_paper_id', 'abstract']].rename(columns={'abstract': 'citing_abstract'}),
        left_on="citingpaperid", right_on="corpus_paper_id", how='inner'
    ).merge(  # get the cited abstract
        df_pub[['corpus_paper_id', 'abstract']].rename(columns={'abstract': 'cited_abstract'}),
        left_on="citedpaperid", right_on="corpus_paper_id", how='inner'
    )
        
    positive_examples = matched_abstracts
    positive_examples['label'] = 1   # indicates a citation pair (`citingpaperid` cited `citedpaperid`)
    
    positive_examples = positive_examples[['citingpaperid', 'citedpaperid', 'citing_abstract', 'cited_abstract', 'label']]
    
    return positive_examples

In [7]:
df_c_short = df_cit[(df_cit['is_citedpaperid_acl'] == True) & (df_cit['is_citingpaperid_acl'] == True)].head(100)

small_pos_examples = get_positive_examples(df_c_short, df_pub)
small_pos_examples.head()

Unnamed: 0,citingpaperid,citedpaperid,citing_abstract,cited_abstract,label
0,233365331,218974137,In this work we compare the performance of con...,There is an increasing demand for sentiment an...,1
1,3841628,18998986,"Election manifestos document the intentions, m...","In recent years, keyphrase extraction has rece...",1
2,218974058,204896994,In this paper we describe our work on the deve...,In order to automatically extend a treebank of...,1
3,53242563,15453873,We posed the shared task of assigning sentence...,This paper presents the current status of OPUS...,1
4,1009868,680757,We present the results of the WMT13 shared tas...,We illustrate and explain problems of n-grams-...,1


In [8]:
len(small_pos_examples)

100

In [9]:
def add_negative_labels(df_pos: pd.DataFrame, 
                        df_pub: pd.DataFrame, 
                        num_neg_samples: int = 1, 
                        verbose: bool = False) -> pd.DataFrame:
    """
    Add negative examples to the positive examples. For each positive example, sample
    `num_neg_samples` negative examples by randomly selecting a paper from the corpus, 
    and confirming that the paper was not cited by `citingpaperid` from 
    the positive example.
    """
    
    # create a dictionary of citing papers as keys and all their cited papers as values
    citing_to_cited = df_pos.groupby('citingpaperid')['citedpaperid'].apply(list).to_dict()
    
    # get the list of all papers in the corpus
    all_corpus_papers = df_pub['corpus_paper_id'].unique()
    
    # sample num_neg_samples negative examples for each positive example
    neg_examples = []   
    for citing_paper, cited_papers in citing_to_cited.items():
        for cited_paper in cited_papers:
            for _ in range(num_neg_samples):
                neg_cited_paper = np.random.choice(all_corpus_papers)
                while neg_cited_paper in cited_papers:   # resample if we sampled a paper that was cited by the citing paper
                    neg_cited_paper = np.random.choice(all_corpus_papers)
                neg_examples.append((citing_paper, neg_cited_paper, 0))
                
    # make df from negative examples, and join twice against df_pub to get abstracts
    df_neg = pd.DataFrame(neg_examples, columns=['citingpaperid', 'citedpaperid', 'label'])
    df_neg = df_neg.merge(df_pub[['corpus_paper_id', 'abstract']].rename(columns={'abstract': 'citing_abstract'}),
                          left_on='citingpaperid', right_on='corpus_paper_id', how='inner')
    df_neg = df_neg.merge(df_pub[['corpus_paper_id', 'abstract']].rename(columns={'abstract': 'cited_abstract'}),
                            left_on='citedpaperid', right_on='corpus_paper_id', how='inner')
    df_neg = df_neg[['citingpaperid', 'citedpaperid', 'citing_abstract', 'cited_abstract', 'label']]
    
    # stack the positive and negative examples
    df_all = pd.concat([df_pos, df_neg], ignore_index=True)
    
    if verbose: 
        # print out total number of positive and negative examples
        print(f"Number of positive examples: {len(df_pos)}")
        print(f"Number of negative examples: {len(df_neg)}")
    
    return df_all

In [10]:
full_df = add_negative_labels(small_pos_examples, df_pub, num_neg_samples=1, verbose=True)
full_df.head()


Number of positive examples: 100
Number of negative examples: 100


Unnamed: 0,citingpaperid,citedpaperid,citing_abstract,cited_abstract,label
0,233365331,218974137,In this work we compare the performance of con...,There is an increasing demand for sentiment an...,1
1,3841628,18998986,"Election manifestos document the intentions, m...","In recent years, keyphrase extraction has rece...",1
2,218974058,204896994,In this paper we describe our work on the deve...,In order to automatically extend a treebank of...,1
3,53242563,15453873,We posed the shared task of assigning sentence...,This paper presents the current status of OPUS...,1
4,1009868,680757,We present the results of the WMT13 shared tas...,We illustrate and explain problems of n-grams-...,1


In [11]:
full_df.tail()

Unnamed: 0,citingpaperid,citedpaperid,citing_abstract,cited_abstract,label
195,236771976,198876308,Multihop reasoning remains an elusive goal as ...,We present a system for the task of morphologi...,0
196,237157879,616564,"Nowadays, most research conducted in the field...",Many research efforts have been devoted to dev...,0
197,237502863,18021631,Non-autoregressive neural machine translation ...,"In this paper, we introduce TimeBankPT, a Time...",0
198,243865639,1664456,Event time is one of the most important featur...,This paper focuses on exploiting different mod...,0
199,243865646,12877573,"In this paper, we investigate the Aspect Categ...","In this paper we describe SoMaJo, a rulebased ...",0


## Actually make the full dataset

In [12]:
pos_df = get_positive_examples(df_cit, df_pub)
full_df = add_negative_labels(pos_df, df_pub, num_neg_samples=1, verbose=True)

Number of positive examples: 1339442
Number of negative examples: 1339509


In [13]:
full_df.head()

Unnamed: 0,citingpaperid,citedpaperid,citing_abstract,cited_abstract,label
0,233365331,218974137,In this work we compare the performance of con...,There is an increasing demand for sentiment an...,1
1,3841628,18998986,"Election manifestos document the intentions, m...","In recent years, keyphrase extraction has rece...",1
2,218974058,204896994,In this paper we describe our work on the deve...,In order to automatically extend a treebank of...,1
3,53242563,15453873,We posed the shared task of assigning sentence...,This paper presents the current status of OPUS...,1
4,1009868,680757,We present the results of the WMT13 shared tas...,We illustrate and explain problems of n-grams-...,1


In [14]:
full_df.tail()

Unnamed: 0,citingpaperid,citedpaperid,citing_abstract,cited_abstract,label
2678946,248525217,14387118,Canonical automatic summary evaluation metrics...,In this paper we address the issue of developi...,0
2678947,248525217,14926846,Canonical automatic summary evaluation metrics...,This paper presents a simple but effective app...,0
2678948,248525217,122829,Canonical automatic summary evaluation metrics...,We present a neural model for morphological in...,0
2678949,248525217,62483340,Canonical automatic summary evaluation metrics...,,0
2678950,248525217,241583646,Canonical automatic summary evaluation metrics...,With the essays part from The International Co...,0


In [1]:
import torch

In [4]:
# Example 1: 1D tensor
v1 = torch.tensor([1, 2, 3], dtype=torch.float32)
softmax_v1 = torch.softmax(v1, dim=0)  # Compute softmax across the single dimension
print("1D Tensor Softmax:", softmax_v1)

# Example 2: 2D tensor (row-wise softmax)
v2 = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
softmax_v2_row = torch.softmax(v2, dim=1)  # Compute softmax across rows
print("2D Tensor Row-wise Softmax:\n", softmax_v2_row)

# Example 3: 2D tensor (column-wise softmax)
softmax_v2_col = torch.softmax(v2, dim=0)  # Compute softmax across columns
print("2D Tensor Column-wise Softmax:\n", softmax_v2_col)

# Example 4: Higher dimensional tensor
v3 = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=torch.float32)
softmax_v3_dim0 = torch.softmax(v3, dim=0)  # Softmax across the 0th dimension
softmax_v3_dim1 = torch.softmax(v3, dim=1)  # Softmax across the 1st dimension
softmax_v3_dim2 = torch.softmax(v3, dim=2)  # Softmax across the 2nd dimension
print("3D Tensor Softmax (dim=0):\n", softmax_v3_dim0)
print("3D Tensor Softmax (dim=1):\n", softmax_v3_dim1)
print("3D Tensor Softmax (dim=2):\n", softmax_v3_dim2)

1D Tensor Softmax: tensor([0.0900, 0.2447, 0.6652])
2D Tensor Row-wise Softmax:
 tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])
2D Tensor Column-wise Softmax:
 tensor([[0.0474, 0.0474, 0.0474],
        [0.9526, 0.9526, 0.9526]])
3D Tensor Softmax (dim=0):
 tensor([[[0.0180, 0.0180],
         [0.0180, 0.0180]],

        [[0.9820, 0.9820],
         [0.9820, 0.9820]]])
3D Tensor Softmax (dim=1):
 tensor([[[0.1192, 0.1192],
         [0.8808, 0.8808]],

        [[0.1192, 0.1192],
         [0.8808, 0.8808]]])
3D Tensor Softmax (dim=2):
 tensor([[[0.2689, 0.7311],
         [0.2689, 0.7311]],

        [[0.2689, 0.7311],
         [0.2689, 0.7311]]])
