In [1]:
from nltk.parse.corenlp import CoreNLPParser,CoreNLPDependencyParser
import numpy as np
import random
import time
import pandas as pd
from tqdm.auto import tqdm, trange
import textstat
from datasets import load_dataset
from transformers import AutoTokenizer
import spacy_alignments as tokenizations

# import spacy
# nlp = spacy.load('en_core_web_sm') # or whatever model you have installed


In [2]:
# Client for the CoreNLP server used for dependency parsing (see parse_dependency).
depparser = CoreNLPDependencyParser(url='http://localhost:9000')
# Client used only for tokenization (see align).
# NOTE(review): this points at port 9001 while the dependency parser uses 9000 —
# presumably two separate CoreNLP servers; confirm both are running before executing the notebook.
tokenizer = CoreNLPParser(url='http://localhost:9001')

In [3]:
# Penn Treebank (XPOS) -> Universal POS (UPOS) lookup table.
# Source table: https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
# Additional resource: http://universaldependencies.org/docs/en/pos/all.html
# Caveat: an exact XPOS -> UPOS conversion is impossible without knowing the context,
# so this context-free, per-tag mapping is only an approximation.
# Manning's comment on this: https://github.com/UniversalDependencies/docs/issues/212#issuecomment-148846154
# How to use Manning's converter: https://github.com/clulab/processors/wiki/Converting-from-Penn-Treebank-to-Basic-Stanford-Dependencies
# Entries marked "manually added" were added by hand.

xpos2upos = {'#': 'SYM',
             '$': 'SYM',
             "''": 'PUNCT',
             ',': 'PUNCT',
             '-LRB-': 'PUNCT',
             '-RRB-': 'PUNCT',
             '.': 'PUNCT',
             ':': 'PUNCT',
             'ADD': 'X', # manually added.
             'AFX': 'ADJ',
             'CC': 'CCONJ',
             'CD': 'NUM',
             'DT': 'DET',
             'EX': 'PRON',
             'FW': 'X',
             'GW': 'X', # manually added.
             'HYPH': 'PUNCT',
             'IN': 'ADP',
             'JJ': 'ADJ',
             'JJR': 'ADJ',
             'JJS': 'ADJ',
             'LS': 'X',
             'MD': 'VERB',
             'NFP': 'PUNCT', # manually added.
             'NIL': 'X',
             'NN': 'NOUN',
             'NNP': 'PROPN',
             'NNPS': 'PROPN',
             'NNS': 'NOUN',
             'PDT': 'DET',
             'POS': 'PART',
             'PRP': 'PRON',
             'PRP$': 'DET',
             'RB': 'ADV',
             'RBR': 'ADV',
             'RBS': 'ADV',
             'RP': 'ADP',
             'SYM': 'SYM',
             'TO': 'PART',
             'UH': 'INTJ',
             'VB': 'VERB',
             'VBD': 'VERB',
             'VBG': 'VERB',
             'VBN': 'VERB',
             'VBP': 'VERB',
             'VBZ': 'VERB',
             'WDT': 'DET',
             'WP': 'PRON',
             'WP$': 'DET',
             'WRB': 'ADV',
             'XX': 'X', # manually added.
             '``': 'PUNCT'}
def convert_pos_seq(pos_seq):
    """Translate a sequence of Penn Treebank tags into UPOS tags.

    Every tag is looked up in the module-level ``xpos2upos`` table; a tag that
    is missing from the table raises ``KeyError``.

    Args:
        pos_seq: iterable of XPOS tag strings.

    Returns:
        A new list holding one UPOS tag per input tag, in input order.
    """
    return [xpos2upos[xpos_tag] for xpos_tag in pos_seq]

In [4]:
# Hugging Face checkpoint whose tokenizer is loaded below; the tokenizer is used by align().
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

In [5]:
def align(text):
    """Map the BERT tokenization of ``text`` onto its CoreNLP tokenization.

    The text is tokenized by both the CoreNLP ``tokenizer`` and ``bert_tokenizer``.
    ``tokenizations.get_alignments`` is then called on the two token lists and
    its second result (one list of CoreNLP token indices per BERT token) is
    concatenated into a single flat list.

    NOTE(review): because the per-token lists are flattened, a BERT token that
    aligns to zero or to several CoreNLP tokens contributes zero or several
    entries, so the result is not guaranteed to contain exactly one entry per
    BERT token — confirm that downstream code tolerates this.

    Args:
        text: the raw string to tokenize.

    Returns:
        A flat list of CoreNLP token indices.
    """
    corenlp_words = list(tokenizer.tokenize(text))
    wordpieces = bert_tokenizer.tokenize(text)
    _, bert_to_corenlp = tokenizations.get_alignments(corenlp_words, wordpieces)

    flat_alignment = []
    for corenlp_indices in bert_to_corenlp:
        flat_alignment.extend(corenlp_indices)
    return flat_alignment

In [6]:
def parse_dependency(text):
    """Dependency-parse ``text`` with CoreNLP and extract per-token features.

    Only the first graph yielded by ``depparser.raw_parse`` is used. It is
    rendered with ``to_conll(4)`` and every row is split on tabs; the code reads
    column 0 as the token, column 1 as its POS tag, column 2 as the integer
    head position and column 3 as the dependency label.

    Args:
        text: the raw string to parse.

    Returns:
        A 5-tuple ``(edge_index, hetoro_edge, pos, upos, num_syllable)`` where
        ``edge_index`` is a list of ``[token_position, head_position]`` pairs
        (token positions start at 1), ``hetoro_edge`` the dependency labels,
        ``pos`` the POS tags, ``upos`` the result of ``convert_pos_seq(pos)``
        and ``num_syllable`` the ``textstat.syllable_count`` of every token.
    """
    first_graph = next(depparser.raw_parse(text))
    # The last piece after splitting on newlines is discarded (presumably the
    # empty string left by a trailing newline in the CoNLL text).
    rows = [row.split('\t') for row in first_graph.to_conll(4).split('\n')[:-1]]

    edge_index = [[position, int(fields[2])] for position, fields in enumerate(rows, start=1)]
    hetoro_edge = [fields[3] for fields in rows]
    pos = [fields[1] for fields in rows]
    num_syllable = [textstat.syllable_count(fields[0]) for fields in rows]

    return edge_index, hetoro_edge, pos, convert_pos_seq(pos), num_syllable
    

In [7]:
def process_additional_features(df):
    """Attach dependency, POS, syllable and alignment features to ``df``.

    For every entry of ``df['text']`` the substring ``'&amp;'`` is removed, then
    ``parse_dependency`` and ``align`` are called on the cleaned text. The
    results are written back as six new columns (``edge_indexs``,
    ``hetoro_edges``, ``pos_seqs``, ``upos_seqs``, ``num_syllables``,
    ``alignments``).

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the six feature columns added.
    """
    feature_columns = ('edge_indexs', 'hetoro_edges', 'pos_seqs',
                       'upos_seqs', 'num_syllables', 'alignments')
    collected = {column: [] for column in feature_columns}

    for raw_text in tqdm(df['text']):
        cleaned = raw_text.replace('&amp;', '')
        # parse_dependency yields the first five features; align supplies the sixth.
        features = (*parse_dependency(cleaned), align(cleaned))
        for column, value in zip(feature_columns, features):
            collected[column].append(value)

    for column in feature_columns:
        df[column] = collected[column]

    return df

## ccat

In [8]:
def create_ccat50_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/CCAT50/processed/'):
    """Build sentence-window CCAT50 sub-datasets for a set of authors.

    Reads the CCAT50 train and validation CSVs, keeps the documents of the
    selected authors, splits each document on newlines, and emits one sample per
    window of ``num_sent_per_text`` consecutive non-empty lines. Each split is
    passed through ``process_additional_features`` and written to
    ``{save_folder}/author_<ids>_sent_<n>_<split>.csv``.

    Args:
        num_authors_to_pick: number of authors to sample at random (without
            replacement) from the training set. Mutually exclusive with
            ``picked_author_ids``. The sampling uses the global NumPy RNG, which
            is not seeded here.
        picked_author_ids: explicit list of author ids, or the string ``'all'``
            (expanded to ids 0..49).
        num_sent_per_text: number of consecutive lines joined into one sample.
        save_folder: directory the CSV files are written to.

    Raises:
        AssertionError: if both selectors are given.
        ValueError: if neither selector is given, or ``num_sent_per_text`` is
            missing or smaller than 1.
    """
    assert num_authors_to_pick is None or picked_author_ids is None, "either specify 'num_authors_to_pick' or 'picked_author_ids'"
    # Fail fast, before any file is read: with both selectors unset the random
    # draw below would return a scalar and crash inside sorted() with an
    # unrelated-looking TypeError.
    if not picked_author_ids and num_authors_to_pick is None:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    df_ccat = pd.read_csv('../../data/CCAT50/processed/CCAT50_train.csv')
    df_ccat = df_ccat.reset_index().rename({'index':'doc_id'}, axis=1)
    df_ccat_val = pd.read_csv('../../data/CCAT50/processed/CCAT50_AA_val.csv')
    df_ccat_val = df_ccat_val.reset_index().rename({'index':'doc_id'}, axis=1)

    if not picked_author_ids:
        unique_authors = list(df_ccat['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())
    if picked_author_ids == 'all':
        picked_author_ids = list(range(50))

    for split, df in (('train', df_ccat), ('val', df_ccat_val)):
        doc_ids = []
        authors = []
        texts = []
        for author in picked_author_ids:
            df_author = df[df['author_id'] == author]
            for doc_id, raw_doc in zip(df_author['doc_id'], df_author['text']):
                # One "sentence" per non-empty, stripped line of the document.
                stripped = (line.strip() for line in raw_doc.split('\n'))
                sentences = [line for line in stripped if line]
                # NOTE(review): this yields len - k windows, so the final line of a
                # document never appears in any sample (a full sliding window would
                # use len - k + 1). Left unchanged so regenerated datasets stay
                # comparable — confirm whether dropping the last line is intended.
                for start in range(len(sentences) - num_sent_per_text):
                    authors.append(author)
                    texts.append(' '.join(sentences[start:start + num_sent_per_text]))
                    doc_ids.append(doc_id)
        df_new = pd.DataFrame({'author':authors, 'text':texts, 'doc_id':doc_ids})
        df_new = process_additional_features(df_new)
        str_author = ','.join(map(str, picked_author_ids)) if len(picked_author_ids) < 50 else 'all'
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split}.csv"
        df_new.to_csv(f"{save_folder}/{file_name}", index=False)

In [None]:
# Generate CCAT50 train/val CSVs for three disjoint author pairs,
# with two consecutive lines per sample.
ids = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2

for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/1293 [00:00<?, ?it/s]

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/1283 [00:00<?, ?it/s]

  0%|          | 0/309 [00:00<?, ?it/s]

  0%|          | 0/1175 [00:00<?, ?it/s]

  0%|          | 0/297 [00:00<?, ?it/s]

In [None]:
# Same as the previous cell, but for three disjoint groups of three authors.
ids = [[0,1,2],[3,4,5],[6,7,8]]
num_sent_per_text = 2

for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/1933 [00:00<?, ?it/s]

  0%|          | 0/484 [00:00<?, ?it/s]

  0%|          | 0/1818 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/1843 [00:00<?, ?it/s]

  0%|          | 0/485 [00:00<?, ?it/s]

In [None]:
# Same again, for three disjoint groups of four authors.
ids = [[0,1,2,3],[4,5,6,7],[8,9,10,11]]
num_sent_per_text = 2

for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/2576 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/2457 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/2451 [00:00<?, ?it/s]

  0%|          | 0/577 [00:00<?, ?it/s]

In [None]:
# Generate the CCAT50 CSVs for every author ('all' is expanded to ids 0..49 inside
# create_ccat50_subdataset), once per window size in `nums`.
picked_author_ids = 'all'
nums = [1,2,3]

for num_sent_per_text in nums:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/32937 [00:00<?, ?it/s]

## guardian - topic confusion

In [None]:
class GuardianProcesser:
    """Builds Guardian sub-datasets for the topic-confusion setting.

    On construction, the ``train`` split of four ``guardian_authorship``
    configurations (``cross_topic_1``, ``_4``, ``_7``, ``_10``) is loaded and
    concatenated into ``self.df_all``. ``create_subdataset`` then writes
    train/test/val CSV files into ``self.save_folder``.
    """

    def __init__(self, save_folder='../../data/guardian/processed/'):
        """Load the four configurations and remember where to save results.

        Args:
            save_folder: directory the CSV files are written to.
        """
        t1 = load_dataset('guardian_authorship', 'cross_topic_1')['train'].to_pandas()
        t2 = load_dataset('guardian_authorship', 'cross_topic_4')['train'].to_pandas()
        t3 = load_dataset('guardian_authorship', 'cross_topic_7')['train'].to_pandas()
        t4 = load_dataset('guardian_authorship', 'cross_topic_10')['train'].to_pandas()

        # ignore_index gives every row a unique position; the four frames would
        # otherwise each carry their own overlapping index.
        self.df_all = pd.concat([t1,t2,t3,t4], ignore_index=True)
        # create_subdataset reads a 'doc_id' column. If the loaded frames do not
        # provide one, derive it from the row position (the same way
        # create_ccat50_subdataset derives its doc ids).
        if 'doc_id' not in self.df_all.columns:
            self.df_all['doc_id'] = self.df_all.index
        self.save_folder = save_folder

    def create_subdataset(self, authors = None, topics = [0,1], num_sent_per_text = None):
        """Write train/test/val sentence-window CSVs for ``authors``.

        Split assignment (following the paper "The Topic Confusion Task: A novel
        evaluation scenario for authorship attribution"):

        * train: first half of ``authors`` on ``topics[0]``, second half on ``topics[1]``;
        * test:  the same halves with the two topics swapped;
        * val:   all ``authors`` on the remaining topics among 0..3.

        Each article is split into sentences with ``nlp`` and one sample is
        emitted per window of ``num_sent_per_text`` consecutive non-empty
        sentences. Every split is passed through ``add_dep_edges`` and saved as
        ``topic_<topics>_author_<authors>_sent_<n>_<split>.csv``.

        Args:
            authors: list of author ids to include.
            topics: the two topic ids used for the train/test swap. The default
                list is never mutated here.
            num_sent_per_text: number of consecutive sentences per sample.

        Raises:
            ValueError: if ``authors`` is empty/None or ``num_sent_per_text`` is
                missing or smaller than 1.
        """
        if not authors:
            raise ValueError("'authors' must be a non-empty list of author ids")
        if num_sent_per_text is None or num_sent_per_text < 1:
            raise ValueError("'num_sent_per_text' must be a positive integer")

        half = len(authors)//2
        train_topic2authors = {topics[0]: authors[:half], topics[1]: authors[half:]}
        test_topic2authors = {topics[1]: authors[:half], topics[0]: authors[half:]}
        val_topic2authors = {t:authors for t in range(4) if t not in topics}
        topic2authors = [train_topic2authors, test_topic2authors, val_topic2authors]

        # Document-level frames per split (train, test, val); concatenated once
        # per split below instead of growing a DataFrame inside the loop.
        split_frames = [[], [], []]
        for topic in range(4):
            for author in authors:
                for split in range(3):
                    if (topic in topic2authors[split]) and (author in topic2authors[split][topic]):
                        mask = (self.df_all['author'] == author) & (self.df_all['topic'] == topic)
                        split_frames[split].append(self.df_all[mask])

        split_name = ['train', 'test', 'val']
        for split in range(3):
            if split_frames[split]:
                df = pd.concat(split_frames[split])
            else:
                df = pd.DataFrame(columns=['author', 'doc_id', 'article'])

            # Collect rows as plain records; DataFrame.append was removed in pandas 2.0.
            records = []
            for author, doc_id, article in zip(df['author'], df['doc_id'], df['article']):
                # NOTE(review): `nlp` is not defined in the visible notebook (the spaCy
                # load in the import cell is commented out) — confirm it is available.
                doc = nlp(article)
                sentences = [sent.text.strip() for sent in doc.sents]
                sentences = [sentence for sentence in sentences if sentence]
                # NOTE(review): this yields len - k windows, so the final sentence never
                # appears in any sample (a full sliding window would use len - k + 1).
                # Left unchanged — confirm whether this is intended.
                for i in range(len(sentences)-num_sent_per_text):
                    records.append({'author':author, 'doc_id':doc_id, 'text':' '.join(sentences[i:i+num_sent_per_text])})
            df_new = pd.DataFrame(records, columns=['author', 'doc_id', 'text'])
            df_new['author'] = df_new['author'].astype(int)
            # NOTE(review): `add_dep_edges` is not defined in the visible notebook —
            # presumably an earlier name of process_additional_features; confirm.
            df_new = add_dep_edges(df_new)

            str_topic = ','.join(map(str, topics))
            str_author = ','.join(map(str, authors))
            file_name = f"topic_{str_topic}_author_{str_author}_sent_{num_sent_per_text}_{split_name[split]}.csv"
            df_new.to_csv(f"{self.save_folder}/{file_name}", index=False)

In [None]:
# Instantiate the processor; the constructor loads four guardian_authorship configurations.
processer = GuardianProcesser(save_folder = '../../data/guardian/processed/')

Found cached dataset guardian_authorship (/home/jz17d/.cache/huggingface/datasets/guardian_authorship/cross_topic_1/1.0.0/8c5f5675c8658367fcec31c02ac32be3a671d3eee703862e92b84b6b61e4fb38)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset guardian_authorship (/home/jz17d/.cache/huggingface/datasets/guardian_authorship/cross_topic_4/4.0.0/8c5f5675c8658367fcec31c02ac32be3a671d3eee703862e92b84b6b61e4fb38)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset guardian_authorship (/home/jz17d/.cache/huggingface/datasets/guardian_authorship/cross_topic_7/7.0.0/8c5f5675c8658367fcec31c02ac32be3a671d3eee703862e92b84b6b61e4fb38)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset guardian_authorship (/home/jz17d/.cache/huggingface/datasets/guardian_authorship/cross_topic_10/10.0.0/8c5f5675c8658367fcec31c02ac32be3a671d3eee703862e92b84b6b61e4fb38)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Generate topic-confusion train/test/val CSVs for three disjoint author pairs,
# swapping topics 0 and 1 between train and test, two sentences per sample.
authorss = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2
topics = [0,1]

for authors in authorss:
    processer.create_subdataset(authors=authors, topics=topics, num_sent_per_text=num_sent_per_text)
    

  0%|          | 0/629 [00:00<?, ?it/s]

  0%|          | 0/518 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

  0%|          | 0/527 [00:00<?, ?it/s]

  0%|          | 0/871 [00:00<?, ?it/s]

  0%|          | 0/2042 [00:00<?, ?it/s]

  0%|          | 0/913 [00:00<?, ?it/s]

  0%|          | 0/474 [00:00<?, ?it/s]

  0%|          | 0/1817 [00:00<?, ?it/s]

## guardian

In [None]:
def create_guardian_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/guardian/mix_topic/processed/'):
    """Build mixed-topic Guardian sentence-window sub-datasets.

    Loads the ``cross_topic_1`` configuration of ``guardian_authorship`` three
    times, slicing each of its train/validation/test splits into 0-60 % (train),
    60-80 % (val) and the last 20 % (test). For every selected author, each
    article is split into sentences with ``nlp`` and one sample is emitted per
    window of ``num_sent_per_text`` consecutive non-empty sentences. Each split
    is passed through ``add_dep_edges`` and written to
    ``{save_folder}/author_<ids>_sent_<n>_<split>.csv``.

    Args:
        num_authors_to_pick: number of authors to sample at random (without
            replacement) from the training slice; only used when
            ``picked_author_ids`` is empty/None. Uses the global, unseeded
            NumPy RNG.
        picked_author_ids: explicit list of author ids.
        num_sent_per_text: number of consecutive sentences per sample.
        save_folder: directory the CSV files are written to.

    Raises:
        ValueError: if no author selector is given, or ``num_sent_per_text`` is
            missing or smaller than 1.
    """
    # Validate before the (slow, network-bound) dataset loading.
    if not picked_author_ids and num_authors_to_pick is None:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    train_df = load_dataset('guardian_authorship', name="cross_topic_1", split='train[:60%]+validation[:60%]+test[:60%]').to_pandas()
    val_df = load_dataset('guardian_authorship', name="cross_topic_1", split='train[60%:80%]+validation[60%:80%]+test[60%:80%]').to_pandas()
    test_df = load_dataset('guardian_authorship', name="cross_topic_1", split='train[-20%:]+validation[-20%:]+test[-20%:]').to_pandas()

    if not picked_author_ids:
        # Sample from the training slice, using the same 'author' column the
        # filtering below relies on.
        unique_authors = list(train_df['author'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())

    # Pair every frame with its own name so the 60-80 % slice is saved as 'val'
    # and the last 20 % as 'test'.
    named_splits = [('train', train_df), ('val', val_df), ('test', test_df)]
    for split_name, df in named_splits:
        authors = []
        texts = []
        for author in picked_author_ids:
            df_author = df[df['author'] == author]
            for article in df_author['article']:
                # NOTE(review): `nlp` is not defined in the visible notebook (the spaCy
                # load in the import cell is commented out) — confirm it is available.
                doc = nlp(article)
                sentences = [sent.text.strip() for sent in doc.sents]
                sentences = [sentence for sentence in sentences if sentence]
                # NOTE(review): this yields len - k windows, so the final sentence never
                # appears in any sample (a full sliding window would use len - k + 1).
                # Left unchanged — confirm whether this is intended.
                for i in range(len(sentences)-num_sent_per_text):
                    authors.append(author)
                    texts.append(' '.join(sentences[i:i+num_sent_per_text]))
        df_new = pd.DataFrame({'author':authors, 'text':texts})
        # NOTE(review): `add_dep_edges` is not defined in the visible notebook —
        # presumably an earlier name of process_additional_features; confirm.
        df_new = add_dep_edges(df_new)
        str_author = ','.join(map(str, picked_author_ids))
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split_name}.csv"
        df_new.to_csv(f"{save_folder}/{file_name}", index=False)

In [None]:
# Ad-hoc check that the 'cross_topic_1' configuration loads.
# NOTE(review): the output recorded for this cell is a TypeError, i.e. the call failed when last run.
load_dataset('guardian_authorship', 'cross_topic_1')

TypeError: can only concatenate str (not "int") to str

In [None]:
# Ad-hoc check of the combined 60 % slice used as training data by create_guardian_subdataset.
# NOTE(review): the output recorded for this cell is a TypeError, i.e. the call failed when last run.
load_dataset('guardian_authorship', name="cross_topic_1", split='train[:60%]+validation[:60%]+test[:60%]')

TypeError: can only concatenate str (not "int") to str

In [None]:
# Generate mixed-topic Guardian CSVs for three disjoint author pairs, two sentences per sample.
# NOTE(review): the output recorded for this cell is a TypeError, i.e. the run did not complete.
ids = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2
save_folder = '../../data/guardian/mix_topic/processed/'

for picked_author_ids in ids:
    create_guardian_subdataset(picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text, save_folder=save_folder)
    

Downloading readme:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

TypeError: can only concatenate str (not "int") to str

## imdb

In [None]:
def create_imdb_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/imdb/processed/'):
    """Build sentence-window IMDB62 sub-datasets for a set of authors.

    Reads the IMDB62 train and validation CSVs, keeps the texts of the selected
    authors, splits each text into sentences with ``nlp`` and emits one sample
    per window of ``num_sent_per_text`` consecutive non-empty sentences. Each
    split is passed through ``add_dep_edges`` and written to
    ``{save_folder}/author_<ids>_sent_<n>_<split>.csv``.

    Args:
        num_authors_to_pick: number of authors to sample at random (without
            replacement) from the training set. Mutually exclusive with
            ``picked_author_ids``. Uses the global, unseeded NumPy RNG.
        picked_author_ids: explicit list of author ids.
        num_sent_per_text: number of consecutive sentences per sample.
        save_folder: directory the CSV files are written to.

    Raises:
        AssertionError: if both selectors are given.
        ValueError: if neither selector is given, or ``num_sent_per_text`` is
            missing or smaller than 1.
    """
    assert num_authors_to_pick is None or picked_author_ids is None, "either specify 'num_authors_to_pick' or 'picked_author_ids'"
    # Fail fast, before any file is read: with both selectors unset the random
    # draw below would return a scalar and crash inside sorted() with an
    # unrelated-looking TypeError.
    if not picked_author_ids and num_authors_to_pick is None:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    df = pd.read_csv('../../data/imdb/processed/imdb62_train.csv')
    df_val = pd.read_csv('../../data/imdb/processed/imdb62_AA_val.csv')

    if not picked_author_ids:
        unique_authors = list(df['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())

    for split, df_split in (('train', df), ('val', df_val)):
        authors = []
        texts = []
        for author in picked_author_ids:
            df_author = df_split[df_split['author_id'] == author]
            for raw_text in df_author['text']:
                # NOTE(review): `nlp` is not defined in the visible notebook (the spaCy
                # load in the import cell is commented out) — confirm it is available.
                doc = nlp(raw_text)
                sentences = [sent.text.strip() for sent in doc.sents]
                sentences = [sentence for sentence in sentences if sentence]
                # NOTE(review): this yields len - k windows, so the final sentence never
                # appears in any sample (a full sliding window would use len - k + 1).
                # Left unchanged so regenerated datasets stay comparable — confirm intent.
                for i in range(len(sentences)-num_sent_per_text):
                    authors.append(author)
                    texts.append(' '.join(sentences[i:i+num_sent_per_text]))
        df_new = pd.DataFrame({'author':authors, 'text':texts})
        # NOTE(review): `add_dep_edges` is not defined in the visible notebook —
        # presumably an earlier name of process_additional_features; confirm.
        df_new = add_dep_edges(df_new)
        str_author = ','.join(map(str, picked_author_ids))
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split}.csv"
        df_new.to_csv(f"{save_folder}/{file_name}", index=False)

In [None]:
# Generate IMDB62 train/val CSVs for three disjoint author pairs, two sentences per sample.
ids = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2
save_folder = '../../data/imdb/processed/'

for picked_author_ids in ids:
    create_imdb_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text, save_folder=save_folder)


  0%|          | 0/17100 [00:00<?, ?it/s]

  0%|          | 0/5846 [00:00<?, ?it/s]

  0%|          | 0/20749 [00:00<?, ?it/s]

  0%|          | 0/6894 [00:00<?, ?it/s]