In [None]:
from nltk.parse.corenlp import CoreNLPParser,CoreNLPDependencyParser
import numpy as np
import random
import time
import pandas as pd
from tqdm.auto import tqdm, trange
import textstat
import spacy
from datasets import load_dataset
nlp = spacy.load('en_core_web_sm') # or whatever model you have installed


In [None]:
# Shared dependency-parser client used by `add_dep_edges`.
# It is configured for a CoreNLP server at http://localhost:9000, so that server
# presumably has to be running before any parsing cell is executed.
depparser = CoreNLPDependencyParser(url='http://localhost:9000')

In [None]:
# Lookup table from fine-grained (XPOS) tags to universal (UPOS) tags.
# The conversion comes from https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
# and another resource: http://universaldependencies.org/docs/en/pos/all.html
# However, an exact conversion to UPOS is impossible without knowing the context,
# so this context-free table is only an approximation.
# Manning's comment on this: https://github.com/UniversalDependencies/docs/issues/212#issuecomment-148846154
# How to use Manning's converter: https://github.com/clulab/processors/wiki/Converting-from-Penn-Treebank-to-Basic-Stanford-Dependencies
# Entries marked "manually added" are not taken from the table linked above.

xpos2upos = {'#': 'SYM',
             '$': 'SYM',
             "''": 'PUNCT',
             ',': 'PUNCT',
             '-LRB-': 'PUNCT',
             '-RRB-': 'PUNCT',
             '.': 'PUNCT',
             ':': 'PUNCT',
             'ADD': 'X', # manually added.
             'AFX': 'ADJ',
             'CC': 'CCONJ',
             'CD': 'NUM',
             'DT': 'DET',
             'EX': 'PRON',
             'FW': 'X',
             'GW': 'X', # manually added.
             'HYPH': 'PUNCT',
             'IN': 'ADP',
             'JJ': 'ADJ',
             'JJR': 'ADJ',
             'JJS': 'ADJ',
             'LS': 'X',
             'MD': 'VERB',
             'NFP': 'PUNCT', # manually added.
             'NIL': 'X',
             'NN': 'NOUN',
             'NNP': 'PROPN',
             'NNPS': 'PROPN',
             'NNS': 'NOUN',
             'PDT': 'DET',
             'POS': 'PART',
             'PRP': 'PRON',
             'PRP$': 'DET',
             'RB': 'ADV',
             'RBR': 'ADV',
             'RBS': 'ADV',
             'RP': 'ADP',
             'SYM': 'SYM',
             'TO': 'PART',
             'UH': 'INTJ',
             'VB': 'VERB',
             'VBD': 'VERB',
             'VBG': 'VERB',
             'VBN': 'VERB',
             'VBP': 'VERB',
             'VBZ': 'VERB',
             'WDT': 'DET',
             'WP': 'PRON',
             'WP$': 'DET',
             'WRB': 'ADV',
             'XX': 'X', # manually added.
             '``': 'PUNCT'}
def convert_pos_seq(pos_seq):
    """Translate a sequence of XPOS tags into the matching UPOS tags.

    Each tag is looked up in the module-level ``xpos2upos`` table.

    Args:
        pos_seq: iterable of XPOS tag strings.

    Returns:
        A new list with one UPOS tag per input tag, in the same order.

    Raises:
        KeyError: if a tag has no entry in ``xpos2upos``.
    """
    return [xpos2upos[xpos_tag] for xpos_tag in pos_seq]

In [None]:
def add_dep_edges(df):
    """Annotate every row of ``df`` with dependency-parse derived features.

    Each value of ``df['text']`` is sent to the module-level ``depparser``; the
    first parse it yields is rendered as 4-column CoNLL text, whose tab-separated
    fields are used here as (word, POS tag, head index, relation label).

    Columns written to ``df`` (one list per row, one item per token):
        homo_edges    -- ``[token_position, head_index]`` pairs, positions 1-based
        hetoro_edges  -- relation label of each token
        pos_seqs      -- POS tag of each token as emitted by the parser
        upos_seqs     -- ``pos_seqs`` mapped through ``convert_pos_seq``
        num_syllables -- ``textstat.syllable_count`` of each token

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the five columns added.
    """
    column_order = ('homo_edges', 'hetoro_edges', 'pos_seqs', 'upos_seqs', 'num_syllables')
    collected = {column: [] for column in column_order}

    for text in tqdm(df['text']):
        graph = next(depparser.raw_parse(text))
        # The CoNLL dump ends with a newline, so the final split item is an
        # empty string and is discarded.
        token_rows = [row.split('\t') for row in graph.to_conll(4).split('\n')[:-1]]

        xpos_tags = [fields[1] for fields in token_rows]
        collected['homo_edges'].append(
            [[position, int(fields[2])] for position, fields in enumerate(token_rows, start=1)]
        )
        collected['hetoro_edges'].append([fields[3] for fields in token_rows])
        collected['pos_seqs'].append(xpos_tags)
        collected['upos_seqs'].append(convert_pos_seq(xpos_tags))
        collected['num_syllables'].append(
            [textstat.syllable_count(fields[0]) for fields in token_rows]
        )

    for column in column_order:
        df[column] = collected[column]

    return df

## ccat

In [None]:
def create_ccat50_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/CCAT50/processed/'):
    """Build sliding-window author sub-datasets from CCAT50 and save them as CSV.

    For the ``train`` and ``val`` splits, every document of the selected authors
    is split on newlines into stripped, non-empty sentences. Each run of
    ``num_sent_per_text`` consecutive sentences becomes one sample (joined with a
    space). The samples are annotated by ``add_dep_edges`` and written to
    ``<save_folder>/author_<ids>_sent_<num_sent_per_text>_<split>.csv``.

    Args:
        num_authors_to_pick: number of authors to draw at random (without
            replacement) from the training split. Mutually exclusive with
            ``picked_author_ids``.
        picked_author_ids: explicit, non-empty collection of author ids.
        num_sent_per_text: number of consecutive sentences per sample (>= 1).
            Required despite the ``None`` default.
        save_folder: output directory; created if it does not exist.

    Raises:
        AssertionError: if both ``num_authors_to_pick`` and
            ``picked_author_ids`` are given.
        ValueError: if neither selector is given, if ``picked_author_ids`` is
            empty, or if ``num_sent_per_text`` is missing or smaller than 1.
    """
    import os  # local import: keeps this cell independent of the imports cell

    assert num_authors_to_pick is None or picked_author_ids is None, "either specify 'num_authors_to_pick' or 'picked_author_ids'"
    # Validate before any file is read or any text is parsed, so a bad call
    # fails immediately with a clear message.
    if picked_author_ids is None and num_authors_to_pick is None:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if picked_author_ids is not None and len(picked_author_ids) == 0:
        raise ValueError("'picked_author_ids' must not be empty")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    df_ccat = pd.read_csv('../../data/CCAT50/processed/CCAT50_train.csv')
    df_ccat_val = pd.read_csv('../../data/CCAT50/processed/CCAT50_AA_val.csv')

    if picked_author_ids is None:
        unique_authors = list(df_ccat['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())

    # Make sure the destination exists before the slow parsing step, so the
    # work is not lost to a failing write at the very end.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
    str_author = ','.join(map(str, picked_author_ids))

    for split, df in (('train', df_ccat), ('val', df_ccat_val)):
        authors = []
        texts = []
        for author in picked_author_ids:
            for raw_doc in df.loc[df['author_id'] == author, 'text']:
                sentences = [line.strip() for line in raw_doc.split('\n')]
                sentences = [sentence for sentence in sentences if sentence]
                # "+ 1" keeps the final window: n sentences contain
                # n - k + 1 windows of length k (none when n < k).
                for start in range(len(sentences) - num_sent_per_text + 1):
                    authors.append(author)
                    texts.append(' '.join(sentences[start:start + num_sent_per_text]))
        df_new = pd.DataFrame({'author':authors, 'text':texts})
        df_new = add_dep_edges(df_new)
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split}.csv"
        df_new.to_csv(os.path.join(save_folder, file_name), index=False)

In [None]:
# Generate CCAT50 sub-datasets for three disjoint author pairs, with samples
# made of 2 consecutive sentences. Each call writes one train and one val CSV.
ids = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2

for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/1293 [00:00<?, ?it/s]

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/1283 [00:00<?, ?it/s]

  0%|          | 0/309 [00:00<?, ?it/s]

  0%|          | 0/1175 [00:00<?, ?it/s]

  0%|          | 0/297 [00:00<?, ?it/s]

In [None]:
# Same generation as for the author pairs, but for three disjoint groups of
# 3 authors each (still 2 consecutive sentences per sample).
ids = [[0,1,2],[3,4,5],[6,7,8]]
num_sent_per_text = 2

for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/1933 [00:00<?, ?it/s]

  0%|          | 0/484 [00:00<?, ?it/s]

  0%|          | 0/1818 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/1843 [00:00<?, ?it/s]

  0%|          | 0/485 [00:00<?, ?it/s]

In [None]:
# Same generation again, for three disjoint groups of 4 authors each
# (2 consecutive sentences per sample).
ids = [[0,1,2,3],[4,5,6,7],[8,9,10,11]]
num_sent_per_text = 2

for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/2576 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/2457 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/2451 [00:00<?, ?it/s]

  0%|          | 0/577 [00:00<?, ?it/s]

## guardian

In [None]:
# Load the locally stored Guardian articles CSV and display it.
# NOTE(review): the displayed columns contain no author field, and this frame
# is not used by any later cell shown here — confirm it is still needed.
df = pd.read_csv('../../data/guardian/guardian_articles.csv')
df

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id
0,us-news/2016/jan/31/iowa-caucus-underdog-candi...,US news,Iowa underdogs put on brave faces despite all ...,https://www.theguardian.com/us-news/2016/jan/3...,As polling day looms and the cameras turn only...,2016-01-31T23:53:37Z,1
1,us-news/2016/jan/31/iowa-caucus-worlds-most-pa...,US news,Iowa caucus: hologram eagle and Jesus star on ...,https://www.theguardian.com/us-news/2016/jan/3...,"In Des Moines on Sunday, the Guardian was give...",2016-01-31T23:46:28Z,2
2,world/2016/jan/31/tanzania-britsh-helicopter-p...,World news,British pilot in Tanzania 'manoeuvred ​to save...,https://www.theguardian.com/world/2016/jan/31/...,A British pilot who was shot dead by an elepha...,2016-01-31T23:43:48Z,3
3,football/2016/jan/31/late-winner-gets-usa-off-...,Football,USA 3-2 Iceland | International friendly match...,https://www.theguardian.com/football/2016/jan/...,USA took a step toward shaking off the ghosts ...,2016-01-31T23:30:49Z,4
4,football/2016/jan/31/blackburn-paul-lambert-ox...,Football,Reinvigorated Paul Lambert reflects after impr...,https://www.theguardian.com/football/2016/jan/...,"The clean-shaven, spectacle free and suspiciou...",2016-01-31T22:30:10Z,5
...,...,...,...,...,...,...,...
149834,world/2022/jun/21/marble-head-of-hercules-pull...,World news,Marble head of Hercules pulled up from Roman s...,https://www.theguardian.com/world/2022/jun/21/...,"For archaeologists, it’s the underwater find t...",2022-06-21T17:31:32Z,149835
149835,music/2022/jun/22/i-got-sick-of-talking-about-...,Music,‘I got sick of talking about myself’: Spacey J...,https://www.theguardian.com/music/2022/jun/22/...,"From under a mop of curls, Caleb Harper – Spac...",2022-06-21T17:30:09Z,149836
149836,australia-news/2022/jun/22/the-small-town-with...,Australia news,The small town with a big potato that inspired...,https://www.theguardian.com/australia-news/202...,"Robertson is a small, pretty town perched on t...",2022-06-21T17:30:09Z,149837
149837,australia-news/2022/jun/22/power-to-ban-citize...,Australia news,Power to ban citizens from re-entering Austral...,https://www.theguardian.com/australia-news/202...,A high court decision striking down the home a...,2022-06-21T17:30:08Z,149838


In [None]:
# Load the 'cross_topic_1' configuration of the 'guardian_authorship' dataset.
dataset = load_dataset('guardian_authorship', 'cross_topic_1')

Found cached dataset guardian_authorship (/scratch/data_jz17d/hf_datasets_cache/guardian_authorship/cross_topic_1/1.0.0/8c5f5675c8658367fcec31c02ac32be3a671d3eee703862e92b84b6b61e4fb38)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Distinct author labels that occur in the training split.
set(dataset['train']['author'])

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

In [None]:
def create_guardian_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/CCAT50/processed/'):
    """Build sliding-window author sub-datasets and save them as CSV.

    NOTE(review): despite its name, this function still reads the CCAT50 CSV
    files and its default ``save_folder`` is the CCAT50 output folder. With
    default arguments it writes the same file names as
    ``create_ccat50_subdataset``. The input paths (and the default output
    folder) must be pointed at the guardian data before this is used; they are
    left unchanged here because the intended source is not visible in this file.

    For the ``train`` and ``val`` splits, every document of the selected authors
    is split on newlines into stripped, non-empty sentences. Each run of
    ``num_sent_per_text`` consecutive sentences becomes one sample (joined with a
    space). The samples are annotated by ``add_dep_edges`` and written to
    ``<save_folder>/author_<ids>_sent_<num_sent_per_text>_<split>.csv``.

    Args:
        num_authors_to_pick: number of authors to draw at random (without
            replacement) from the training split. Mutually exclusive with
            ``picked_author_ids``.
        picked_author_ids: explicit, non-empty collection of author ids.
        num_sent_per_text: number of consecutive sentences per sample (>= 1).
            Required despite the ``None`` default.
        save_folder: output directory; created if it does not exist.

    Raises:
        AssertionError: if both ``num_authors_to_pick`` and
            ``picked_author_ids`` are given.
        ValueError: if neither selector is given, if ``picked_author_ids`` is
            empty, or if ``num_sent_per_text`` is missing or smaller than 1.
    """
    import os  # local import: keeps this cell independent of the imports cell

    assert num_authors_to_pick is None or picked_author_ids is None, "either specify 'num_authors_to_pick' or 'picked_author_ids'"
    # Validate before any file is read or any text is parsed, so a bad call
    # fails immediately with a clear message.
    if picked_author_ids is None and num_authors_to_pick is None:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if picked_author_ids is not None and len(picked_author_ids) == 0:
        raise ValueError("'picked_author_ids' must not be empty")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    # NOTE(review): CCAT50 inputs — see the docstring.
    df = pd.read_csv('../../data/CCAT50/processed/CCAT50_train.csv')
    df_val = pd.read_csv('../../data/CCAT50/processed/CCAT50_AA_val.csv')

    if picked_author_ids is None:
        unique_authors = list(df['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())

    # Make sure the destination exists before the slow parsing step, so the
    # work is not lost to a failing write at the very end.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
    str_author = ','.join(map(str, picked_author_ids))

    for split, df_ in (('train', df), ('val', df_val)):
        authors = []
        texts = []
        for author in picked_author_ids:
            for raw_doc in df_.loc[df_['author_id'] == author, 'text']:
                sentences = [line.strip() for line in raw_doc.split('\n')]
                sentences = [sentence for sentence in sentences if sentence]
                # "+ 1" keeps the final window: n sentences contain
                # n - k + 1 windows of length k (none when n < k).
                for start in range(len(sentences) - num_sent_per_text + 1):
                    authors.append(author)
                    texts.append(' '.join(sentences[start:start + num_sent_per_text]))
        df_new = pd.DataFrame({'author':authors, 'text':texts})
        df_new = add_dep_edges(df_new)
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split}.csv"
        df_new.to_csv(os.path.join(save_folder, file_name), index=False)

In [None]:
# Quick check of spaCy sentence segmentation on a short sample that ends with a
# parenthesised currency note.
# NOTE(review): this import is redundant — spacy is already imported in the first cell.
import spacy
text = 'How about two sentences? This is the second sentence. ($1=.6161 Pound)'

# `nlp` is the pipeline loaded in the first cell.
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]


In [None]:
# Show the sentences produced by the segmentation check.
sentences

['How about two sentences?',
 'This is the second sentence.',
 '($1=.6161 Pound)']

## imdb

In [None]:
def create_imdb_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/imdb/processed/'):
    """Build sliding-window author sub-datasets from imdb62 and save them as CSV.

    For the ``train`` and ``val`` splits, every document of the selected authors
    is segmented into sentences with the module-level spaCy pipeline ``nlp``
    (sentences are stripped, empty ones dropped). Each run of
    ``num_sent_per_text`` consecutive sentences becomes one sample (joined with a
    space). The samples are annotated by ``add_dep_edges`` and written to
    ``<save_folder>/author_<ids>_sent_<num_sent_per_text>_<split>.csv``.

    Args:
        num_authors_to_pick: number of authors to draw at random (without
            replacement) from the training split. Mutually exclusive with
            ``picked_author_ids``.
        picked_author_ids: explicit, non-empty collection of author ids.
        num_sent_per_text: number of consecutive sentences per sample (>= 1).
            Required despite the ``None`` default.
        save_folder: output directory; created if it does not exist.

    Raises:
        AssertionError: if both ``num_authors_to_pick`` and
            ``picked_author_ids`` are given.
        ValueError: if neither selector is given, if ``picked_author_ids`` is
            empty, or if ``num_sent_per_text`` is missing or smaller than 1.
    """
    import os  # local import: keeps this cell independent of the imports cell

    assert num_authors_to_pick is None or picked_author_ids is None, "either specify 'num_authors_to_pick' or 'picked_author_ids'"
    # Validate before any file is read or any text is parsed, so a bad call
    # fails immediately with a clear message.
    if picked_author_ids is None and num_authors_to_pick is None:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if picked_author_ids is not None and len(picked_author_ids) == 0:
        raise ValueError("'picked_author_ids' must not be empty")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    df = pd.read_csv('../../data/imdb/processed/imdb62_train.csv')
    df_val = pd.read_csv('../../data/imdb/processed/imdb62_AA_val.csv')

    if picked_author_ids is None:
        unique_authors = list(df['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())

    # Make sure the destination exists before the slow parsing step, so the
    # work is not lost to a failing write at the very end.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
    str_author = ','.join(map(str, picked_author_ids))

    for split, df_ in (('train', df), ('val', df_val)):
        authors = []
        texts = []
        for author in picked_author_ids:
            for raw_doc in df_.loc[df_['author_id'] == author, 'text']:
                sentences = [sent.text.strip() for sent in nlp(raw_doc).sents]
                sentences = [sentence for sentence in sentences if sentence]
                # "+ 1" keeps the final window: n sentences contain
                # n - k + 1 windows of length k (none when n < k).
                for start in range(len(sentences) - num_sent_per_text + 1):
                    authors.append(author)
                    texts.append(' '.join(sentences[start:start + num_sent_per_text]))
        df_new = pd.DataFrame({'author':authors, 'text':texts})
        df_new = add_dep_edges(df_new)
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split}.csv"
        df_new.to_csv(os.path.join(save_folder, file_name), index=False)

In [None]:
# Generate imdb sub-datasets for three disjoint author pairs, with samples made
# of 2 consecutive sentences. Each call writes one train and one val CSV into
# `save_folder`.
ids = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2
save_folder = '../../data/imdb/processed/'

for picked_author_ids in ids:
    create_imdb_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text, save_folder=save_folder)


  0%|          | 0/17100 [00:00<?, ?it/s]

  0%|          | 0/5846 [00:00<?, ?it/s]

  0%|          | 0/20749 [00:00<?, ?it/s]

  0%|          | 0/6894 [00:00<?, ?it/s]