In [None]:
from nltk.parse.corenlp import CoreNLPParser,CoreNLPDependencyParser
import numpy as np
import random
import time
import pandas as pd
from tqdm.auto import tqdm, trange

In [None]:
# Shared dependency-parser client used by the cells below; it talks to a CoreNLP
# server at http://localhost:9000 (presumably the server must already be running
# before any text is parsed -- confirm for your setup).
depparser = CoreNLPDependencyParser(url='http://localhost:9000')

In [None]:
# Context-free lookup table from Penn Treebank POS tags (XPOS) to Universal POS tags (UPOS).
# The conversion comes from https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
# Caveat: an exact XPOS -> UPOS conversion is impossible without knowing the context,
# so this table is only an approximation.
# Manning's comment on this: https://github.com/UniversalDependencies/docs/issues/212#issuecomment-148846154
# How to use Manning's converter: https://github.com/clulab/processors/wiki/Converting-from-Penn-Treebank-to-Basic-Stanford-Dependencies
xpos2upos = {'#': 'SYM',
             '$': 'SYM',
             "''": 'PUNCT',
             ',': 'PUNCT',
             '-LRB-': 'PUNCT',
             '-RRB-': 'PUNCT',
             '.': 'PUNCT',
             ':': 'PUNCT',
             'AFX': 'ADJ',
             'CC': 'CCONJ',
             'CD': 'NUM',
             'DT': 'DET',
             'EX': 'PRON',
             'FW': 'X',
             'HYPH': 'PUNCT',
             'IN': 'ADP',
             'JJ': 'ADJ',
             'JJR': 'ADJ',
             'JJS': 'ADJ',
             'LS': 'X',
             'MD': 'VERB',
             'NFP': 'PUNCT', # manually added.
             'NIL': 'X',
             'NN': 'NOUN',
             'NNP': 'PROPN',
             'NNPS': 'PROPN',
             'NNS': 'NOUN',
             'PDT': 'DET',
             'POS': 'PART',
             'PRP': 'PRON',
             'PRP$': 'DET',
             'RB': 'ADV',
             'RBR': 'ADV',
             'RBS': 'ADV',
             'RP': 'ADP',
             'SYM': 'SYM',
             'TO': 'PART',
             'UH': 'INTJ',
             'VB': 'VERB',
             'VBD': 'VERB',
             'VBG': 'VERB',
             'VBN': 'VERB',
             'VBP': 'VERB',
             'VBZ': 'VERB',
             'WDT': 'DET',
             'WP': 'PRON',
             'WP$': 'DET',
             'WRB': 'ADV',
             '``': 'PUNCT'}
def convert_pos_seq(pos_seq):
    """Translate a sequence of XPOS tags into the matching UPOS tags.

    Each tag is looked up in the module-level ``xpos2upos`` table.

    Args:
        pos_seq: iterable of XPOS tag strings.

    Returns:
        A new list with one UPOS tag per input tag, in the same order.

    Raises:
        KeyError: if a tag has no entry in ``xpos2upos``.
    """
    return [xpos2upos[xpos_tag] for xpos_tag in pos_seq]

In [None]:
def add_dep_edges(df):
    """Dependency-parse every entry of ``df['text']`` and store the results on ``df``.

    Each text is sent to the module-level ``depparser``; the first parse it
    yields is rendered with ``to_conll(4)`` and read row by row.

    Four columns are added to ``df`` in place (one list per text):
        * ``homo_edges``   -- ``[row_number, int(third CoNLL field)]`` pairs,
          with rows numbered from 1 (presumably token index -> head index).
        * ``hetoro_edges`` -- the fourth CoNLL field of every row
          (presumably the dependency relation label).
        * ``pos_seqs``     -- the second CoNLL field of every row (the POS tag).
        * ``upos_seqs``    -- ``pos_seqs`` passed through ``convert_pos_seq``.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The same DataFrame object, now carrying the four extra columns.
    """
    new_columns = {
        'homo_edges': [],
        'hetoro_edges': [],
        'pos_seqs': [],
        'upos_seqs': [],
    }

    for sentence in tqdm(df['text']):
        graph = next(depparser.raw_parse(sentence))
        # The CoNLL dump ends with a newline, so the last split item is empty and is dropped.
        conll_rows = graph.to_conll(4).split('\n')[:-1]

        arcs, labels, xpos_tags = [], [], []
        for position, row in enumerate(conll_rows, start=1):
            fields = row.split('\t')
            arcs.append([position, int(fields[2])])
            labels.append(fields[3])
            xpos_tags.append(fields[1])

        new_columns['homo_edges'].append(arcs)
        new_columns['hetoro_edges'].append(labels)
        new_columns['pos_seqs'].append(xpos_tags)
        new_columns['upos_seqs'].append(convert_pos_seq(xpos_tags))

    # Attach the columns only after every text has been parsed successfully.
    for column_name, values in new_columns.items():
        df[column_name] = values

    return df

In [None]:
def _sentence_windows(text, num_sent_per_text):
    """Return every run of ``num_sent_per_text`` consecutive non-empty lines of ``text``, space-joined."""
    # Strip each line and drop ALL blank ones (a single list.remove('') would
    # raise when there is no blank line and leave the others in place otherwise).
    sentences = [line.strip() for line in text.split('\n')]
    sentences = [sentence for sentence in sentences if sentence]
    # n sentences hold n - k + 1 windows of size k; the "+ 1" keeps the last one.
    # Documents shorter than the window size yield no windows.
    return [
        ' '.join(sentences[start:start + num_sent_per_text])
        for start in range(len(sentences) - num_sent_per_text + 1)
    ]


def create_ccat50_subdataset(num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = '../../data/CCAT50/processed/'):
    """Build and save train/val CCAT50 subsets restricted to a few authors.

    For each split, every document of every picked author is cut into
    overlapping windows of ``num_sent_per_text`` consecutive non-empty lines.
    The windows are passed through ``add_dep_edges`` and written to
    ``{save_folder}/author_{ids}_sent_{num_sent_per_text}_{split}.csv``.

    The input CSVs are read from fixed paths under ``../../data/CCAT50/processed/``
    and must provide ``author_id`` and ``text`` columns.

    Args:
        num_authors_to_pick: number of authors to sample at random (without
            replacement) from the training split. Mutually exclusive with
            ``picked_author_ids``.
        picked_author_ids: explicit list of author ids to keep.
        num_sent_per_text: window size, i.e. how many consecutive lines are
            joined into one text. Required, must be >= 1.
        save_folder: directory that receives the two output CSV files.

    Raises:
        AssertionError: if both ``num_authors_to_pick`` and
            ``picked_author_ids`` are given.
        ValueError: if neither of them is given, or if ``num_sent_per_text``
            is missing or smaller than 1.
    """
    assert num_authors_to_pick is None or picked_author_ids is None, "either specify 'num_authors_to_pick' or 'picked_author_ids'"
    # Fail fast with a clear message instead of an obscure TypeError later on.
    if num_authors_to_pick is None and not picked_author_ids:
        raise ValueError("specify one of 'num_authors_to_pick' or 'picked_author_ids'")
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("'num_sent_per_text' must be a positive integer")

    df_ccat = pd.read_csv('../../data/CCAT50/processed/CCAT50_train.csv')
    df_ccat_val = pd.read_csv('../../data/CCAT50/processed/CCAT50_AA_val.csv')

    if not picked_author_ids:
        # NOTE: uses numpy's global random state, so the pick is only
        # reproducible if the caller seeds it beforehand.
        unique_authors = list(df_ccat['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())

    str_author = ','.join(map(str, picked_author_ids))

    for split, df in (('train', df_ccat), ('val', df_ccat_val)):
        authors = []
        texts = []
        for author in picked_author_ids:
            for document in df.loc[df['author_id'] == author, 'text']:
                windows = _sentence_windows(document, num_sent_per_text)
                authors.extend([author] * len(windows))
                texts.extend(windows)
        df_new = pd.DataFrame({'author':authors, 'text':texts})
        df_new = add_dep_edges(df_new)
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{split}.csv"
        df_new.to_csv(f"{save_folder}/{file_name}", index=False)

In [None]:
# Build the subsets for three disjoint groups of two authors each,
# joining 2 consecutive sentences into every text.
ids = [[0,1],[2,3],[4,5]]
num_sent_per_text = 2

# One call per author group; each call writes a train and a val CSV.
for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/1293 [00:00<?, ?it/s]

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/1283 [00:00<?, ?it/s]

  0%|          | 0/309 [00:00<?, ?it/s]

  0%|          | 0/1175 [00:00<?, ?it/s]

  0%|          | 0/297 [00:00<?, ?it/s]

In [None]:
# Build the subsets for three disjoint groups of three authors each,
# joining 2 consecutive sentences into every text.
ids = [[0,1,2],[3,4,5],[6,7,8]]
num_sent_per_text = 2

# One call per author group; each call writes a train and a val CSV.
for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/1818 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/1843 [00:00<?, ?it/s]

  0%|          | 0/485 [00:00<?, ?it/s]

In [None]:
# Build the subsets for three disjoint groups of four authors each,
# joining 2 consecutive sentences into every text.
ids = [[0,1,2,3],[4,5,6,7],[8,9,10,11]]
num_sent_per_text = 2

# One call per author group; each call writes a train and a val CSV.
for picked_author_ids in ids:
    create_ccat50_subdataset(num_authors_to_pick=None, picked_author_ids=picked_author_ids, num_sent_per_text=num_sent_per_text)


  0%|          | 0/2576 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/2457 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/2451 [00:00<?, ?it/s]

  0%|          | 0/577 [00:00<?, ?it/s]