In [1]:
import pandas as pd
import numpy as np
from typing import List

In [34]:
# Load the document-vector CSV; rows are keyed by the "documentID" column.
docs = pd.read_csv(
    "data/pan/document_vectors.csv",
    index_col="documentID",
)

def get_author_ids(doc_df:pd.DataFrame) -> np.ndarray:
    """Return the distinct values found in the ``authorIDs`` column of ``doc_df``.

    The values are returned in order of first appearance (``Series.unique``
    semantics); no sorting is applied.
    """
    author_column = doc_df.authorIDs
    return author_column.unique()

def create_author_vector(author_id:str, doc_df:pd.DataFrame) -> pd.Series:
    """Average the numeric columns of every document written by ``author_id``.

    Rows are selected where the ``authorIDs`` column equals ``author_id``.
    Non-numeric columns are skipped. The result is a Series indexed by the
    numeric column labels; if no row matches, every entry is NaN.
    """
    written_by_author = doc_df['authorIDs'] == author_id
    return doc_df.loc[written_by_author].mean(axis=0, numeric_only=True)

def create_author_vector_df(doc_df:pd.DataFrame) -> pd.DataFrame:
    """Creates author vectors by averaging each author's documents into one.

    Parameters
    ----------
    doc_df:
        One row per document, with an ``authorIDs`` column identifying the
        author and any number of feature columns.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``authorIDs`` value (in order of first
        appearance, unnamed index) and one column per numeric column of
        ``doc_df``, holding the mean over that author's documents.
        Non-numeric columns are dropped. Documents whose ``authorIDs`` is
        missing are not assigned to any author. An empty ``doc_df`` yields an
        empty frame that still carries the numeric columns.
    """
    # A single grouped pass replaces re-filtering the whole frame once per
    # author; sort=False keeps authors in the order they first appear.
    av_df = doc_df.groupby("authorIDs", sort=False).mean(numeric_only=True)
    # groupby labels the index "authorIDs"; clear it so the result keeps the
    # plain, unnamed author index callers have been receiving.
    av_df.index.name = None
    return av_df


# Collapse the per-document rows into one averaged row per author.
authors = create_author_vector_df(doc_df=docs)




In [9]:
# Feature families used to split a wide feature table; the order here fixes
# the position of each family's frame in the list create_feature_dfs returns.
HIGH_LEVEL_FEATURES = [
    "pos_unigrams",
    "pos_bigrams",
    "letters",
    "emojis",
    "mixed_bigrams",
    "morph_tags",
    "dep_labels",
    "punctuation",
    "func_words",
]

def create_feature_dfs(df:pd.DataFrame) -> List[pd.DataFrame]:
    """Split ``df`` column-wise into one frame per entry of HIGH_LEVEL_FEATURES.

    The i-th returned frame holds the columns of ``df`` whose label matches
    the i-th family name; a family with no matching column gives a frame with
    zero columns.
    """
    # NOTE(review): ``filter(regex=...)`` matches anywhere in the label, so a
    # column whose label merely contains another family's name would land in
    # both frames -- confirm against the actual column naming scheme.
    family_frames = []
    for family_name in HIGH_LEVEL_FEATURES:
        family_frames.append(df.filter(regex=family_name))
    return family_frames




# One frame per HIGH_LEVEL_FEATURES entry, in that list's order.
dfs = create_feature_dfs(df=authors)

In [23]:
# Slot 4 is the "mixed_bigrams" frame (its position in HIGH_LEVEL_FEATURES);
# keep only its first 1000 columns.
dfs[4] = dfs[4].iloc[:, :1000]



In [25]:
# Display (n_rows, n_columns) of the frame stored in slot 4 of dfs.
dfs[4].shape

(56, 1000)