In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import random
import plotly.express as px
import warnings
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")


# Seed Python's built-in `random` module so any draws made through it are repeatable.
# NOTE(review): this does not seed NumPy's generator; seed that separately if
# NumPy randomness is used anywhere in the notebook.
random.seed(42)

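The following cell contains the code for creating the author-vector CSV: it loads the per-document feature vectors and defines helpers that average each author's documents into a single author vector.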

In [2]:
# Per-document feature table; besides the feature columns it carries the
# `doc_id` and `author_id` identifier columns used below.
doc_df = pd.read_csv("data/features/document_vectors.csv")

# Distinct author ids present in the table. A set has no guaranteed iteration
# order, so anything built by looping over it inherits an arbitrary order.
author_ids = set(doc_df["author_id"])

# Feature-only view: every column except the two identifier columns.
doc_values = doc_df.loc[:, ~doc_df.columns.isin(["author_id", "doc_id"])]

def make_author_vector(doc_vectors:np.ndarray) -> np.ndarray:
    """Collapse a stack of document vectors into a single author vector.

    The result is the arithmetic mean taken along axis 0 of ``doc_vectors``,
    i.e. one averaged value per position of the stacked vectors.
    """
    author_vector = np.mean(doc_vectors, axis=0)
    return author_vector

def make_author_vector_df(doc_df:pd.DataFrame, author_ids) -> pd.DataFrame:
    """Create author vectors by averaging each author's documents into one.

    Parameters
    ----------
    doc_df : pd.DataFrame
        One row per document. Must contain the ``author_id`` and ``doc_id``
        columns; every other column is treated as a feature and averaged.
        The frame is not modified.
    author_ids : iterable
        Author ids to build a vector for. Rows of the result follow the
        iteration order of this argument (arbitrary when a ``set`` is passed);
        repeated ids produce a single row.

    Returns
    -------
    pd.DataFrame
        Indexed by author id, with the feature columns of ``doc_df``. An
        author id that matches no document yields a row of NaN (the mean of an
        empty selection). An empty ``author_ids`` yields an empty frame that
        still carries the feature columns.

    Raises
    ------
    KeyError
        If ``doc_df`` lacks the ``author_id`` or ``doc_id`` column.
    """
    # drop() already returns a new frame, so no defensive deep copy is needed.
    feature_df = doc_df.drop(columns=["author_id", "doc_id"])
    # Loop-invariant column lookup, hoisted out of the per-author loop.
    doc_authors = doc_df["author_id"]

    author_ids_to_avs = {}
    for author_id in author_ids:
        doc_vectors = feature_df.loc[doc_authors == author_id].values
        author_ids_to_avs[author_id] = make_author_vector(doc_vectors)

    # Build the frame row-wise with explicit labels. Unlike transposing a
    # dict-built frame and then assigning the columns, this also works when no
    # author ids were supplied.
    return pd.DataFrame(
        list(author_ids_to_avs.values()),
        index=list(author_ids_to_avs.keys()),
        columns=feature_df.columns,
    )

    


In [5]:
from scipy.stats import zscore
from components.processing import author_vectors, authors_df


def get_threshold_zscores_idxs(zscores, threshold:float):
    """Return the positions of the z-scores whose magnitude exceeds ``threshold``.

    The comparison is strict (``abs(value) > threshold``), so a value sitting
    exactly on the threshold is not selected. Positions are the 0-based
    iteration order of ``zscores`` and are returned in ascending order.
    """
    return [
        position
        for position, value in enumerate(zscores)
        if abs(value) > threshold
    ]


def get_author_identifying_features(author_id:str, threshold=2.0):
    """
    Select the features on which one author deviates most from the other authors.

    Z-scores are computed over all of ``author_vectors``; the row belonging to
    ``author_id`` is then reduced to the entries whose absolute z-score is
    greater than ``threshold``. Those are the features that separate this
    author from the average author.

    Raises IndexError when ``author_id`` matches no row of ``authors_df``.
    """
    # Standardise every feature across authors (scipy's zscore defaults to axis=0).
    all_zscores = zscore(author_vectors)

    # Index label of the first matching row in authors_df.
    # NOTE(review): this label is used as a *position* in the z-score table
    # below, which presumably relies on authors_df having a default 0..n-1
    # index aligned row-for-row with author_vectors; confirm.
    is_author = authors_df["author_id"] == author_id
    author_row = authors_df.loc[is_author].index[0]
    author_zscores = all_zscores.iloc[author_row]

    outlier_positions = get_threshold_zscores_idxs(author_zscores, threshold)
    return author_zscores.iloc[outlier_positions]



# Example: features on which author "en_114" has an absolute z-score above the
# default threshold of 2.0.
get_author_identifying_features("en_114")


POS Unigram: NUM                    3.456512
POS Unigram: X                      5.234894
POS Bigram: ('VERB', 'NOUN')       -2.361870
Function word: them                 3.342639
Function word: am                   6.432398
Function word: do                   2.188410
Function word: an                   2.144364
Function word: for                  4.925914
Function word: when                 2.083991
Punctuation: "                      2.221971
Letter: d                           3.065809
Letter: o                          -2.144378
Letter: A                           4.125225
Letter: K                           2.338211
Document statistic: word_len_std    2.403748
Dependency label: nummod            2.724182
Mixed Bigram: ('NOUN', 'of')       -2.066344
Mixed Bigram: ('.', 'VERB')         2.258779
Mixed Bigram: ('an', 'NOUN')        2.362862
Name: 49, dtype: float64

1