In [14]:
import pandas as pd
import numpy as np
from typing import List, Dict
import matplotlib.pyplot as plt
import random
import plotly.express as px
import warnings
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")


# Seed the standard-library `random` module so its draws repeat on every run.
# NOTE(review): this call does not seed NumPy's generator — confirm whether
# any later cell relies on `np.random`.
random.seed(42)

The following cell contains the code for creating the author-vector CSV.

In [2]:
# Document-level feature table read from disk.
doc_df = pd.read_csv("data/features/document_vectors.csv")

# Every column except the two identifier columns.
doc_values = doc_df.loc[
    :,
    ~doc_df.columns.isin(['doc_id', 'author_id']),
]

# Distinct values found in the `author_id` column.
author_ids = set(doc_df["author_id"])

def make_author_vector(doc_vectors: np.ndarray) -> np.ndarray:
    """
    Collapses a stack of document vectors into a single author vector.

    Args:
        doc_vectors: array of document vectors stacked along axis 0.

    Returns:
        The mean of `doc_vectors` taken over axis 0.
    """
    author_vector = np.mean(doc_vectors, axis=0)
    return author_vector

def make_author_vector_df(doc_df: pd.DataFrame, author_ids) -> pd.DataFrame:
    """
    Creates author vectors by averaging each author's documents into one.

    Args:
        doc_df: document table containing `doc_id` and `author_id` columns;
            every other column is averaged as a feature.
        author_ids: iterable of author ids to build vectors for.

    Returns:
        A DataFrame indexed by author id with the same feature columns as
        `doc_df` (minus `doc_id` / `author_id`). Rows follow the iteration
        order of `author_ids`, except that a set/frozenset is sorted first so
        the row order is reproducible. An empty `author_ids` yields an empty
        frame that still carries the feature columns.
    """
    # `drop` already returns a new frame, so `doc_df` is left untouched
    # without needing an explicit deep copy.
    feature_df = doc_df.drop(columns=["author_id", "doc_id"])

    # Iteration order of a set of strings changes between interpreter
    # sessions (hash randomization), which would shuffle the output rows
    # from run to run. Sorting by `str` is deterministic and cannot fail on
    # mixed-type ids.
    if isinstance(author_ids, (set, frozenset)):
        author_ids = sorted(author_ids, key=str)

    author_ids_to_avs = {
        author_id: make_author_vector(
            feature_df.loc[doc_df["author_id"] == author_id].values
        )
        for author_id in author_ids
    }

    # With no authors the transposed frame has zero columns, so assigning
    # the feature names below would raise a length-mismatch error.
    if not author_ids_to_avs:
        return pd.DataFrame(columns=feature_df.columns)

    # The dict builds one column per author; transpose to one row per author.
    av_df = pd.DataFrame(author_ids_to_avs).T
    av_df.columns = feature_df.columns
    return av_df

    


In [50]:
from scipy.stats import zscore
from components.processing import author_vectors, authors_df, docs_df


def get_threshold_zscores_idxs(zscores, threshold: float):
    """
    Finds the positions of the z-scores whose magnitude reaches a threshold.

    Args:
        zscores: iterable of numeric z-scores.
        threshold: smallest absolute value (inclusive) that gets selected.

    Returns:
        List of 0-based positions, in iteration order, where
        `abs(value) >= threshold`.
    """
    return [pos for pos, score in enumerate(zscores) if abs(score) >= threshold]


def get_identifying_features(author_id: str, threshold=2.0):
    """
    Picks out the features on which one author deviates most from the rest.

    Z-scores are computed over `author_vectors`, the row belonging to
    `author_id` is looked up, and only the entries whose absolute z-score is
    at least `threshold` are kept.

    Args:
        author_id: id matched against the `author_id` column of `authors_df`.
        threshold: minimum absolute z-score for a feature to be kept.

    Returns:
        The selected entries of the author's z-score row.
    """
    all_zscores = zscore(author_vectors)

    # NOTE(review): the index label of the first matching `authors_df` row is
    # used as a *position* in the z-score table — presumably `authors_df` has
    # a default integer index aligned with `author_vectors`; confirm.
    is_author = authors_df["author_id"] == author_id
    row = authors_df.loc[is_author].index[0]

    this_author = all_zscores.iloc[row]
    keep = get_threshold_zscores_idxs(this_author, threshold)
    return this_author.iloc[keep]


def get_author_entries(author_id: str) -> pd.DataFrame:
    """Selects the rows of `docs_df` whose `author_id` column equals `author_id`."""
    belongs_to_author = docs_df["author_id"] == author_id
    return docs_df.loc[belongs_to_author]

def features_to_show(author_id: str, max_features: int = 12) -> List[str]:
    """
    Given an author id, returns at most `max_features` of this author's
    identifying features.

    Features come from `get_identifying_features` and are kept in the order
    it returns them; they are not re-ranked here.

    Args:
        author_id: id of the author to look up.
        max_features: upper bound on the number of feature names returned.
            The default of 12 matches the cap the function always applied:
            the earlier `len(features) > 10` guard never changed the result
            of the `[:12]` slice.

    Returns:
        List of feature names, no longer than `max_features`.

    Raises:
        ValueError: if `max_features` is negative.
    """
    # A negative bound would silently drop features from the end of the list.
    if max_features < 0:
        raise ValueError("max_features must be non-negative")

    features = get_identifying_features(author_id).index.to_list()
    return features[:max_features]

def style_pcp_label(label: str) -> str:
    """
    Inserts a `<br>` after the colon of a "<type>:<feature>" label so the
    feature part is pushed onto its own line.

    Args:
        label: feature label to style.

    Returns:
        The label with `<br>` placed directly after its colon. Labels that
        do not contain exactly one colon are returned unchanged.
    """
    # Only the single-colon shape can be split unambiguously. A label with no
    # colon used to raise ValueError on unpacking; it is now passed through
    # like labels with several colons.
    if label.count(":") != 1:
        return label

    feat_type, feat = label.split(":")
    return f"{feat_type}:<br>{feat}"

        
            
    


    

    
# Author to visualise, the features selected for them, and their rows of `docs_df`.
author = "en_112"
author_features = features_to_show(author)
author_entries = get_author_entries(author)

# Parallel-coordinates plot: one axis per selected feature, with each axis
# title restyled by `style_pcp_label`.
fig = px.parallel_coordinates(
    author_entries,
    dimensions=author_features,
    labels=dict(zip(author_features, map(style_pcp_label, author_features))),
)
fig.update_layout(font={"size": 10})  # Set the font size here

fig.show()


['POS Unigram:<br> NOUN',
 "POS Bigram:<br> ('VERB', 'PRON')",
 "POS Bigram:<br> ('NOUN', 'ADP')",
 "POS Bigram:<br> ('VERB', 'DET')",
 "POS Bigram:<br> ('PROPN', 'PUNCT')",
 'Function word:<br> me',
 'Function word:<br> yourself',
 'Function word:<br> these',
 'Function word:<br> doing',
 'Function word:<br> further']