In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import random
import plotly.express as px
import warnings
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")


# Seed Python's built-in `random` module so anything drawing from it is repeatable.
# NOTE(review): this call does not seed NumPy's random generator — confirm whether
# any NumPy-based randomness in this notebook also needs a fixed seed.
random.seed(42)




The following cell contains the code for creating the author vector CSV.

In [2]:
# Load the document feature table from disk.
doc_df = pd.read_csv("data/features/document_vectors.csv")

# Keep every column except the two identifier columns (ignored if absent).
doc_values = doc_df.drop(columns=["doc_id", "author_id"], errors="ignore")

# Distinct author ids found in the table (unordered).
author_ids = {*doc_df["author_id"]}

def make_author_vector(doc_vectors:np.ndarray) -> np.ndarray:
    """Average a stack of document vectors into a single author vector.

    Args:
        doc_vectors: 2-D array with one document vector per row.

    Returns:
        The mean over axis 0, i.e. one value per feature column.
    """
    return np.mean(doc_vectors, axis=0)

def make_author_vector_df(doc_df:pd.DataFrame, author_ids) -> pd.DataFrame:
    """Creates author vectors by averaging each author's documents into one.

    Args:
        doc_df: Document table containing ``doc_id`` and ``author_id`` columns;
            every other column is treated as a feature. The frame is not
            modified.
        author_ids: Iterable of author ids to build vectors for. Row order of
            the result follows the iteration order of this argument.

    Returns:
        A DataFrame indexed by author id with the same feature columns as
        ``doc_df`` (minus the two id columns). When ``author_ids`` is empty the
        result has zero rows but still carries the feature columns.

    Raises:
        KeyError: If ``doc_df`` lacks the ``author_id`` or ``doc_id`` column.
    """
    # `drop` already returns a new frame, so no explicit deep copy is needed.
    features = doc_df.drop(columns=["author_id", "doc_id"])
    doc_authors = doc_df["author_id"]

    author_ids_to_avs = {
        author_id: make_author_vector(features.loc[doc_authors == author_id].values)
        for author_id in author_ids
    }

    # Building with explicit `columns` keeps the feature labels even when there
    # are no authors; assigning `.columns` afterwards would raise a length
    # mismatch on the empty frame.
    return pd.DataFrame.from_dict(
        author_ids_to_avs, orient="index", columns=features.columns
    )

    


In [3]:
from scipy.stats import zscore
from components.processing import author_vectors, authors_df




def get_author_identifying_features(author_id:str, threshold=2):
    """Return the features on which one author deviates most from the others.

    Z-scores are computed over `author_vectors` and reduced to their absolute
    value, so the result says *which* features stand out but not in which
    direction; the sign would be needed to verbalize the deviation.

    Args:
        author_id: Value to look up in the `author_id` column of `authors_df`.
        threshold: Only features whose absolute z-score is strictly greater
            than this value are returned.

    Returns:
        The author's absolute z-scores, restricted to entries above `threshold`.

    NOTE(review): the index label of the first matching `authors_df` row is used
    as a *position* into the z-score table — this assumes `authors_df` has a
    default integer index aligned row-for-row with `author_vectors`; confirm.
    """
    abs_zscores = abs(zscore(author_vectors))

    # First row of authors_df whose author_id matches.
    is_author = authors_df["author_id"] == author_id
    row_position = authors_df.loc[is_author].index[0]

    author_row = abs_zscores.iloc[row_position]
    exceeds_threshold = author_row > threshold
    return author_row[exceeds_threshold]



# Example: display the stand-out features for author "en_113" at the default threshold.
get_author_identifying_features("en_113")


POS Unigram: NOUN                 2.474801
POS Bigram: ('VERB', 'PRON')      2.518766
POS Bigram: ('NOUN', 'ADP')       2.181848
POS Bigram: ('VERB', 'DET')       2.051311
POS Bigram: ('PROPN', 'PUNCT')    2.440638
Function word: me                 3.239219
Function word: yourself           4.178343
Function word: these              3.296460
Function word: doing              2.906452
Function word: further            2.884852
Function word: all                3.656222
Function word: too                2.034635
Function word: won                4.928485
Letter: f                         2.091393
Letter: G                         3.353575
Letter: K                         3.649688
Letter: R                         4.330837
Emoji: ❤️                         2.443363
Dependency label: dative          2.546225
Mixed Bigram: ('.', 'INTJ')       2.345270
Mixed Bigram: ('ADJ', 'to')       2.633592
Mixed Bigram: ('VERB', 'you')     2.684187
Morphology tag: Acc               2.701221
Name: 1, dt

In [3]:

# Print every entry directly under data/wordclouds/ (non-recursive glob).
for file in Path("data/wordclouds/").glob("*"):
    print(file)

data/wordclouds/en_76_wc.png
data/wordclouds/en_19_wc.png
data/wordclouds/en_52_wc.png
data/wordclouds/en_97_wc.png
data/wordclouds/en_112_wc.png
data/wordclouds/en_100_wc.png
data/wordclouds/en_99_wc.png
data/wordclouds/en_102_wc.png
data/wordclouds/en_110_wc.png
data/wordclouds/en_78_wc.png
data/wordclouds/en_2_wc.png
data/wordclouds/en_66_wc.png
data/wordclouds/en_74_wc.png
data/wordclouds/en_21_wc.png
data/wordclouds/en_114_wc.png
data/wordclouds/en_37_wc.png
data/wordclouds/en_54_wc.png
data/wordclouds/en_58_wc.png
data/wordclouds/en_13_wc.png
data/wordclouds/en_62_wc.png
data/wordclouds/en_56_wc.png
data/wordclouds/en_35_wc.png
data/wordclouds/en_60_wc.png
data/wordclouds/en_4_wc.png
data/wordclouds/en_72_wc.png
data/wordclouds/en_11_wc.png
data/wordclouds/en_104_wc.png
data/wordclouds/en_108_wc.png
data/wordclouds/en_113_wc.png
data/wordclouds/en_101_wc.png
data/wordclouds/en_96_wc.png
data/wordclouds/en_53_wc.png
data/wordclouds/en_22_wc.png
data/wordclouds/en_77_wc.png
data/wo

In [6]:
# Show the author_id stored in the first row (position 0) of authors_df.
authors_df.iloc[0].author_id

'en_35'