In [1]:
from wordcloud import WordCloud, ImageColorGenerator
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import random
import plotly.express as px
import warnings
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
# Suppress every warning so cell outputs stay compact.
# NOTE(review): this also hides numerical warnings (e.g. the mean of an empty
# slice); consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


# Seed both global generators. `random.seed` alone leaves NumPy's global RNG
# unseeded, and NumPy-based libraries (e.g. scikit-learn estimators created
# with random_state=None) draw from that one, not from the stdlib `random`.
random.seed(42)
np.random.seed(42)




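The following cell contains the code for creating the author-vector CSV: it loads the document vectors and defines helpers that average each author's document vectors into a single author vector.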

In [2]:
# Load the document-vector table from disk.
doc_df = pd.read_csv("data/features/document_vectors.csv")

# Distinct author ids found in the table.
# NOTE(review): a set has no stable iteration order for strings across
# interpreter runs, so anything built by iterating it may come out in a
# different row order on each run.
author_ids = set(doc_df["author_id"])

# Every column except the two identifier columns.
doc_values = doc_df.loc[
    :, [name not in ('doc_id', 'author_id') for name in doc_df.columns]
]

def make_author_vector(doc_vectors:np.ndarray) -> np.ndarray:
    """Collapse a stack of document vectors into a single author vector.

    The result is the arithmetic mean taken along axis 0 of ``doc_vectors``,
    i.e. one averaged value per column.
    """
    author_vector = np.mean(doc_vectors, axis=0)
    return author_vector

def make_author_vector_df(doc_df:pd.DataFrame, author_ids) -> pd.DataFrame:
    """Create author vectors by averaging each author's documents into one.

    Parameters
    ----------
    doc_df : pd.DataFrame
        Document table with ``doc_id`` and ``author_id`` columns; every other
        column is treated as a feature to be averaged. The frame is not
        modified.
    author_ids : iterable
        Author ids to build vectors for. Rows of the result follow the
        iteration order of this argument; a repeated id is kept once.

    Returns
    -------
    pd.DataFrame
        One row per author id (the id is the index label) and one column per
        feature column of ``doc_df``. An id with no matching documents yields
        a row of NaN. An empty ``author_ids`` yields an empty frame that still
        carries the feature columns.
    """
    # drop() already returns a new frame, so no explicit deep copy is needed
    # to keep the caller's doc_df untouched.
    feature_df = doc_df.drop(columns=["author_id", "doc_id"])
    doc_authors = doc_df["author_id"]  # hoisted: looked up once, not per author

    author_ids_to_avs = {
        author_id: make_author_vector(feature_df.loc[doc_authors == author_id].values)
        for author_id in author_ids
    }

    # orient="index" turns each author into a row directly, and passing the
    # columns explicitly keeps the header valid even when no ids were given
    # (assigning columns afterwards fails on an empty frame).
    return pd.DataFrame.from_dict(
        author_ids_to_avs, orient="index", columns=feature_df.columns
    )

    


In [3]:
from scipy.stats import zscore
from components.processing import author_vectors, authors_df




def get_author_identifying_features(author_id:str, threshold=2, use_abs=False):
    """Return the features on which one author stands out from the others.

    Every column of ``author_vectors`` is z-scored across its rows, then the
    row belonging to ``author_id`` is filtered by ``threshold``.

    Parameters
    ----------
    author_id : str
        Value to look up in ``authors_df["author_id"]``; the first match is
        used.
    threshold : float, default 2
        Cut-off a feature's z-score must exceed to be returned.
    use_abs : bool, default False
        When False, only z-scores above ``threshold`` are kept (unusually
        high features). When True, the comparison uses the absolute z-score,
        so unusually low features are returned as well.

    Returns
    -------
    pd.Series
        Signed z-scores of the selected features, indexed by the column
        labels of ``author_vectors``.

    Raises
    ------
    IndexError
        If ``author_id`` does not occur in ``authors_df``.
    """
    # Rebuild a labelled frame explicitly: whether zscore hands back a
    # DataFrame or a bare ndarray for DataFrame input depends on the SciPy
    # version, and the row lookup below needs pandas indexing.
    zscores = pd.DataFrame(
        np.asarray(zscore(author_vectors)),
        index=author_vectors.index,
        columns=author_vectors.columns,
    )

    # Use the row *position* of the author, not its index label, because the
    # lookup into zscores is positional (.iloc).
    # NOTE(review): assumes author_vectors rows are in the same order as
    # authors_df rows - confirm in components.processing.
    positions = np.flatnonzero((authors_df["author_id"] == author_id).to_numpy())
    author_zscores = zscores.iloc[positions[0]]

    compared = author_zscores.abs() if use_abs else author_zscores
    return author_zscores.loc[compared > threshold]



# Example: features whose z-score exceeds the default threshold for "en_113".
get_author_identifying_features(author_id="en_113")


POS Bigram: ('VERB', 'PRON')      2.518766
POS Bigram: ('PROPN', 'PUNCT')    2.440638
Function word: me                 3.239219
Function word: yourself           4.178343
Function word: these              3.296460
Function word: doing              2.906452
Function word: further            2.884852
Function word: all                3.656222
Function word: too                2.034635
Function word: won                4.928485
Letter: G                         3.353575
Letter: K                         3.649688
Letter: R                         4.330837
Emoji: ❤️                         2.443363
Dependency label: dative          2.546225
Mixed Bigram: ('.', 'INTJ')       2.345270
Mixed Bigram: ('ADJ', 'to')       2.633592
Mixed Bigram: ('VERB', 'you')     2.684187
Morphology tag: Acc               2.701221
Name: 1, dtype: float64

['en_35',
 'en_113',
 'en_20',
 'en_112',
 'en_76',
 'en_66',
 'en_63',
 'en_105',
 'en_56',
 'en_55',
 'en_13',
 'en_37',
 'en_59',
 'en_109',
 'en_107',
 'en_60',
 'en_21',
 'en_67',
 'en_58',
 'en_78',
 'en_51',
 'en_73',
 'en_96',
 'en_61',
 'en_12',
 'en_54',
 'en_99',
 'en_104',
 'en_101',
 'en_75',
 'en_97',
 'en_4',
 'en_62',
 'en_110',
 'en_111',
 'en_36',
 'en_53',
 'en_11',
 'en_5',
 'en_74',
 'en_98',
 'en_18',
 'en_2',
 'en_102',
 'en_72',
 'en_52',
 'en_108',
 'en_57',
 'en_34',
 'en_114',
 'en_19',
 'en_103',
 'en_22',
 'en_100',
 'en_3',
 'en_77']

In [69]:
# Index label of the first authors_df row whose author_id is "en_110".
authors_df.index[(authors_df["author_id"] == "en_110").to_numpy()][0]

33