In [1]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
from collections import Counter
from typing import List, Set
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation and runtime
# warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")


In [2]:
# Load the document vectors, using the doc_id column as the row index.
doc_df = pd.read_csv("data/features/document_vectors.csv", index_col="doc_id")

# Feature matrix: every column except the identifier columns.
doc_values = doc_df.drop(columns=['doc_id', 'author_id'], errors="ignore")

# Distinct author ids that occur in the document table.
author_ids = set(doc_df.loc[:, "author_id"])


In [8]:

def make_author_vector(doc_vectors:np.ndarray) -> np.ndarray:
    """Collapse a stack of document vectors into one vector.

    Args:
        doc_vectors: Array whose first axis enumerates the vectors to average.

    Returns:
        The arithmetic mean taken along axis 0.
    """
    document_axis = 0
    return np.mean(doc_vectors, axis=document_axis)

def make_author_vector_df(doc_df:pd.DataFrame, author_ids:List[str]) -> pd.DataFrame:
    """Creates author vectors by averaging each author's documents into one.

    Args:
        doc_df: Document vectors. Must contain an 'author_id' column; every
            other column is treated as a feature and averaged.
        author_ids: Author ids to build vectors for. Any iterable is accepted
            (a set works too); output rows follow its iteration order and
            duplicate ids collapse into a single row.

    Returns:
        A DataFrame indexed by author id whose columns are the feature columns
        of ``doc_df``. An id with no matching documents yields a row of NaN
        (mean over zero rows). An empty ``author_ids`` yields an empty frame
        that still carries the feature columns.

    Raises:
        KeyError: if ``doc_df`` has no 'author_id' column.
    """
    # drop() returns a new frame, so doc_df is left untouched without the
    # extra deep copy.
    feature_df = doc_df.drop(columns="author_id")
    # Look the author column up once instead of on every loop iteration.
    doc_authors = doc_df["author_id"]

    author_ids_to_avs = {}
    for author_id in author_ids:
        doc_vectors = feature_df.loc[doc_authors == author_id].values
        author_ids_to_avs[author_id] = make_author_vector(doc_vectors)

    # orient="index" gives one row per author and names the columns up front.
    # Assigning the columns afterwards raised a length-mismatch ValueError
    # when author_ids was empty.
    return pd.DataFrame.from_dict(
        author_ids_to_avs, orient="index", columns=feature_df.columns
    )

    
# One averaged feature vector per author id.
av_df = make_author_vector_df(doc_df=doc_df, author_ids=author_ids)

# Feature-only view of the author vectors (an author_id column is removed if present).
av_values = av_df.drop(columns=['author_id'], errors="ignore")



In [10]:
# Fit a 6-cluster k-means model on the document feature matrix; the fixed
# random_state seeds the estimator.
doc_kmeans = KMeans(n_clusters=6, random_state=42).fit(doc_values.values)

# Cluster label assigned to each row of doc_values.
doc_kmeans.labels_

array([4, 0, 4, ..., 4, 4, 0], dtype=int32)

In [11]:

# Load the author vectors from disk.
df = pd.read_csv("data/features/author_vectors.csv")

# Show every column except author_id.
df.drop(columns=['author_id'], errors="ignore")

POS Unigram: ADJ        0.054614
POS Unigram: ADP        0.074745
POS Unigram: ADV        0.053280
POS Unigram: AUX        0.075881
POS Unigram: CCONJ      0.025977
                          ...   
Morphology tag: Prog    0.013577
Morphology tag: Perf    0.012061
Morphology tag: 1       0.042004
Morphology tag: 3       0.040013
Morphology tag: 2       0.015983
Length: 428, dtype: float64

In [None]:
# Author id assigned here; nothing in this cell uses it yet.
test_author = "en_110"

# NOTE(review): pd.read_json() is called without a path/buffer argument, so
# this cell raises TypeError as written. TODO: supply the intended JSON source.
# It also rebinds `df`, replacing the frame loaded from author_vectors.csv.
df = pd.read_json()