In [1]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
from collections import Counter
from typing import List, Set
warnings.filterwarnings("ignore")


In [2]:
# Load the per-document feature table; doc_id becomes the row index.
doc_df = pd.read_csv("data/document_vectors.csv", index_col="doc_id")

# Feature-only frame: drop identifier columns when they are present as columns.
doc_values = doc_df.drop(columns=['doc_id', 'author_id'], errors="ignore")

# Distinct author ids found in the table.
# NOTE(review): a set has no stable iteration order for strings across kernel
# restarts, so anything that iterates author_ids may see a different order.
author_ids = set(doc_df["author_id"].tolist())


In [8]:

def make_author_vector(doc_vectors:np.ndarray) -> np.ndarray:
    """Average a stack of document vectors along axis 0.

    Args:
        doc_vectors: Array whose first axis enumerates documents.

    Returns:
        The element-wise mean taken over the first axis.
    """
    averaged = np.mean(doc_vectors, axis=0)
    return averaged

def make_author_vector_df(doc_df:pd.DataFrame, author_ids:List[str]) -> pd.DataFrame:
    """Build one averaged feature vector per author.

    Each author's vector is the column-wise mean of that author's rows in
    ``doc_df``, taken over every column except ``author_id``.

    Args:
        doc_df: One row per document; must contain an ``author_id`` column.
            All remaining columns are treated as features. Not modified.
        author_ids: Authors to build vectors for. Any iterable of ids works;
            repeated ids are collapsed, keeping the first occurrence's position.

    Returns:
        DataFrame indexed by author id, with the columns of ``doc_df`` minus
        ``author_id``. An id with no matching documents gets a row of NaN.
        An empty ``author_ids`` yields an empty frame that still carries the
        feature columns (previously this case raised ``ValueError``).
    """
    # drop() already returns a new frame, so the input needs no deep copy.
    feature_df = doc_df.drop(columns="author_id")
    # Hoisted out of the loop: the raw feature matrix and the author column.
    feature_values = feature_df.to_numpy()
    author_col = doc_df["author_id"]

    # De-duplicate while preserving the caller's iteration order.
    unique_ids = list(dict.fromkeys(author_ids))

    author_rows = []
    for author_id in unique_ids:
        belongs_to_author = (author_col == author_id).to_numpy()
        author_rows.append(np.mean(feature_values[belongs_to_author], axis=0))

    # Supplying index/columns to the constructor (instead of assigning the
    # columns after a transpose) keeps the zero-author case from raising.
    return pd.DataFrame(author_rows, index=unique_ids, columns=feature_df.columns)

    
# One row per author, built by averaging that author's document vectors.
av_df = make_author_vector_df(doc_df=doc_df, author_ids=author_ids)

# Feature-only frame; an author_id column is removed if one is present.
av_values = av_df.drop(columns=['author_id'], errors="ignore")



In [10]:
# Fit a 6-cluster KMeans model on the document feature matrix.
# random_state is pinned so repeated runs use the same seed.
doc_kmeans = KMeans(n_clusters=6, random_state=42).fit(doc_values.to_numpy())

# Cluster label assigned to each fitted row.
doc_kmeans.labels_

array([4, 0, 4, ..., 4, 4, 0], dtype=int32)

In [16]:
# Find the document(s) whose "POS Unigram: ADJ" value displays as 0.081566.
# The literal has only six decimals (likely a rounded display value), so an
# exact == comparison against the stored float would almost never match.
# Match within half a unit of the last shown digit instead.
doc_df.loc[np.isclose(doc_df["POS Unigram: ADJ"], 0.081566, rtol=0, atol=5e-7)]


Unnamed: 0_level_0,author_id,POS Unigram: ADJ,POS Unigram: ADP,POS Unigram: ADV,POS Unigram: AUX,POS Unigram: CCONJ,POS Unigram: DET,POS Unigram: INTJ,POS Unigram: NOUN,POS Unigram: NUM,...,Morphology tag: Part,Morphology tag: Inf,Morphology tag: Ger,Morphology tag: Pres,Morphology tag: Past,Morphology tag: Prog,Morphology tag: Perf,Morphology tag: 1,Morphology tag: 3,Morphology tag: 2
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
doc_1,en_110,0.081566,0.076672,0.071778,0.06199,0.029364,0.066884,0.008157,0.168026,0.004894,...,0.017751,0.031953,0.002367,0.054438,0.04142,0.009467,0.008284,0.036686,0.029586,0.033136
doc_2,en_112,0.043478,0.095652,0.069565,0.078261,0.026087,0.104348,0.017391,0.130435,0.0,...,0.028736,0.045977,0.0,0.04023,0.028736,0.022989,0.005747,0.034483,0.04023,0.017241
doc_3,en_112,0.038356,0.09863,0.052055,0.084932,0.024658,0.060274,0.016438,0.145205,0.005479,...,0.042435,0.035055,0.001845,0.053506,0.03321,0.023985,0.016605,0.064576,0.01845,0.01107
doc_4,en_76,0.04878,0.065041,0.04878,0.04065,0.00813,0.04878,0.089431,0.121951,0.01626,...,0.024096,0.024096,0.0,0.078313,0.018072,0.018072,0.006024,0.036145,0.066265,0.0
doc_5,en_62,0.091875,0.088447,0.05348,0.05348,0.032911,0.069249,0.0,0.174837,0.012341,...,0.041563,0.035783,0.001652,0.076246,0.024498,0.020369,0.019818,0.000551,0.055051,0.0
