In [1]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
from collections import Counter
from typing import List, Set
warnings.filterwarnings("ignore")


In [7]:
# Load the per-document feature vectors; the doc_id column becomes the row index.
doc_df = pd.read_csv("data/document_vectors.csv", index_col="doc_id")

# Feature-only frame: every column except the identifier columns
# (errors="ignore" because doc_id is already the index, not a column).
doc_values = doc_df.drop(columns=['doc_id', 'author_id'], errors="ignore")

# Distinct author labels found in the documents.
author_ids = set(doc_df["author_id"])

In [8]:

def make_author_vector(doc_vectors:np.ndarray) -> np.ndarray:
    """Average a stack of document vectors into a single vector.

    The mean is taken along axis 0, so for a 2-D input with one document
    per row the result holds one mean value per column.
    """
    averaged = np.mean(doc_vectors, axis=0)
    return averaged

def make_author_vector_df(doc_df:pd.DataFrame, author_ids:List[str]) -> pd.DataFrame:
    """Creates author vectors by averaging each author's documents into one.

    :param doc_df: one row per document; must contain an 'author_id' column,
        every other column is treated as a feature to average.
    :param author_ids: the authors to build vectors for. Any iterable works;
        rows of the result follow its iteration order (a repeated id keeps
        the position of its first occurrence).
    :returns: a DataFrame indexed by author id with the feature columns of
        `doc_df` (everything except 'author_id'). If `author_ids` is empty the
        result is an empty frame that still carries those feature columns.
    """
    # drop() already returns a new frame, so doc_df is left untouched and no
    # extra deep copy is needed.
    feature_df = doc_df.drop(columns="author_id")
    author_column = doc_df["author_id"]

    author_ids_to_avs = {}
    for author_id in author_ids:
        doc_vectors = feature_df.loc[author_column == author_id].values
        author_ids_to_avs[author_id] = make_author_vector(doc_vectors)

    # Build the frame with explicit index/columns instead of transposing a
    # dict-built frame and then overwriting .columns: the latter raises a
    # length-mismatch ValueError when there are no authors at all.
    return pd.DataFrame(
        list(author_ids_to_avs.values()),
        index=list(author_ids_to_avs.keys()),
        columns=feature_df.columns,
    )

    
# Build the author vectors in a fixed (sorted) author order. `author_ids` is a
# set, and the iteration order of a set of strings can change between kernel
# restarts (string hash randomization), which would shuffle the rows of av_df
# from run to run. key=str keeps the sort well-defined for mixed label types.
av_df = make_author_vector_df(doc_df, sorted(author_ids, key=str))

# make_author_vector_df already leaves 'author_id' out of its columns, so this
# mask keeps every column; it only guards against the column reappearing.
av_values = av_df.loc[:, ~av_df.columns.isin(['author_id'])]



In [10]:
# Cluster the document vectors into 6 groups with a fixed random_state.
# fit() returns the estimator itself, so construction and fitting are chained.
doc_kmeans = KMeans(n_clusters=6, random_state=42).fit(doc_values.values)

# Cluster index assigned to each row of doc_values.
doc_kmeans.labels_

array([4, 0, 4, ..., 4, 4, 0], dtype=int32)

In [24]:

# Load the stored author vectors and show the author_id of the fourth row.
df = pd.read_csv('data/author_vectors.csv')

df["author_id"].iloc[3]


'en_112'

doc_id,author_id,POS Unigram: ADJ,POS Unigram: ADP,POS Unigram: ADV,POS Unigram: AUX,POS Unigram: CCONJ,POS Unigram: DET,POS Unigram: INTJ,POS Unigram: NOUN,POS Unigram: NUM,...,Morphology tag: Part,Morphology tag: Inf,Morphology tag: Ger,Morphology tag: Pres,Morphology tag: Past,Morphology tag: Prog,Morphology tag: Perf,Morphology tag: 1,Morphology tag: 3,Morphology tag: 2
doc_1,en_110,0.081566,0.076672,0.071778,0.061990,0.029364,0.066884,0.008157,0.168026,0.004894,...,0.017751,0.031953,0.002367,0.054438,0.041420,0.009467,0.008284,0.036686,0.029586,0.033136
doc_2,en_112,0.043478,0.095652,0.069565,0.078261,0.026087,0.104348,0.017391,0.130435,0.000000,...,0.028736,0.045977,0.000000,0.040230,0.028736,0.022989,0.005747,0.034483,0.040230,0.017241
doc_3,en_112,0.038356,0.098630,0.052055,0.084932,0.024658,0.060274,0.016438,0.145205,0.005479,...,0.042435,0.035055,0.001845,0.053506,0.033210,0.023985,0.016605,0.064576,0.018450,0.011070
doc_4,en_76,0.048780,0.065041,0.048780,0.040650,0.008130,0.048780,0.089431,0.121951,0.016260,...,0.024096,0.024096,0.000000,0.078313,0.018072,0.018072,0.006024,0.036145,0.066265,0.000000
doc_5,en_62,0.091875,0.088447,0.053480,0.053480,0.032911,0.069249,0.000000,0.174837,0.012341,...,0.041563,0.035783,0.001652,0.076246,0.024498,0.020369,0.019818,0.000551,0.055051,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_1043,en_67,0.060870,0.060870,0.078261,0.052174,0.008696,0.069565,0.034783,0.208696,0.008696,...,0.033113,0.072848,0.000000,0.066225,0.019868,0.026490,0.006623,0.033113,0.033113,0.026490
doc_1044,en_114,0.045000,0.065000,0.072500,0.102500,0.020000,0.045000,0.015000,0.105000,0.007500,...,0.020690,0.046552,0.000000,0.062069,0.027586,0.012069,0.008621,0.074138,0.029310,0.010345
doc_1045,en_57,0.057935,0.085642,0.042821,0.090680,0.022670,0.057935,0.025189,0.115869,0.007557,...,0.026270,0.036778,0.001751,0.057793,0.031524,0.015762,0.008757,0.056042,0.031524,0.019264
doc_1046,en_114,0.026915,0.080745,0.053830,0.089027,0.026915,0.041408,0.022774,0.111801,0.012422,...,0.030343,0.039578,0.000000,0.069921,0.022427,0.018470,0.009235,0.051451,0.034301,0.030343
