# [CultureHackLabs](https://github.com/culturehacklabs)

> Mayner WGP, Marshall W, Albantakis L, Findlay G, Marchman R, Tononi G. [PyPhi: A toolbox for integrated information theory](https://doi.org/10.1371/journal.pcbi.1006343). PLOS Computational Biology 14(7): e1006343. 2018.

## Load requirements

### Libraries

In [None]:
!pip install -q graphviz kaleido langdetect matplotlib networkx nltk numpy pandas plotly pydot pyphi scikit-learn scipy # pygraphviz
!wget -q https://gist.githubusercontent.com/nelsonaloysio/2eb360eeffacdcb3f8ad305ab85dc398/raw/eeab6d6050b53142810aacde4f4bb1296c620f7b/iso639-1.json
!wget -q https://gist.githubusercontent.com/nelsonaloysio/302dbbf3963fababde6e9f97669587df/raw/0f0523749a30ded1422a69103547bae7dddc8933/stopwords.py

In [None]:
import json
import logging as log
import math
import os
import re
import string
from abc import ABCMeta, abstractmethod
from functools import reduce
from inspect import signature
from itertools import combinations
from math import atan2
from math import log as log10
from math import pi
from re import findall
from shutil import move
from subprocess import call
from time import time
from typing import Callable, Union
from urllib.request import urlopen
os.environ["PYPHI_WELCOME_OFF"] = "yes"

import graphviz as gv
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
import plotly.offline as py
import pyphi
from langdetect import detect as lang_detect
from langdetect.detector import LangDetectException
from nltk.stem.snowball import SnowballStemmer
from plotly.graph_objs import Figure
from plotly.subplots import make_subplots
from sklearn.base import TransformerMixin
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import (HashingVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer)
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

from stopwords import CUSTOM_STOPWORDS

# pygraphviz is treated as optional: a missing module is logged as a warning
# instead of aborting the import sequence.
try:
    import pygraphviz as pgv
except ModuleNotFoundError as e:
    log.warning(e)

# Download the NLTK "punkt" and "stopwords" resources.
nltk.download("punkt")
nltk.download("stopwords")
# Make "none" the default Plotly template.
pio.templates.default = "none"

In [None]:
# Translation table (code point -> replacement) that maps accented
# lowercase letters to their unaccented ASCII counterpart.
ACCENT_REPLACEMENTS = {
    ord("á"): "a", ord("ã"): "a", ord("â"): "a",
    ord("à"): "a", ord("è"): "e", ord("ê"): "e",
    ord("é"): "e", ord("í"): "i", ord("ì"): "i",
    ord("ñ"): "n", ord("ò"): "o", ord("ó"): "o",
    ord("ô"): "o", ord("õ"): "o", ord("ù"): "u",
    ord("ú"): "u", ord("ü"): "u", ord("ç"): "c"}

# Token prefixes that cause a token to be discarded by the tokenizer.
IGNORE_STARTS_WITH = ["http", "www", "kkk"]
# Extra (non-ASCII and quoting) characters to strip besides string.punctuation.
INVALID_CHARACTERS = "\\\"'’…|–—“”‘„•¿¡"
# Punctuation characters that are kept (mention and hashtag markers).
VALID_CHARACTERS = "@#"

# Deletion table for str.translate: every ASCII punctuation character and
# every INVALID_CHARACTERS entry, except those listed in VALID_CHARACTERS.
CHARACTER_REPLACEMENTS = str.maketrans("", "", "".join(
    set(string.punctuation + INVALID_CHARACTERS) - set(VALID_CHARACTERS)))

def LABEL_MAP_FUNC(x):
    """ Returns a label with each part capitalized (`x` is a string or a tuple of strings). """
    parts = x if isinstance(x, tuple) else [x]
    return " ".join(part.capitalize() for part in parts)


def REINDEX_MAP_FUNC(x, y):
    """
    Returns a copy of mapping `x` in which every selector (or every
    selector of a nested dictionary) is replaced by the list of unique
    values it selects from `y`.
    """
    def unique_values(selector):
        return y[selector].drop_duplicates().tolist()

    reindexed = {}
    for key, value in x.items():
        if isinstance(value, dict):
            reindexed[key] = {
                inner_key: unique_values(selector)
                for inner_key, selector in value.items()
            }
        else:
            reindexed[key] = unique_values(value)
    return reindexed


def RENAME_TITLE_MAP_FUNC(x, y):
    """
    Returns a copy of mapping `x` in which the "title" entry found under
    "layout_opts" is prefixed with `y`; the title is lowercased unless
    `y` is None. Everything else is copied as is.
    """
    renamed = {}
    for key, value in x.items():
        if type(value) is not dict:
            renamed[key] = value
            continue
        entries = {}
        for inner_key, inner_value in value.items():
            if inner_key == "title" and key == "layout_opts":
                suffix = inner_value.lower() if y is not None else inner_value
                entries[inner_key] = "%s %s" % (y, suffix)
            else:
                entries[inner_key] = inner_value
        renamed[key] = entries
    return renamed

# Defaults shared by the Plotly helpers.
AUTORANGE = True                            # axis autorange flag
CONNECT_GAPS = False                        # scatter `connectgaps` value
FONT_COLOR = "grey"                         # legend font color
FONT_FAMILY = "Raleway, Arial, sans-serif"  # legend and trace text font family
FONT_SIZE = 16                              # legend font size
LEGEND_Y = 0.5                              # legend vertical position
LEGEND_YREF = "paper"  # NOTE(review): not referenced in the visible code -- confirm usage
MARKER_SIZE = 6                             # fallback marker size
TEXT_POSITION = "top center"                # scatter `textposition` value

# Language lookup table loaded from the downloaded JSON file (presumably
# ISO 639-1 code -> language name; verify against the gist contents).
# The encoding is explicit so the read does not depend on the platform locale.
with open("iso639-1.json", "r", encoding="utf-8") as j:
    ISO639 = json.load(j)

### Cluster pipeline

In [None]:
class Transformer(metaclass=ABCMeta):
    """
    Abstract base transformer class.

    Subclasses must provide `__init__` and `transform`;
    `fit` has nothing to learn and returns the instance itself.
    """
    @abstractmethod
    def __init__(self, **kwargs):
        """ Abstract initializer, to be overridden. """

    @abstractmethod
    def transform(self, X):
        """ Abstract transformation of `X`, to be overridden. """

    def fit(self, X, y=None):
        """ Returns the instance unchanged, nothing to fit. """
        return self


class PandasTransformer(Transformer):
    """
    Transform a file path, data frame or series into a Pandas Series.

    Parameters:
        * applymap: function applied to every value (identity by default)
        * column: column to extract from the loaded data frame
        * drop_duplicates: drop repeated values after mapping
        * dropna: drop null values after mapping
        * json_lines: read ".json" paths as newline-delimited JSON
        * low_memory: forwarded to `pandas.read_table`
        * sep: field delimiter; guessed from the file header when unset
        * sort: columns to sort by (descending) before extraction

    After `transform`, `index_` holds the index of the returned series
    and `skiprows_` the index labels that were dropped from the input.
    """
    def __init__(self,
            applymap=lambda x:x,
            column=None,
            drop_duplicates=False,
            dropna=False,
            json_lines=False,
            low_memory=False,
            sep=None,
            sort=None,
            **kwargs):

        self.applymap = applymap
        self.column = column
        self.drop_duplicates = drop_duplicates
        self.dropna = dropna
        self.json_lines = json_lines
        self.low_memory = low_memory
        self.sep = sep
        # A fresh list per instance instead of a shared mutable default.
        self.sort = sort if sort is not None else []

    def transform(self, path_or_df: Union[str, pd.Series, pd.DataFrame]):
        """
        Returns Pandas Series from data frame or file path.

        Raises TypeError when the loaded data is still a data frame,
        i.e. no single column could be selected.
        """
        usecols = [self.column] + self.sort if self.column else None

        # Load data from all or specified columns
        if isinstance(path_or_df, pd.Series):
            series = path_or_df
        elif isinstance(path_or_df, pd.DataFrame) and path_or_df.empty:
            series = pd.Series()
        else:
            df = self.read_json(path_or_df)
            df = self.read_table(
                df,
                # Without a column, every line is read as a single field
                sep=self.sep if self.column else "\n",
                usecols=usecols,
                low_memory=self.low_memory)
            df = self.sort_values(df, columns=self.sort)
            series = self.filter_column(df, column=self.column)

        # Check if type is correct
        if isinstance(series, pd.DataFrame):
            raise TypeError(f"Expected a Pandas Series or 1-dimensional DataFrame (column='{self.column}').")

        # Work on a copy so the caller's object is never modified
        series = series.copy()
        # Remember the full index before any row is dropped
        original_index = series.index

        # Apply map and drop duplicates/nulls
        if self.applymap:
            series = series.apply(self.applymap)
        if self.drop_duplicates:
            series = series.drop_duplicates()
        if self.dropna:
            series = series.dropna()

        # Store difference between indices (rows removed by the filters)
        self.index_ = series.index
        self.skiprows_ = original_index.difference(self.index_)
        return series.copy()

    def read_table(self, path_or_df: Union[str, list, pd.Series, pd.DataFrame],
                   sep=None, skiprows=None, usecols=None, low_memory=False):
        """
        Returns or loads Pandas data frame from file path.

        Strings are read with `pandas.read_table`, lists are concatenated
        and anything else is returned as is, restricted to `usecols`
        when given and the object is not empty.
        """
        if isinstance(path_or_df, str):
            return pd.read_table(
                path_or_df,
                usecols=usecols if usecols else None,
                sep=sep if sep else self._get_file_delimiter(path_or_df),
                skiprows=skiprows,
                low_memory=low_memory)

        if isinstance(path_or_df, list):
            return self.concat(path_or_df, usecols)

        if usecols and any(x for x in path_or_df.shape):
            return path_or_df[usecols]

        return path_or_df

    def read_json(self, X):
        """
        Return Pandas data frame from JSON file.

        Only strings ending in ".json" are loaded; any other input is
        returned untouched. `json_lines` selects newline-delimited JSON.
        """
        if isinstance(X, str) and X.endswith(".json"):
            return pd.read_json(X, lines=bool(getattr(self, "json_lines", False)))
        return X

    def filter_column(self, df: Union[pd.Series, pd.DataFrame], column=None):
        """
        Returns Pandas series from loaded data frame.

        Integer columns are resolved by position, others by label. The
        input is returned unchanged when no column is set or it is empty.
        """
        if not column or not any(x for x in df.shape):
            return df
        return df[df.columns[column] if isinstance(column, int) else column]

    def sort_values(self, df: Union[pd.Series, pd.DataFrame], columns=None):
        """
        Returns data frame sorted in descending order by `columns`.

        Integer columns are resolved by position, others by label. The
        input is returned unchanged when no columns are set or it is empty.
        """
        if not columns or not any(x for x in df.shape):
            return df
        by = [
            df.columns[column] if isinstance(column, int) else column
            for column in columns
        ]
        return df.sort_values(by, ascending=False)

    def concat(self, dfs: list, columns=None):
        """ Return a single data frame with concatenated rows. """
        if not isinstance(dfs, list):
            return dfs

        if columns:
            dfs = [df.filter(columns) for df in dfs]

        df = pd.concat(dfs)
        # Renumber rows so index labels are unique across the inputs
        df.index = range(df.shape[0])
        return df

    @staticmethod
    def _get_file_delimiter(path):
        """
        Returns character delimiter from file.

        The first candidate found in the header line wins; a line break
        is returned when none of them is present.
        """
        delimiters = ["|", "\t", ";", ","]
        with open(path, "rt") as f:
            header = f.readline()
        return next((d for d in delimiters if d in header), "\n")


class LSA(Pipeline):
    """
    Latent semantic analysis pipeline: truncated singular value
    decomposition (SVD) for dimensionality reduction, followed
    by normalization.
    """
    def __init__(self, copy=False, n_components=100, random_state=None, **kwargs):
        self.copy = copy
        self.n_components = n_components
        self.random_state = random_state

        svd = TruncatedSVD(self.n_components, random_state=self.random_state)
        normalizer = Normalizer(copy=self.copy)

        super().__init__(steps=(("svd", svd), ("normalizer", normalizer)))

    def evar(self):
        """
        Returns the total explained variance ratio of the fitted SVD
        as a truncated percentage string; raises NotFittedError if
        the SVD step has not been fitted.
        """
        svd = self.named_steps.svd

        if "explained_variance_ratio_" not in vars(svd):
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        return f"{int(svd.explained_variance_ratio_.sum()*100)}%"


class TextVectorizer(Pipeline):
    """
    Hashing, vectorizing and tf-idf pipelines.

    With `use_hashing` the steps are a HashingVectorizer, optionally
    followed by a TfidfTransformer (`use_idf`); otherwise a single
    TfidfVectorizer is used.

    Parameters:
        * alternate_sign: forwarded to the hashing vectorizer
        * analyzer, max_df, min_df: forwarded to the tf-idf vectorizer
        * n_features: number of hashed features, or maximum number
          of features kept by the tf-idf vectorizer
        * norm: normalization applied to the output rows
        * stop_words: words to ignore
        * use_hashing: hash features instead of building a vocabulary
        * use_idf: apply inverse document frequency weighting
    """
    def __init__(self,
                 alternate_sign=False,
                 analyzer="word",
                 max_df=1.0,
                 min_df=1,
                 n_features=None,
                 norm="l2",
                 stop_words=None,
                 use_hashing=False,
                 use_idf=True,
                 **kwargs):

        self.alternate_sign = alternate_sign
        self.analyzer = analyzer
        self.max_df = max_df
        self.min_df = min_df
        self.n_features = n_features
        self.norm = norm
        self.stop_words = stop_words
        self.use_hashing = use_hashing
        self.use_idf = use_idf

        steps = []
        if use_hashing:
            hasher_opts = dict(
                stop_words=self.stop_words,
                alternate_sign=self.alternate_sign,
                norm=self.norm)
            # HashingVectorizer needs an integer `n_features`: only
            # override its own default when a value was actually given.
            if self.n_features is not None:
                hasher_opts["n_features"] = self.n_features
            steps.append(
                ("hasher",
                 HashingVectorizer(**hasher_opts))
            )
            if use_idf:
                steps.append(
                    ("tfidf",
                     TfidfTransformer(
                       norm=self.norm
                       )
                    )
                )
        else:
            steps.append(
                ("tfidf",
                 TfidfVectorizer(
                     analyzer=self.analyzer,
                     max_df=self.max_df,
                     min_df=self.min_df,
                     # Honor `norm` here as well, as the hashing branch does
                     norm=self.norm,
                     stop_words=self.stop_words,
                     max_features=self.n_features,
                     use_idf=self.use_idf)
                )
            )

        super().__init__(steps=steps)


class Tokenizer(TransformerMixin):
    """
    Cleans raw documents into strings of space-separated tokens.

    Parameters:
        * max_paragraphs: number of leading lines kept per document
          (all of them when None)
        * stop_words: tokens to discard
    """

    # Compiled once at class creation instead of on every call. Only symbol
    # blocks are listed: the previous catch-all range U+24C2..U+1F251 also
    # covered CJK, Hangul and most other non-Latin letters, deleting them.
    _EMOJI_PATTERN = re.compile("["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U000024C2-\U00002BFF"  # enclosed letters, shapes, dingbats, arrows
        "\U00003030\U0000303D\U00003297\U00003299"  # CJK symbols used as emojis
        "\U0000FE00-\U0000FE0F"  # variation selectors
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters
        "\U0000200B-\U0000200D"  # zero width
        "]+", flags=re.UNICODE)

    def __init__(self, max_paragraphs=None, stop_words=None, **kwargs):
        self.max_paragraphs = max_paragraphs
        # A fresh list per instance instead of a shared mutable default.
        self.stop_words = stop_words if stop_words is not None else []

    def fit(self, X, y=None):
        """ Just returns class, nothing to fit. """
        return self

    def transform(self, X):
        """
        Returns one cleaned string per document in `X`.

        Documents are split into lines, limited to `max_paragraphs`,
        tokenized and joined back with spaces; non-string items
        yield an empty string.
        """
        return [
            " ".join(
                " ".join(self.tokenize(sent))
                for sent in (
                    x.split("\n")[:self.max_paragraphs]
                    if isinstance(x, str) else ""
                )
            )
            for x in X
        ]

    def tokenize(self, sentence: str):
        """
        Returns word tokens, cleared from emojis, accents and punctuation.

        Tokens are lowercased; those with up to two characters, stop
        words, numbers and tokens starting with an ignored prefix are
        discarded before accents and punctuation are removed.
        """
        # Hashed lookups instead of scanning the stop word list per token
        stop_words = frozenset(self.stop_words)
        ignored_prefixes = tuple(IGNORE_STARTS_WITH)

        return [
            x
            .replace("](", " ")  # split Markdown-style "[text](link)" pairs
            .translate(ACCENT_REPLACEMENTS)
            .translate(CHARACTER_REPLACEMENTS)
            for x in
                self.clear_emojis(sentence)
                .lower()
                .split()
            if
                len(x) > 2
            and
                x.strip(VALID_CHARACTERS) not in stop_words
            and
                not self.is_number(x)
            and
                not x.startswith(ignored_prefixes)
        ]

    @staticmethod
    def is_number(str_word):
        """
        Check string as an integer or float.
        """
        for cast in (int, float):
            try:
                cast(str_word)
            except (TypeError, ValueError, OverflowError):
                continue
            return True
        return False

    @staticmethod
    def clear_emojis(str_text, replace_with=r' '):
        """
        Returns string after clearing from emojis.

        Every run of emoji, symbol or zero-width characters
        is substituted by `replace_with`.
        """
        return Tokenizer._EMOJI_PATTERN.sub(replace_with, str_text)

    @staticmethod
    def ngrams(tokens: list, n=2):
        """
        Returns n-grams from list of tokens, keeping
        only those made of `n` distinct tokens.
        """
        if not tokens:
            return []
        return [g for g in nltk.ngrams(tokens, n) if len(set(g)) == n]


class Stemmer(TransformerMixin):
    """
    Returns list of documents with stemmed words.

    Parameters:
        * detect_lang: detect the language of sentences that
          come without a language code
        * ignore_stopwords: forwarded to the Snowball stemmer
    """

    def __init__(self, detect_lang=False, ignore_stopwords=True, **kwargs):
        self.detect_lang = detect_lang
        self.ignore_stopwords = ignore_stopwords

    def fit(self, X, y=None):
        """ Just returns class, nothing to fit. """
        return self

    def transform(self, X, y=None):
        """
        Returns word stems only from available languages.

        `y` holds one language code per document in `X`. It is
        optional, so the class can also be called as a regular
        transformer: documents then have no language assigned
        and rely on `detect_lang`.
        """
        pairs = zip(X, y) if y is not None else ((sent, None) for sent in X)
        return [
            "\n".join(
                [self._stem(_, lang) for _ in sent.split('\n')]
            )
            for sent, lang in pairs
        ]

    def _stem(self, sentence, lang=None):
        """
        Matches ISO 639-1 language code and returns stemmed words.
        Tries to detect the language if `lang` is None and `detect_lang`
        is set. Returns the sentence unchanged if no stemmer is available.
        """
        if (lang is None) and self.detect_lang:
            try:
                lang = lang_detect(sentence)
            except LangDetectException as e:  # No detected language
                log.debug(f"LangDetectException: {e}.")

        lang = ISO639.get(lang, lang)

        if lang in SnowballStemmer.languages:
            # Reuse one stemmer per language and setting rather
            # than building a new one for every sentence.
            cache = self.__dict__.setdefault("_stemmers", {})
            cache_key = (lang, self.ignore_stopwords)
            stemmer = cache.get(cache_key)
            if stemmer is None:
                stemmer = cache[cache_key] = SnowballStemmer(
                    language=lang, ignore_stopwords=self.ignore_stopwords)

            # NOTE(review): `VALID_CHARACTERS.split()` yields the whole string
            # as a single item, so only words starting with it are left out;
            # a per-character check was presumably intended -- confirm.
            return " ".join([
                stemmer.stem(w)
                for w in sentence.split()
                if not any(w.startswith(char)
                for char in VALID_CHARACTERS.split())
            ])

        log.debug(f"SnowballStemmer '{lang}' not found. Skipping...")
        return sentence


class KMeansCluster():
    """
    K-Means algorithm with optimal k-value validation
    through WCSS, silhouette scores and gap statistics.

    `n_clusters` may be an integer (fixed k), a list or range of
    candidates, or None to try `range(min_k, max_k)`. `validator`
    is "wcss", the name of a validation method of this class
    ("silhouette", "gap") or a function called as `f(self, X, km)`
    whose highest score selects the model.
    """
    def __init__(
        self,
        batch_size=1000,
        copy=False,
        init="k-means++",
        init_size=1000,
        knee_angle=60,
        max_iter=100,
        max_k=13,
        metric="euclidean",
        min_k=1,
        mini_batch=False,
        n_clusters=None,
        n_init=1,
        nrefs=3,
        random_state=None,
        sample_size=1000,
        validator="wcss",
        verbose=False,
        **kwargs
    ) -> None:
        self.batch_size = batch_size
        self.copy = copy
        self.init = init
        self.init_size = init_size
        self.knee_angle = knee_angle
        self.max_iter = max_iter
        self.max_k = max_k
        self.metric = metric
        self.min_k = min_k
        self.mini_batch = mini_batch
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.nrefs = nrefs
        self.random_state = random_state
        self.sample_size = sample_size
        self.validator = validator
        self.verbose = verbose

    def KMeans(self, n_clusters: int):
        """
        Returns classical or Mini-Batch K-Means implementation:
        https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf
        """
        if self.mini_batch:
            return MiniBatchKMeans(n_clusters=n_clusters,
                                   batch_size=self.batch_size,
                                   init=self.init,
                                   init_size=self.init_size,
                                   max_iter=self.max_iter,
                                   n_init=self.n_init,
                                   random_state=self.random_state,
                                   verbose=self.verbose)

        return KMeans(n_clusters=n_clusters,
                      init=self.init,
                      max_iter=self.max_iter,
                      n_init=self.n_init,
                      random_state=self.random_state,
                      verbose=self.verbose)

    def KMeansOptimal(self, X, n_clusters: Union[int, list, range, None], alg="wcss"):
        """
        Returns K-Means function with optimal number of clusters
        considering their within-cluster sum of square values (WCSS),
        an approximation of the 'elbow' approach for fast clustering.
        Allows a function as validator, e.g.: silhouette() and gap(),
        also accepted by name ("silhouette", "gap").

        Raises ValueError if the validator name is unknown or if no
        candidate k is small enough for the number of samples in `X`.
        """
        use_wcss = isinstance(alg, str) and alg == "wcss"

        # Resolve validators given by name to the matching method
        if isinstance(alg, str) and not use_wcss:
            validator = getattr(type(self), alg, None)
            if not callable(validator):
                raise ValueError(
                    f"Unknown validator: '{alg}'. Available choices: ('wcss', 'silhouette', 'gap') or a function.")
            alg = validator

        kms = {}
        wcss = {}
        km = None
        x_diff = 1

        for k in (n_clusters if n_clusters else range(self.min_k, self.max_k)):

            # There cannot be more clusters than samples
            if k > X.shape[0]:
                continue

            km = self.KMeans(k).fit(X)
            wcss[k] = km.inertia_/X.shape[0]
            kms[km] = wcss[k] if use_wcss else alg(self, X, km)
            log.info(f"k={k}: {'{:.05f}'.format(kms[km])} ({alg if isinstance(alg, str) else alg.__name__})")

            if use_wcss:
                # Angle of the segment between consecutive WCSS values:
                # the search stops once the curve flattens past the knee
                y_diff = wcss.get(k-1, wcss[k]) - wcss[k]
                radians = atan2(x_diff, y_diff*100)
                angle = (radians * 180) / pi

                if self.knee_angle < angle < 90:
                    log.info(f"Stopped at k={k} (angle: {'{:.02f}'.format(angle)}º).")
                    return km

        if km is None:
            raise ValueError(
                f"No candidate number of clusters is valid for {X.shape[0]} sample(s).")

        # Return maximum k-value if validator is WCSS (knee/angle)
        if not use_wcss:
            km = max(kms, key=lambda key: kms[key])
        log.info(f"Optimal number of clusters set as k={km.n_clusters}.")
        return km

    def fit(self, X, y=None):
        """
        Executes K-Means clustering algorithm on vectorized text.
        """
        self.kmeans_ = (
            self.KMeans(self.n_clusters).fit(X)
            if isinstance(self.n_clusters, (int, np.integer))
            else self.KMeansOptimal(X, self.n_clusters, self.validator)
        )
        return self

    def fit_predict(self, X, y=None):
        """ Returns clusters as predicted by K-Means. """
        return self.fit(X).kmeans_.predict(X)

    def fit_transform(self, X, y=None):
        """ Returns data as transformed by K-Means. """
        return self.fit(X).kmeans_.transform(X)

    def predict(self, X, y=None):
        """ Returns clusters as predicted by K-Means. """
        return self.kmeans_.predict(X)

    def transform(self, X, y=None):
        """ Returns data as transformed by K-Means. """
        return self.kmeans_.transform(X)

    @staticmethod
    def silhouette(self, X, km):
        """
        Implementation of silhouette coefficients:
        https://doi.org/10.1016/0377-0427(87)90125-7

        Static method that takes the instance explicitly, so it
        can be passed around as a validator function.
        """
        return silhouette_score(X, km.labels_,
                                metric=self.metric,
                                sample_size=self.sample_size,
                                random_state=self.random_state)

    @staticmethod
    def gap(self, X, km):
        """
        Implementation of gap statistics:
        https://statweb.stanford.edu/~gwalther/gap

        Static method that takes the instance explicitly, so it
        can be passed around as a validator function. Reference
        data is drawn uniformly from [0, 1) with the shape of `X`.
        """
        seed = self.random_state
        # Honor `random_state` when set; otherwise keep the global generator
        rng = (
            np.random if seed is None
            else seed if isinstance(seed, np.random.RandomState)
            else np.random.RandomState(seed)
        )

        orig_disp = km.inertia_
        ref_disp = np.zeros(self.nrefs)

        for i in range(self.nrefs):
            r = rng.random_sample(size=X.shape)
            ref_disp[i] = self.KMeans(km.n_clusters).fit(r).inertia_

        return np.log(np.mean(ref_disp)) - np.log(orig_disp)


class Clusterer(Pipeline):
    """
    Clusterer pipeline with support for Pandas data frames
    and series, hashing and text vectorizing (Tf-Idf) and
    optional truncated singular value decomposition (LSA).

    Parameters:
        * clustering: clustering class or instance used as last
          step (None to skip clustering)
        * use_pandas: load input through PandasTransformer
        * use_svd: add the LSA step; "try" falls back to no
          reduction if the data has too few features
        * use_vect: add the TextVectorizer step

    Remaining keyword arguments are forwarded to every step.
    """
    def __init__(
        self,
        clustering: Callable[[list], list] = KMeansCluster,
        use_pandas: bool = True,
        use_svd: bool = False,
        use_vect: bool = True,
        **kwargs,
    ):
        steps = []

        self.clustering = clustering
        self.use_pandas = use_pandas
        self.use_svd = use_svd
        self.use_vect = use_vect

        if self.use_pandas:
            steps.append(
                ("pandas",
                    PandasTransformer(**kwargs))
            )
        if self.use_vect:
            steps.append(
                ("vect",
                    TextVectorizer(**kwargs))
            )
        if self.use_svd:
            steps.append(
                ("lsa",
                    LSA(**kwargs))
            )
            if self.use_svd == "try":
                # Swap in the fitting method that tolerates LSA failures
                self.fit = self.__fit

        if self.clustering is not None:
            steps.append(
                ("cluster",
                    clustering(**kwargs)
                    if clustering == KMeansCluster
                    else clustering)
            )

        super().__init__(steps=steps)

    def fit_predict(self, X, y=None):
        """ Fits the pipeline and returns the predicted clusters. """
        return self.fit(X).predict(X)

    def fit_transform(self, X, y=None):
        """ Fits the pipeline and returns the transformed data. """
        return self.fit(X).transform(X)

    def labels(self, name=None):
        """
        Returns the cluster labels as a Pandas Series named `name`,
        indexed like the loaded data if the pipeline has a
        "pandas" step.

        Raises TypeError without a clustering step and
        NotFittedError before fitting.
        """
        cluster = self._fitted_cluster()

        if "labels_" in cluster.__dict__.keys():
            return pd.Series(
                pd.to_numeric(
                    cluster.labels_,
                    downcast="integer",
                ),
                index=self.named_steps.pandas.index_ if "pandas" in self.named_steps else None,
                name=name,
            )

        raise NotFittedError(
            "This instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

    def nearest_features(self, max_items=10, unique=False):
        """
        Returns a dictionary with the `max_items` features closest
        to each cluster center; `unique` keeps only the features
        listed for a single cluster.

        Raises TypeError without a clustering or vectorizer step,
        NotImplementedError for non-word analyzers or clustering
        methods without centers, and NotFittedError before fitting.
        """
        cluster = self._fitted_cluster()

        if "vect" not in self.named_steps:
            raise TypeError(
                "This instance does not have an assigned vectorizer method in its pipeline.")

        if self.named_steps.vect.analyzer != "word": # ("char", "char_wb")
            raise NotImplementedError(
                f"Method not implemented for vectorized n-grams (analyzer='{self.named_steps.vect.analyzer}').")

        # `get_feature_names` was removed from recent scikit-learn releases
        # in favor of `get_feature_names_out`: use whichever is available.
        vectorizer = self.named_steps.vect["tfidf"]
        get_names = getattr(vectorizer, "get_feature_names_out", None)
        feature_names = get_names() if get_names is not None else vectorizer.get_feature_names()

        if "cluster_centers_" not in cluster.__dict__.keys():
            if "labels_" in cluster.__dict__.keys():
                raise NotImplementedError(
                    f"Method not implemented for assigned clustering method (cluster='{type(self.named_steps.cluster).__name__}').")
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        # Centers found in the reduced space are mapped back to
        # the feature space before ranking features by weight
        order_centroids = (
            self.named_steps.lsa["svd"].inverse_transform(
                cluster.cluster_centers_
            )
            if "lsa" in self.named_steps.keys()
            and self.named_steps.lsa != "passthrough"
            else cluster.cluster_centers_
        ).argsort()[:, ::-1]

        k_nearest = {
            k: [feature_names[x] for x in order_centroids[k][:max_items]]
            for k in range(cluster.n_clusters)}

        if unique:
            counter = pd.Series(
                [f for features in k_nearest.values() for f in features])\
                .value_counts()
            k_nearest = {
                k: [f for f in features if counter[f] == 1]
                for k, features in k_nearest.items()}

        return k_nearest

    def top_features(self):
        """ Returns a dictionary with the nearest feature of each cluster. """
        return {
            k: features[0]
            for k, features in
            self.nearest_features(
                max_items=1,
                unique=False,
            ).items()
        }

    def _fitted_cluster(self):
        """ Returns the clustering step, unwrapped to its fitted K-Means model if any. """
        if "cluster" not in self.named_steps.keys():
            raise TypeError(
                "This instance does not have an assigned clustering method in its pipeline.")

        cluster = self.named_steps.cluster
        return cluster.kmeans_ if "kmeans_" in cluster.__dict__.keys() else cluster

    def __fit(self, X, y=None):
        """
        Fits the pipeline, replacing the LSA step by "passthrough"
        if the data has fewer features than requested components.
        """
        try:
            return super().fit(X)
        except ValueError as e:
            # Recommended >= 100 features for Latent Semantic Analysis (SVD).
            # The exact wording of the error differs between scikit-learn
            # releases, so only the two names involved are matched.
            message = str(e)
            if "n_components" in message and "n_features" in message:
                log.warning(
                    f"LSA set as 'passthrough': {e}"
                )
                self.steps[
                    list(self.named_steps.keys()).index("lsa")
                ] = ("lsa", "passthrough")
                return super().fit(X)
            raise

### Artifact generation

In [None]:
class Plot():
    """
    Collection of static helpers to build Plotly figures,
    traces and layouts with the defaults set in this file.
    """

    def __init__(self):
        """ Initializes class. """

    @staticmethod
    def plot(
        data: Union[dict, list, pd.DataFrame, pd.Series],
        x: Union[str, int, None] = None,
        y: Union[str, int, None] = None,
        graph: str = "scatter",
        layout: str = "layout",
        layout_opts: dict = None,
        name: dict = None,
        size: dict = None,
        text: Union[dict, list] = None,
        resizer: Callable[[float], float] = lambda x: x,
        **opts,
    )-> go.Figure:
        """
        Returns a Plotly graph object figure from a dictionary,
        a list of categories or a Pandas data frame or series.

        Input parameters:
            * data: single trace, list of traces or dictionary of
              named traces (dictionaries, data frames or series)
            * x: data frame column for the horizontal axis
            * y: data frame column for the vertical axis
            * graph: name of the method that builds each trace
            * layout: name of the method that builds the layout
            * layout_opts: arguments for the layout method
            * name: trace names by trace key
            * size: marker sizes by trace key
            * text: trace text, optionally by trace key
            * resizer: function applied to marker sizes

        Remaining arguments are forwarded to the trace method.
        Raises RuntimeError for data frames without `y`.
        """
        # Fresh dictionaries per call instead of shared mutable defaults
        layout_opts = {} if layout_opts is None else layout_opts
        name = {} if name is None else name
        size = {} if size is None else size

        layout = getattr(Plot, layout)(**layout_opts)

        # Compare against None so falsy column labels (e.g. 0) are accepted
        if y is None and isinstance(data, pd.DataFrame):
            raise RuntimeError(
                "Missing required 'y' attribute for building Plotly figures from Pandas.DataFrame objects.")

        # Pair every trace with the key used to look up its options
        if (
            isinstance(data, dict)
            and len(data)
            and type(list(data.values())[0]) in (dict, pd.DataFrame, pd.Series)
        ):
            traces = data.items()
        elif isinstance(data, list):
            traces = enumerate(data)
        else:
            traces = [(None, data)]

        build_trace = getattr(Plot, graph)
        figure_data = []

        for index, trace in traces:
            if isinstance(trace, dict):
                x_values = list(trace.keys())
                y_values = list(trace.values())
            else:
                is_frame = isinstance(trace, pd.DataFrame)
                x_values = trace[x].values if x is not None and is_frame else trace.index
                y_values = trace[y].values if is_frame else trace.values

            figure_data.append(
                build_trace(
                    x=x_values,
                    y=y_values,
                    name=name.get(index, index),
                    size=resizer(size.get(index, MARKER_SIZE)),
                    text=text.get(index, "") if isinstance(text, dict) else text,
                    **opts,
                )
            )

        return go.Figure(data=figure_data, layout=layout)

    @staticmethod
    def subplots(
        data: dict,
        graph: str = "scatter",
        height: int = 768,
        orient: str = "ver",
        layout: str = "layout",
        layout_opts: dict = None,
        rows: int = None,
        cols: int = None,
        title: str = None,
        **opts,
    ) -> go.Figure:
        """
        Returns a Plotly figure with subplots.

        Input parameters:
            * data: dictionary of named traces (dictionaries)
            * graph: name of the method that builds each trace
            * height: figure height
            * orient: fill the grid by column ("ver") or row ("hor")
            * layout: accepted for symmetry with plot(); not used
            * layout_opts: arguments to update the figure layout
            * rows, cols: grid size, computed from the data if unset
            * title: figure title, unless set in `layout_opts`

        Remaining arguments are forwarded to the trace method.
        Raises ValueError for an invalid `orient`.
        """
        layout_opts = {} if layout_opts is None else layout_opts
        cursor = [0, 0]

        if orient not in ("ver", "hor"):
            # Raised: the error used to be returned in place of a figure
            raise ValueError(
                f"Received invalid orient parameter: {orient}. Available choices: ('hor', 'ver')."
            )
        pointer = (0 if orient == "ver" else 1)
        cursor[(1 if orient == "ver" else 0)] += 1

        # Grid size, rounded up so every trace gets a cell
        if cols is None:
            # `or 1` also keeps empty data from dividing by zero
            cols = (len(data)/(rows or len(data) or 1)) or 1
            cols = int(cols) + (1 if float(cols) != int(cols) else 0)

        if rows is None:
            rows = (len(data)/cols) or 1
            rows = int(rows) + (1 if float(rows) != int(rows) else 0)
        limit = (rows if orient == "ver" else cols)

        build_trace = getattr(Plot, graph)
        fig = make_subplots(rows=rows, cols=cols)

        for key, trace in reversed(data.items()):
            cursor[pointer] += 1
            fig.append_trace(
                build_trace(
                    list(trace.keys()),
                    list(trace.values()),
                    name=key,
                    **opts,
                ),
                row=cursor[0],
                col=cursor[1],
            )
            # End of the current column/row: wrap around to the next one
            if cursor[pointer] == limit:
                cursor[pointer] = 0
                cursor[pointer-1] += 1

        fig.update_layout({
            "height": height,
            **({"title": title} if title is not None else {}),
            **layout_opts,
        })
        return fig

    @staticmethod
    def bar(x, y, **opts):
        """
        Returns Plotly bar trace.

        Input parameters:
            * x: list of values for horizontal axis
            * y: list of values for vertical axis
            * name: to include in point information
            * text: to include in trace information

        Any other argument is accepted and ignored.
        """
        return go.Bar(
            x=x,
            y=y,
            name=opts.get("name", ""),
            text=opts.get("text", ""),
            textfont=dict(
                family=FONT_FAMILY
            ),
        )

    @staticmethod
    def layout(title="", x_title="", y_title="", **opts):
        """
        Returns Plotly.Figure() layout object.

        Input parameters:
            * title: Plotly figure title
            * x_title: horizontal axis title
            * y_title: vertical axis title

        Remaining arguments are forwarded to the layout.
        """
        return go.Layout(
            xaxis=dict(
                autorange=AUTORANGE,
                title=x_title,
                ),
            yaxis=dict(
                autorange=AUTORANGE,
                title=y_title,
                ),
            legend=dict(
                y=LEGEND_Y,
                font=dict(
                    family=FONT_FAMILY,
                    size=FONT_SIZE,
                    color=FONT_COLOR,
                    ),
                ),
            title=title,
            **opts,
        )

    @staticmethod
    def scatter(x, y, mode="lines+markers", size=None, name="", text="", **opts):
        """
        Returns Plotly 2-dimensional scatter.

        Input parameters:
            * x: list of values for horizontal axis
            * y: list of values for vertical axis
            * mode: "lines", "markers" or both (default)
            * size: marker size (MARKER_SIZE if unset)
            * name: to include in point information
            * text: to include in trace information

        Remaining arguments are forwarded to the trace.
        """
        return go.Scatter(
            x=x,
            y=y,
            name=name,
            text=text,
            mode=mode,
            connectgaps=CONNECT_GAPS,
            textposition=TEXT_POSITION,
            textfont=dict(
                family=FONT_FAMILY,
                ),
            marker=dict(
                size=size if size is not None else MARKER_SIZE,
                ),
            **opts,
            )


class Artifacts():
    """
    Collection of artifact builders.

    Each public static method returns a dictionary whose keys are
    artifact names. Keep helper names private (leading underscore):
    public names of this class are looked up by name elsewhere.
    """

    @abstractmethod
    def __init__(self, **kwargs):
        """ Abstract initializer. """

    @staticmethod
    def attention_artifact(dates: pd.Series) -> dict:
        """
        Counts the rows found for each date.

        Returns both the Plot.plot() output ("attention") and
        the underlying date -> hits dictionary ("attention-data").
        """
        hits_per_date = dates.value_counts().sort_index().to_dict()

        plotted = Plot.plot(
            hits_per_date,
            fill="tonexty",
            layout_opts={
                "title": "Attention over time",
                "x_title": "Date",
                "y_title": "Hits",
            },
        )
        return {"attention": plotted, "attention-data": hits_per_date}

    # Disabled artifact, kept for reference:
    #
    # @staticmethod
    # def attention_ids_artifact(dates: pd.Series) -> dict:
    #     return {
    #         "attention-ids":
    #             dates\
    #             .groupby(dates)\
    #             .apply(lambda x: x.index.tolist())\
    #             .to_dict(),
    #     }

    @staticmethod
    def themes_artifact(
        dates: pd.Series,
        clusters: pd.Series,
        clusters_top_features: dict,
    ) -> dict:
        """
        Counts the rows found for each date and cluster.

        Clusters are labelled with their entry in *clusters_top_features*,
        falling back to "Theme #<cluster+1>". Returns the Plot.plot()
        output ("themes") and the label -> {date: hits} data ("themes-data").
        """
        hits_per_cluster = clusters\
            .groupby([dates, clusters])\
            .size()\
            .unstack()\
            .fillna(0)\
            .to_dict()

        themes = {}
        for key, values in hits_per_cluster.items():
            label = clusters_top_features.get(key, f"Theme #{key+1}")
            themes[label] = values

        plotted = Plot.plot(
            themes,
            mode="lines",
            stackgroup="one", # "relative",
            layout_opts={
                "title": "Theme attention over time",
                "x_title": "Date",
                "y_title": "Hits",
            },
        )
        return {"themes": plotted, "themes-data": themes}

    @staticmethod
    def themes_ids_artifact(
        dates: pd.Series,
        clusters: pd.Series,
        clusters_top_features: dict,
    ) -> dict:
        """
        Lists, for each cluster label, the index values of
        the rows assigned to that cluster, grouped by date.
        """
        ids_per_theme = {}
        for key in sorted(clusters.unique()):
            label = clusters_top_features.get(key, f"Theme #{key+1}")
            members = clusters[clusters == key]
            ids_per_theme[label] = members\
                .groupby(dates)\
                .apply(lambda x: x.index.tolist())\
                .to_dict()

        return {"themes-ids": ids_per_theme}

    @staticmethod
    def top_words_artifact(
        clusters_top_words: dict,
    ) -> dict:
        """
        Converts each series of top words to a dictionary, keyed
        by the first three index entries of the series joined by spaces.
        """
        top_words = {}
        for series in clusters_top_words.values():
            top_words[" ".join(series.index[:3])] = series.to_dict()

        return {"top-words": top_words}


class ArtifactGenerator(Artifacts):
    """
    Tokenizes, optionally stems and clusters the text of a data
    frame, then builds the requested artifacts from the results.
    """

    def __init__(
        self,
        func_cluster: Callable[[pd.DataFrame], pd.Series],
        **kwargs,
    ) -> None:
        """
        Input parameters:
            * func_cluster: callable receiving the (optionally stemmed)
                text and returning the cluster assigned to each row
            * kwargs: passed to both Stemmer() and Tokenizer()
        """
        self.cluster_ = func_cluster
        self.stemmer_ = Stemmer(**kwargs).transform
        self.tokenizer_ = Tokenizer(**kwargs).transform

    def __call__(
        self,
        df: pd.DataFrame,
        artifacts: list = None,
        attr_date: str = "timestamp",
        attr_index: str = "id",
        attr_lang: str = "language",
        attr_text: Union[str, list] = "text",
        datetime_format: str = None,
        datetime_unit: str = None,
        include_keywords: bool = True,
        n_grams: int = 2,
        sort_by: Union[list, None] = ["like_count", "repost_count"],
    ) -> dict:
        """
        Builds artifacts from the rows in *df*.

        Input parameters:
            * df: data frame to process (left untouched)
            * artifacts: names of the artifact methods to run
                (default: every public name declared in Artifacts)
            * attr_date: column holding dates
            * attr_index: column holding row identifiers
                (the data frame index is used if missing)
            * attr_lang: column holding languages, used for stemming
            * attr_text: column holding text, or list of columns
                whose string values are joined by line breaks
            * datetime_format: passed to pd.to_datetime() as "format"
            * datetime_unit: passed to pd.to_datetime() as "unit"
            * include_keywords: whether to compute top features
            * n_grams: size of n-grams for top features
            * sort_by: columns whose row-wise sum orders the
                rows (descending) before processing

        Returns a dictionary of artifacts; those whose name ends
        with "-ids" are passed through REINDEX_MAP_FUNC.
        """
        t0 = time()

        sort_columns = [x for x in (sort_by or []) if x in df.columns]

        if sort_columns:
            df = df\
                .fillna(
                    {x: 0 for x in sort_columns}
                )\
                .loc[
                    df[sort_columns]
                    .sum(axis=1)
                    .sort_values(ascending=False)
                    .index
                ]

        index = pd.Series(
            df.loc[:, attr_index].values
            if attr_index and attr_index in df.columns\
            else df.index
        )
        # Switch to positional labels on a new object: assigning to
        # df.index would also relabel the data frame held by the caller.
        df = df.reset_index(drop=True)

        dates = (pd
            .to_datetime(
                df.loc[:, attr_date],
                format=datetime_format,
                infer_datetime_format=False if datetime_format or datetime_unit else True,
                unit=datetime_unit,
            ).apply(
                lambda x: x.strftime("%Y-%m-%d")
            )
        ) if attr_date in df.columns else pd.Series(name=attr_date)

        # A list is unhashable, so "attr_text in df.columns" only
        # works for a single column name; resolve both cases here.
        if isinstance(attr_text, str):
            text_columns = [attr_text] if attr_text in df.columns else []
        else:
            text_columns = [attr for attr in attr_text if attr in df.columns]

        if not text_columns:
            text = pd.Series(name=attr_text if isinstance(attr_text, str) else None)
        elif isinstance(attr_text, str):
            text = pd.Series(
                self.tokenizer_(
                    df.loc[:, attr_text].apply(lambda x: x if x else "")
                ),
                index=df.index,
            )
        else:
            text = pd.Series(
                self.tokenizer_([
                    "\n".join(value for value in row if isinstance(value, str))
                    for row in zip(*[df[attr] for attr in text_columns])
                ]),
                index=df.index,
            )

        print(
            "Read %s total or %s unique vectors." % (
            text.shape[0],
            text.drop_duplicates().dropna().shape[0],
            )
        )

        clusters = pd.Series(
            self.cluster_(
                self.stemmer_(text, df[attr_lang])
                if attr_lang in df.columns
                else text
            ) if text.dropna().shape[0] else [],
            index=df.index,
        )

        clusters_top_words = self.__get_features(
            text.loc[clusters.index],
            clusters,
        )

        clusters_top_features = self.__get_features(
            text.loc[clusters.index],
            clusters,
            n_grams=n_grams,
            top_features=True,
            tfidf=True,
        ) if include_keywords else {}

        # Artifact functions receive the local variables named after
        # their parameters (e.g. "dates", "clusters"): do not rename them.
        local = locals()
        artifacts_dict = {}

        for artifact in (artifacts or [x for x in Artifacts.__dict__.keys() if not x.startswith("_")]):
            func = getattr(self, artifact)
            print(artifact)
            artifacts_dict.update(
                func(*[local.get(x) for x in signature(func).parameters if x in local])
            )
        print(f"Finished in {time()-t0:.3f}s.")

        return {k: REINDEX_MAP_FUNC(v, index)
                if k.endswith("-ids") else v
                for k, v in artifacts_dict.items()}

    @staticmethod
    def __get_features(
        series: pd.Series,
        groupby: pd.Series = None,
        map_func: Callable[[str], str] = LABEL_MAP_FUNC,
        ignore_startswith: list = ["@","#"],
        n_grams: int = 1,
        normalized: bool = False,
        tfidf: bool = False,
        top_features: bool = False,
    ) -> dict:
        """
        Counts the features (words or n-grams) found in each group.

        Input parameters:
            * series: whitespace-separated text, one entry per row
            * groupby: group of each row (a single group if omitted)
            * map_func: applied to the selected top features
            * ignore_startswith: prefixes of words to leave out
            * n_grams: size of n-grams (1 to count single words)
            * normalized: divide the values of each group by their maximum
            * tfidf: weight counts by their frequency across groups
            * top_features: return the leading features of each
                group (mapped by *map_func*) instead of the values

        Returns a dictionary keyed by group, or the entry
        of the single group when *groupby* is omitted.
        """
        if not series.shape[0]:
            return dict()

        groups = series\
            .groupby(groupby if groupby is not None else [0] * series.shape[0])\
            .apply(lambda x: list(set(x.index.tolist())))

        # str.startswith() accepts a tuple of prefixes.
        prefixes = tuple(ignore_startswith)

        features = {
            group:
                series
                .loc[groups[group]]
                .astype(str)
                .apply(lambda x: [w for w in x.split() if not w.startswith(prefixes)])
                .apply(lambda x: x if n_grams == 1 else Tokenizer.ngrams(x, n_grams))
                .apply(lambda x: list(set(x)) if tfidf else x)
                .explode()
                .dropna()
                .value_counts()
            for group in groups.index
        }

        if tfidf: # tf(t,d)/log(N/df(t))
            N = series.shape[0]
            doc_freq = reduce(lambda x, y: x.add(y, fill_value=0), features.values())
            features = {
                group:
                    (tf/tf.max())
                    .divide(
                        (doc_freq.loc[tf.index]/doc_freq.loc[tf.index].max())
                        .apply(lambda x: log10(N/(x+1)))
                    )
                    .sort_values(ascending=False)
                for group, tf in features.items()
            }

        if normalized:
            # Series.apply() hands "axis" over to the function, which
            # made the previous element-wise lambda fail; scale the
            # whole series by its maximum instead.
            features = {
                group:
                    values/values.max()
                for group, values in features.items()
            }

        if top_features:
            features = pd\
                .Series({
                    group:
                        values.index[:(len(groups)+1)]
                    for group, values in features.items()})\
                .explode()\
                .drop_duplicates(keep="first")
            features = features\
                [~features.index.duplicated()]\
                .dropna()\
                .map(map_func)\
                .to_dict()

        return features if groupby is not None else features[0]

    @staticmethod
    def __find_all(x, regexp) -> list:
        """
        Returns the matches of *regexp* in lowercased *x* which are
        longer than one character (none if *x* is not a string).

        Intended e.g. for mentions, which are preceded by an @-sign and
        may include letters, numbers and underscores to a max. of 30 chars.
        """
        found = findall(regexp, x.lower()) if isinstance(x, str) else []
        return [match for match in found if len(match)>1]

def plot_artifacts(artifacts):
    """
    Plot all possible artifacts.

    Each artifact is first handed to py.iplot() as is; if that fails,
    its values (for dictionary-like artifacts) are plotted one by one.
    Plotting is best effort: artifacts which cannot be plotted are skipped.
    """
    for name, artifact in artifacts.items():
        try:
            py.iplot(artifact)
        except Exception:
            # Not plottable as a whole: try its items instead.
            try:
                for data in artifact.values():
                    py.iplot(data)
            except Exception:
                log.debug("Skipping %s...", name)


def write_artifacts(df, artifacts, output_folder="artifacts", indent=2):
    """
    Store artifacts in a readable or presentable format.

    * figures are written as PNG, HTML and JSON files;
    * artifacts named "*-ids" select rows of *df* by label and
      write them as CSV files, one per key, in their own subfolder;
    * any other artifact is written item by item, figures
      as above and everything else as a CSV file.

    Note that *indent* is currently not used.
    """
    def _write_figure(figure, stem):
        figure.write_image(os.path.join(output_folder, f"{stem}.png"))
        figure.write_html(os.path.join(output_folder, f"{stem}.html"))
        figure.write_json(file=os.path.join(output_folder, f"{stem}.json"), pretty=True)

    os.makedirs(output_folder, exist_ok=True)

    for name, artifact in artifacts.items():

        if isinstance(artifact, Figure):
            _write_figure(artifact, name)
            continue

        if name.endswith("-ids"):
            subfolder = f"{output_folder}/{name.replace('-ids', '')}"

            for k, v in artifact.items():
                # created on demand, so empty artifacts leave no folder
                os.makedirs(subfolder, exist_ok=True)

                if isinstance(v, dict):
                    labels = set([i for ids in v.values() for i in ids])
                else:
                    labels = set([i for i in v])

                df.loc[list(labels)].to_csv(f"{subfolder}/{k}.csv")
            continue

        for key, item in artifact.items():
            if isinstance(item, Figure):
                _write_figure(item, f"{name}-{key}")
            else:
                pd.Series(item).to_csv(f"{output_folder}/{name}-{key}.csv")

### PyPhi wrapper

In [None]:
class PyPhiNetwork():

    def __init__(self):
        """ Initializes class. """

    def __call__(
        self,
        matrix,
        A=None,
        states=None,
        current_state=None,
        num_states=None,
        output_folder="artifacts",
    ) -> tuple:
        """
        Calculates phi and builds PyPhi network.

        Input parameters:
            * matrix: stochastic matrix containing transition values
                as a dictionary of keys or an array of arrays
            * A: adjacency matrix for connecting states
                (default: every state connected to all states)
            * states: list of states (strings) to use as labels
                (default: dictionary keys or positions in matrix)
            * current_state: observed states in the system
                (default: last value of each row in matrix)
            * num_states: maximum number of observed states
            * output_folder: where to write the "pyphi.txt" report

        Returns a tuple with the complex of greatest phi and the
        list of all complexes, sorted by phi in descending order.
        """
        # Work on copies: these lists are completed below, which must
        # neither change the caller's objects nor leak between calls.
        A = [list(row) for row in A] if A is not None else []
        states = list(states) if states is not None else []
        current_state = list(current_state) if current_state is not None else []

        if isinstance(matrix, dict):
            if not states:
                states = list(matrix.keys())[:num_states]
            matrix = list(matrix.values())[:num_states]

        if num_states:
            matrix = matrix[:num_states]
            A = [row[:num_states] for row in A[:num_states]]
            states = states[:num_states]

        clusters = str(len(matrix))
        timeslices = str(len(matrix[0]))

        # state labels
        if not states:
            states = [str(i) for i in range(len(matrix))]

        # most recent state
        if not current_state:
            current_state = [x[-1] for x in matrix]

        # connectivity matrix
        if not A:
            A = [[1] * len(matrix) for _ in matrix]

        # connected subgraphs
        sg = self.get_connected_states(A)

        # influence matrix based on state transitions
        im = self.generate_influence_matrix(matrix, A)

        # transition probability matrix
        tpm = self.generate_tpm(matrix, im)

        # generate network with PyPhi
        network = pyphi.network.Network(tpm, cm=A)

        # generate ALL COMPLEXES structures of the network
        # may take a lot of time depending on the number of clusters
        ac = pyphi.compute.network.all_complexes(network, current_state)

        # sort complexes by phi value and select
        # the complex with the greatest value
        ac = sorted(ac, key=self.get_phi, reverse=True)
        bc = ac[0] # <-- best complex

        # best complex mechanisms and phi values
        ces = bc.ces

        # store PyPhi results as output string
        output = str("Clusters: "+str(clusters)+
                    "\nTimeslices: "+str(timeslices)+
                    "\n\nStates:\n"+str(states)+
                    "\n\nPresence Matrix:\n"+str(np.array(matrix))+
                    "\n\nCurrent state:\n"+str(current_state)+
                    "\n\nAdjacency Matrix:\n"+str(np.array(A))+
                    "\n\nConnected states:\n"+str(sg)+
                    "\n\nInfluence Matrix:\n"+str(np.array(im)).replace("None","-")+
                    "\n\nTransition Probability Matrix:\n"+str(np.array(tpm))+
                    "\n\nBest complex mechanisms and phi values:")

        for i, concept in enumerate(ces):
            output += str("\n( %s ) Mechanism = %s φ = %s" % (i, list(concept.mechanism), concept.phi))

        # write to output file; the report contains non-ASCII
        # characters, so the encoding is set explicitly
        os.makedirs(output_folder, exist_ok=True)
        with open(f"{output_folder}/pyphi.txt", "w", encoding="utf-8") as f:
            f.write(output)

        # print to standard output
        print(output)

        # write base view JSON file for PyPhi
        # self.create_view_base(bc, states, current_state, A)

        return bc, ac

    def complex_mechanism(self, c, m=None, output_folder="artifacts"):
        """
        Prints the cause and effect of a mechanism in complex (c) and
        plots their repertoires as bar charts, also written to
        *output_folder* as "pyphi-<cause|effect>.html" and ".png".

        If no mechanism index (m) is given, the
        mechanism of greatest phi is selected.
        """
        structure = c.ces # cause effect-structure

        if m is None:
            structure = sorted(structure, key=lambda x: x.phi, reverse=True)
            m = 0

        mechanism = structure[m]

        for i in ("cause", "effect"):
            print(f"\n---\n{i.capitalize()}:\n---\n")

            value = getattr(mechanism, i)
            purview_size = len(list(value.purview))

            print("\nMaximally-irreducible", i, "φ =", value.phi)
            print("\nMechanism:", value.mechanism)
            print("Purview:", value.purview)
            print(f"\nMIP:\n{value.mip!s}")
            print("\nRepertoire:")

            # one probability for each of the 2^n purview states
            y = list(value.repertoire.reshape(1, pow(2, purview_size))[0])
            x = [
                "state-" + "".join(map(str, self.create_bin_array(j, purview_size)))
                for j in range(len(y))
            ]

            fig = go.Figure(
                data=[go.Bar(x=x, y=y)],
                layout=go.Layout(
                    autosize=False,
                    width=800,
                    height=300,
                    title=f"Repertoire: Maximally-irreducible {i} φ = {value.phi} (Mechanism: {value.mechanism})",
                ),
            )
            fig.write_html(f"{output_folder}/pyphi-{i}.html")
            fig.write_image(f"{output_folder}/pyphi-{i}.png")
            fig.show(renderer="colab")
    
    def create_view_base(self, complex, states, current_state, A):
        """
        Writes output JSON file ("view.json") for PyPhi view.

        Input parameters:
            * complex: to get subsystems
            * states: as a list of strings
            * current_state: in current tX
            * A: adjacency matrix
        """
        # node names are parsed from the text form of the subsystem
        subsystem = str(complex.subsystem)[10:-1].replace(" ", "").split(",")

        nodes = self.get_graph_nodes(states, subsystem, current_state)
        edges = self.get_graph_edges(states, A)

        view = {
            "states": states,
            "graph": {
                "nodes": nodes,
                "edges": edges,
            },
            "sia": {
                "bigPhi": complex.phi,
                "subsystem": str(complex.subsystem),
                "ces": [],
            },
        }

        view["sia"]["ces"].extend(
            self.get_CES_JSON_object(concept) for concept in complex.ces
        )

        with open("view.json", "w") as j:
            json.dump(view, j)

    def distance(self, o1, o2):
        """
        Returns centroid distance.
        """
        return 1 - self.semantic_similarity_number(o1, o2)

    def filter_connected_states(self, P, A=[], states_to_remove=[]):
        """
        Returns a subset of connected states in matrices.
        """
        if not states_to_remove:
            return (P, A)

        # get list of states to remove
        if isinstance(states_to_remove, str):
            states_to_remove = states_to_remove.split(",")

            try:
                states_to_remove = [int(x) for x in states_to_remove]
            except ValueError:
                if isinstance(P, dict):
                    states_to_remove = [list(P.keys()).index(x) for x in states_to_remove]
                else:
                    raise TypeError("expected a dictionary to match list of states as keys")

        # check list of states to remove
        if not isinstance(states_to_remove, list):
            raise TypeError("expected a list of states (indices or dictionary keys) as input")

        keys = list(P.keys())
        P = {keys[i]:value for i, value in enumerate(P.values()) if i not in states_to_remove}
        return (P, self.filter_matrix(A, states_to_remove) if A else [])

    def filter_matrix(self, matrix, states_to_remove=[]):
        """
        Returns matrix without specified items to remove.
        """
        return [[v for k, v in enumerate(matrix[i]) if k not in states_to_remove] for i in range(len(matrix)) if i not in states_to_remove]

    def generate_distances_matrix(self, centroids, threshold="median"):
        """
        Returns adjacency matrix based on centroids distance (median/mean):
        * if higher than threshold, nodes will not be connected (0);
        * if lower or equal, nodes will be connected (1).
        Note that a centroid will always be nearest its own cluster.
        """
        n = len(centroids)
        matrix = np.zeros(shape=(n,n))
        values = []

        for i,_ in enumerate(centroids):
            for j in range(int(i), len(centroids)):
                d = self.distance(centroids[i], centroids[j])
                values.append(d)
                matrix[i][j] = d
                matrix[j][i] = d

        if threshold:
            adjacency = np.zeros(shape=(n,n), dtype=np.int8)
            values = sorted(values)

            if threshold == "median":
                threshold = np.median(values)
            elif threshold == "mean":
                threshold = np.mean(values)
            else: # error
                log.warning("THRESHOLD should be either 'mean' or 'median'.")
                return matrix

            for i,_ in enumerate(centroids):
                for j,_ in enumerate(centroids):
                    if matrix[i][j] <= threshold:
                        adjacency[i][j] = 1
                    else: adjacency[i][j] = 0

            return adjacency.tolist()

        return matrix

    def generate_influence_matrix(self, M, cm=[], normalized=True):
        """
        Returns matrix based on state transitions from each node to another:
        * influence equals the amount of hits a state has in t+1;
        * optionally considers only linked states in connectivity matrix (cm);
        * if normalized and NxN shaped, the returned matrix will be stochastic.
        """
        matrix = []

        for group in range(len(M)):
            matrix.append([])
            for target in range(len(M)):
                hits = 0
                if cm == [] or cm[group][target] == 1:
                    for i in range(1, len(M[group])):
                        if M[group][i] == M[target][i-1]:
                            hits += 1
                matrix[group].append(hits)

        if normalized:
            for i, array in enumerate(matrix):
                s = sum([(0 if x==None else x) for x in array])
                a = [round((0 if x in (0,None) else x/s), 2) for x in array]
                # sum of each row must amount 1.0 (unity)
                a[a.index(max(a))] = round(max(a) + (1-sum(a)), 100)
                matrix[i] = a

        matrix = np.array(matrix)
        matrix = np.around(matrix, 2)

        return matrix.tolist()

    def generate_presence_matrix(self, cluster_metrics, threshold="median"):
        """
        Returns matrix for each state based on presence value (median/mean):
        * if higher than threshold, state presence will be positive (1);
        * if lower or equal, state presence will be negative (0).
        """
        matrix = []
        values = []

        for i,m in enumerate(cluster_metrics):
            matrix.append([])
            keys = sorted(list(m.keys())[0:-1])

            for k in keys:
                metrics = json.loads(m[k])
                count = metrics["count"]
                matrix[i].append(count)
                values.append(count)

        if threshold:
            values = sorted(values)

            if threshold == "median":
                threshold = np.median(values)
            elif threshold == "mean":
                threshold = np.mean(values)
            else: # error
                log.warning("THRESHOLD should be either 'mean' or 'median'.")
                return matrix

            for i,_ in enumerate(matrix):
                for j,_ in enumerate(matrix[i]):
                    if matrix[i][j] > threshold:
                        matrix[i][j] = 1
                    else: matrix[i][j] = 0

        return matrix
    
    def generate_presence_matrix_from_themes(self,  themes, by="median", output_folder="artifacts"):
        """ Convert an artifact to a presence matrix composed of binary data. """
        os.makedirs(output_folder) if not os.path.isdir(output_folder) else None

        matrix = {}
        threshold = None

        dates = sorted(set(x for x in themes.keys() for x in themes[x]))

        for label, dct in themes.items():
            matrix[label] = [len(dct[date]) if dct.get(date) else 0 for date in dates]

            if by == "median":
                threshold = np.median(matrix[label])
            elif by == "mean":
                threshold = np.mean(matrix[label])
            else:
                log.warning(f"Unrecognized threshold method (by='{by}').")

            if threshold:
                matrix[label] = [1 if v >= threshold else 0 for v in matrix[label]]

        with open(f"{output_folder}/matrix.json", "w") as j:
            json.dump(matrix, j)

        return matrix

    def generate_tpm(self, matrix, influence_matrix):
        """
        Builds Transition Probability Matrix for PyPhi.

        Input parameters:
            * matrix: input matching influence_matrix
            * influence_matrix: must be stochastic
        """
        tpm = []
        times = len(matrix[0])
        number_of_groups = len(matrix)
        #divisor = number_of_groups*(times-1)

        for i in range(pow(2, number_of_groups)):
            tpm.append([])
            bin_array = self.create_bin_array(i, number_of_groups)

            for group in range(len(matrix)):
                value = self.get_probability(group, bin_array, influence_matrix, number_of_groups, times-1)
                tpm[i].append(value)

        return tpm

    def generate_users_matrix(self, clusters, obj_array, threshold="median"):
        """
        Returns adjacency matrix based on users in clusters (median/mean):
        * if higher than or equal to threshold, nodes will be connected (1);
        * if lower, nodes will not be connected (0).
        """
        n = len(clusters)
        matrix = np.zeros(shape=(n,n))
        users = {}

        for c in clusters.keys():
            for obj in clusters[c]:
                screen_name = obj["screen_name"]
                try: users[screen_name].append(c)
                except: users[screen_name] = [c]

        users = [set(u) for u in users.values()]

        for u in users:
            for c in u:
                matrix[c][c] += 1
            for i,j in combinations(u, 2):
                matrix[i][j] += 1
                matrix[j][i] += 1

        if threshold:
            adjacency = np.zeros(shape=(n,n), dtype=np.int8)
            values = []

            for i,x in enumerate(matrix):
                adjacency[i][i] = matrix[i][i]
                for j,v in enumerate(x):
                    values.append(v)

            values = sorted(values)

            if threshold == "median":
                threshold = np.median(values)
            elif threshold == "mean":
                threshold = np.mean(values)
            else: # error
                log.warning("THRESHOLD should be either 'mean' or 'median'.")
                return matrix

            for i,_ in enumerate(clusters):
                for j,_ in enumerate(clusters):
                    if i == j or matrix[i][j] >= threshold:
                        adjacency[i][j] = 1
                    else: adjacency[i][j] = 0

            return adjacency.tolist()

        return matrix

    def get_CES_JSON_object(self, ces):
        '''
        Returns cause-effect structure as a JSON object.
        '''
        cesJsonObject = {
            "smallphi": ces.phi,
            "mechanism": list(ces.mechanism),
            "cause":{
                "mip":ces.cause.mip.__str__(),
                "smallphi":ces.cause.phi,
                "purview":list(ces.cause.purview)},
            "effect":{
                "mip":ces.effect.mip.__str__(),
                "smallphi":ces.effect.phi,
                "purview":list(ces.cause.purview)}}
        # cause repertoire array
        n = len(list(ces.cause.purview))
        cesJsonObject['cause']['repertoire'] = self.get_repertoire_array(ces.cause,n)
        # effect repertoire array
        n = len(list(ces.effect.purview))
        cesJsonObject['effect']['repertoire'] = self.get_repertoire_array(ces.effect,n)
        # returns cause effect-structure
        return cesJsonObject

    def get_connected_states(self, A, states=[]):
        """
        Returns connected states or subgraphs
        based on the input adjacency matrix *A*.
        """
        dim = len(A)
        subgraphs = []
        free_nodes = []

        for i in range(dim):
            free_nodes.append(i)

        while len(free_nodes) > 0:
            i = 0
            subgraph = []
            pivot = free_nodes.pop(0)
            subgraph.append(pivot)

            while i < len(subgraph):
                j = 0
                source = subgraph[i]

                while j < len(free_nodes):
                    target = free_nodes[j]
                    hit = A[source][target]

                    if hit == 1:
                        subgraph.append(free_nodes.pop(j))
                    else: j+=1

                i+=1

            subgraphs.append(subgraph)

        if states:
            return states if not subgraphs else [[states[x] for x in s] for s in subgraphs]

        return subgraphs

    def get_repertoire_array(self, ces, n):
        '''
        Return repertoire list.

        Input parameters:
            * ces: cause-effect structure
            * n: number of digits
        '''
        repertoire = []
        array = list(ces.repertoire.reshape(1,pow(2,n))[0])
        # append key and value to list
        for i in range(len(array)):
            repertoire.append({
                "key": self.create_bin_array(i,n).__str__().replace("[","").replace(",","").replace("]","").replace(" ",""),
                "value": array[i]})
        # return repertoire list
        return repertoire

    def get_phi(self, obj):
        """
        Sort all complexes in *obj* by their phi values.
        """
        return obj.phi
    
    def get_probability(self, group, bin_array, influence_matrix, number_of_groups, max_influence):
        """
        Returns probability of transition for
        each group based on influence matrix.

        Input parameters:
            * group: in influence matrix
            * bin_array: binary array
            * influence_matrix: must be stochastic
            * number_of_groups: to consider for P
            * max_influence: to consider for P
        """
        total = 0
        none_count = 0

        influence_list = influence_matrix[group]

        for i in range(len(influence_list)):
            if influence_list[i] == None:
                none_count += 1
            else:
                if bin_array[i] == 1:
                    total += influence_list[i]
                else:
                    total += (max_influence - influence_list[i])

        try: return total/((number_of_groups-none_count)*(max_influence if max_influence>0 else 1))
        except: return 0

    def load_json_matrix(self, input_json):
        """
        Loads *input_json* file containing data for PyPhi.
        Accepted formats from this file are listed below:

        * matrix and connectivity matrix from dictionary:
            {"matrix": <dictionary of keys or array of arrays>,
            "A": <array of arrays>}

        * matrix as a dictionary of keys:
            {"state1": [0,1,0,0,1,0],
            "state2": [1,0,1,1,0,0],
            "state3": [0,1,0,1,0,1]}

        * matrix as an array of arrays:
            [[0,1,0,0,1,0],
            [1,0,1,1,0,0],
            [0,1,0,1,0,1]]

        * connectivity matrix as an array of arrays:
            [[1,1,1],
            [1,1,1],
            [1,1,1]]
        """
        with open(input_json, "r") as j:
            data = json.loads(j.read())

        if "matrix" in data:
            matrix = data["matrix"]
            A = data["A"] if "A" in data else (data["cm"] if "cm" in data else [])
        else:
            matrix = self.generate_presence_matrix(data)
            A = []

        return matrix, A

    def mc_render(self, matrix, states=[], states_to_remove=[], renderer="graphviz", ext="png", prog="dot", output_folder="."):
        """
        Builds a state-transition diagram or markov chain.

        Output files written to *output_folder*, depending on *renderer*:
            * "graphviz": "markov.dot" and "markov.<ext>"
            * "pygraphviz": "markov.<ext>"
            * "pydot": "markov.dot" and "markov.<ext>"
            * "none": no files are written

        Input parameters:
            * matrix: stochastic matrix containing transition values
                as a dictionary of keys or an array of arrays
            * states: list of states (strings) to use as labels;
                defaults to the keys of a dictionary *matrix* or
                else to the row indices as strings
            * states_to_remove: states dropped from *matrix* (through
                mc_filter, with normalization) before building the graph
            * renderer: choose "graphviz"; "pygraphviz"; "pydot"; or
                "none" to only build the graph without rendering it
            * ext: image file type format to write as (default: PNG)
            * prog: others may be available by graphviz e.g. "neato"
            * output_folder: output folder name to write files to

        Returns the graphviz source loaded from the written dot file
        ("graphviz"), the output of the draw call ("pygraphviz"), the
        output of plt.plot() ("pydot") or the graph object ("none").

        Raises ValueError if *renderer* is not one of the choices above.
        """
        # fail fast, before any filtering work is done
        if renderer not in ("graphviz", "pygraphviz", "pydot", "none"):
            raise ValueError(f"Unrecognized renderer: '{renderer}'.\n"+
                             f"Available choices: 'graphviz', 'pygraphviz', 'pydot', 'none'.")

        edge_labels = {}

        output_dot = f"{output_folder}/markov.dot"
        output_ext = f"{output_folder}/markov.{ext}"

        # filter matrix states by user list
        if states_to_remove:
            matrix = self.mc_filter(matrix, states_to_remove, True)

        # matrix as a dictionary of keys
        if isinstance(matrix, dict):
            if not states:
                states = list(matrix.keys())
            matrix = list(matrix.values())

        # prepare graph object
        if renderer == "graphviz":
            G = gv.Digraph(format=ext)
            G.attr("node", shape="circle")
            G.attr(rankdir="LR", size="8")
            # alias the graphviz methods to the names used by the other backends
            G.add_node = G.node
            G.add_edge = G.edge

        elif renderer == "pygraphviz":
            # NOTE(review): relies on `pgv` (pygraphviz) being imported at file level -- confirm
            G = pgv.AGraph(format=ext, strict=False, directed=True)

        else:
            # "pydot" draws this graph below; "none" returns it as is
            # (previously "none" left G unbound and crashed on the first node)
            G = nx.MultiDiGraph()

        # state labels (materialized, as views such as dict.keys() are accepted)
        states = list(states) if states else [str(i) for i in range(len(matrix))]

        # add state nodes
        for s in states:
            G.add_node(s)

        # build graph network: one edge per strictly positive transition rate
        for i, origin_state in enumerate(states):
            for j, destination_state in enumerate(states):
                rate = matrix[i][j]
                if rate is not None and rate > 0:
                    str_rate = f"{rate:.2f}"
                    attrs = {"label": str_rate}
                    # graphviz edges carry the label only
                    if renderer != "graphviz":
                        attrs["weight"] = rate
                    G.add_edge(origin_state, destination_state, **attrs)
                    edge_labels[(origin_state, destination_state)] = str_rate

        # nothing to render or write
        if renderer == "none":
            return G

        # create output path directory (including missing parents)
        os.makedirs(output_folder, exist_ok=True)

        if renderer == "graphviz":
            G.render(f"{output_folder}/markov")
            # render() saves the source as "markov"; keep it as "markov.dot"
            move(f"{output_folder}/markov", output_dot)
            return gv.Source.from_file(output_dot)

        if renderer == "pygraphviz":
            G.layout(prog=prog)
            G.draw(output_ext, format=ext, prog=prog)
            return G.draw(format=ext, prog=prog)

        # renderer == "pydot"
        plt.axis("off")
        plt.tight_layout()
        pos = nx.nx_pydot.pydot_layout(G, prog=prog)
        nx.drawing.nx_pydot.write_dot(G, output_dot)
        nx.draw_networkx_nodes(G, pos, node_shape="s", node_color="w", node_size=1000)
        nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
        nx.draw_networkx_labels(G, pos, font_weight=2)
        nx.draw_networkx_edge_labels(G, pos, edge_labels)
        plt.savefig(output_ext, format=ext)
        return plt.plot()

    def mc_filter(self, matrix, states_to_remove=[], normalized=False):
        """
        Returns *matrix* without the states listed in *states_to_remove*.

        Input parameters:
            * matrix: a dictionary of keys or an array of arrays
            * states_to_remove: states given as a list or as a comma
                separated string, either by index or (for a dictionary
                *matrix*) by key; when empty, *matrix* is returned as is
            * normalized: pass the filtered values to normalize_matrix

        The output has the same form as the input: a dictionary
        *matrix* returns a dictionary with the remaining keys.
        The filtering itself is delegated to filter_matrix.

        Raises TypeError if *states_to_remove* is neither a list nor a
        string, or if states are given by key and *matrix* is not a
        dictionary; and ValueError if a key is not found in *matrix*.
        """
        if not states_to_remove:
            return matrix

        # get list of states to remove
        if isinstance(states_to_remove, str):
            states_to_remove = states_to_remove.split(",")

        # check list of states to remove
        if not isinstance(states_to_remove, list):
            raise TypeError("expected a list of states (indices or dictionary keys) as input")

        # states are either all indices or all dictionary keys; this applies
        # to lists as well, not only to comma separated strings
        try:
            states_to_remove = [int(x) for x in states_to_remove]
        except (TypeError, ValueError):
            if not isinstance(matrix, dict):
                raise TypeError("expected a dictionary to match list of states as keys") from None
            names = list(matrix)
            states_to_remove = [names.index(x) for x in states_to_remove]

        # get matrix values
        array = list(matrix.values()) if isinstance(matrix, dict) else matrix

        # filter matrix values
        output = self.filter_matrix(array, states_to_remove)
        if normalized:
            output = self.normalize_matrix(output)

        if isinstance(matrix, dict):
            removed = set(states_to_remove)
            keys = [key for i, key in enumerate(matrix) if i not in removed]
            return dict(zip(keys, output))

        return output

    @staticmethod
    def create_bin_array(number, digits):
        """
        Returns the binary digits of *number* as a list of integers,
        left-padded with zeros up to *digits* entries, as in:
            create_bin_array(10,5) -> [0, 1, 0, 1, 0]

        No digits are dropped if *number* needs more than *digits* bits.
        """
        bits = format(number, "b")
        padding = [0] * (digits - len(bits))
        return padding + [int(bit) for bit in bits]

    @staticmethod
    def get_graph_nodes(states, subsystem, state):
        """
        Returns one node per entry in *states*, as a list of
        dictionaries with the keys "id" and "status".

        Input parameters:
            * states: as a list of strings
            * subsystem: from complex
            * state: from subsystem

        The status is "undefined" for states that are not in
        *subsystem*; otherwise it is "on" if the value in *state*
        at the same position equals 1, and "off" if not.
        """
        def status_of(position, name):
            if name not in subsystem:
                return "undefined"
            return "on" if state[position] == 1 else "off"

        return [
            {"id": name, "status": status_of(position, name)}
            for position, name in enumerate(states)
        ]

    @staticmethod
    def get_graph_edges(states, A):
        """
        Returns the edges between connected nodes, as a list of
        dictionaries with the keys "source" and "target".

        Input parameters:
            * states: as a list of strings
            * A: adjacency matrix

        Only the entries above the diagonal of *A* are read, so each
        pair of states yields at most one edge and self-connections
        are never listed.
        """
        size = len(A)
        return [
            {"source": states[row], "target": states[col]}
            for row in range(size)
            for col in range(row + 1, size)
            if A[row][col] == 1
        ]

    @staticmethod
    def normalize_matrix(matrix):
        """
        Rescales each row of *matrix* so that it sums to 1.0 (unity),
        with values rounded to two decimals, and returns *matrix*.

        Rows are replaced in place by numpy arrays. Entries set as
        None count as zero. Raises ValueError if any row does not
        amount to unity afterwards.
        """
        for position, row in enumerate(matrix):
            row_total = sum(0 if value is None else value for value in row)

            scaled = []
            for value in row:
                if value in (0, None):
                    scaled.append(0)
                else:
                    scaled.append(round(value / row_total, 2))

            # hand the rounding remainder to the largest entry
            peak = max(scaled)
            scaled[scaled.index(peak)] = round(peak + (1 - sum(scaled)), 100)

            matrix[position] = np.around(scaled, 2)

        for row in matrix:
            if np.around(sum(row)) != 1:
                raise ValueError("failed to normalize matrix after filtering (row does not amount unity).")

        return matrix
    
    @staticmethod
    def semantic_similarity_number(o1, o2):
        """
        Returns semantic similarity value between two objects.

        Input parameters:
            * o1, o2: dictionaries with the keys "word_dict" (a
                dictionary of values per word) and "total"

        For every word in either object, the smaller of the two
        values is added up (a missing word counts as 0); the sum is
        then divided by the larger "total". Returns 0 if the larger
        "total" is not positive.
        """
        words1 = o1["word_dict"]
        words2 = o2["word_dict"]

        # dict.get replaces the bare try/except lookups, which hid any error
        shared = sum(
            min(words1.get(word, 0), words2.get(word, 0))
            for word in set(list(words1) + list(words2))
        )

        div = max(o1["total"], o2["total"])

        return (shared / div) if div > 0 else 0

## Analyze

### Load dataset

Expects a dataset containing the columns `author_username`, `id`, `language`, `like_count`, `repost_count`, `text` and `timestamp`.



In [None]:
# Not referenced anywhere else in this cell.
# NOTE(review): presumably meant to cap the number of rows loaded -- confirm.
MAX_ROWS = None

# Maps <column name in the input file> to <column name used from here on>.
# Fill in the keys before running: as written, every key is the same empty
# string, so the dictionary collapses to the single entry {"": "timestamp"}.
columns = {
    "": "author_username",
    "": "id",
    "": "language", 
    "": "like_count",
    "": "repost_count",
    "": "text",
    "": "timestamp"
}
# Only the mapped columns are read; further files may be added to the list.
df = pd.concat([
    pd.read_csv("filename.csv", low_memory=False, usecols=list(columns.keys())),
])
# Rename the mapped columns (names without a mapping are kept as they are).
df.columns = [columns.get(x, x) for x in df.columns]
# Index rows by the "id" column, which is also kept as a regular column.
df.index = df["id"]
print(f"Loaded {df.shape} objects.")

### Detect clusters i.e. themes

In [None]:
!rm -fr artifacts
# The generator is given the bound `fit_predict` method of a configured
# Clusterer as its clustering function, along with the custom stop words.
artifacts = ArtifactGenerator(
    func_cluster=Clusterer(
        analyzer="char",
        max_paragraphs=10,
        random_state=0,
        use_pandas=False,
        use_svd="try",
    ).fit_predict,
    stop_words=CUSTOM_STOPWORDS,
# Explicit `.__call__(...)` is the same as calling the instance directly.
).__call__(
    df,
    datetime_unit="s",
    include_keywords=True,
    n_grams=2,
)

#### Plot artifacts

In [None]:
plot_artifacts(artifacts)

#### Save artifacts

In [None]:
write_artifacts(df, artifacts)

### Execute PyPhi

In [None]:
pphi = PyPhiNetwork()

#### Build matrix from themes

Set presence matrix (`P`) for a list of states over time, containing binary elements (`0` for not active; `1` for active).

In [None]:
P = pphi.generate_presence_matrix_from_themes(artifacts["themes-ids"]); P

###### Filter connected states (optional)

Optionally restrict the matrix to a subset of connected states, leaving out any states listed by the user in `states_to_remove` (comma separated).

In [None]:
states_to_remove = "" # <-- comma separated

# Rebinds P and sets A from the returned pair; an empty list is passed as A.
# NOTE(review): presumably A is then derived from P by the method -- confirm.
P, A = pphi.filter_connected_states(P, A=[], states_to_remove=states_to_remove)

###### Check connected states (optional)

Returns a list of connected states based on presence (`P`) and adjacency (`A`) matrices.

In [None]:
pphi.get_connected_states(A, states=list(P.keys()))

#### Render transition diagram

Render a transition diagram from a Markov chain. Available renderers: `graphviz`, `pygraphviz` and `pydot`.

In [None]:
states_labels = "" # <-- comma separated

# The diagram is built from the influence matrix generated from the values
# of P and from A; node labels default to the keys of P when no custom
# labels are set above. Output files are written to the "artifacts" folder.
pphi.mc_render(
    pphi.generate_influence_matrix(list(P.values()), A),
    states=states_labels.split(",") if states_labels else P.keys(),
    output_folder="artifacts"
)

#### Cause-effect structure

Integrated information theory provides a mathematical framework to fully characterize the cause-effect structure of a physical system. [PyPhi](http://integratedinformationtheory.org) implements a framework for causal analysis and unfolds the full cause-effect structure of discrete dynamical systems of binary elements.

##### Compute network

Returns all complexes in the network, which provides the context for all φ and Φ computations. Here we’ll use the 2-dimensional state-by-node form for the TPM.

In [None]:
bc, ac = pphi(P, A)

##### Mechanism details

See details (cause and effect) of one selected mechanism, based on the list above.

In [None]:
m = None # <-- mechanism number from above (optional)

# `bc` is the first of the two values returned by `pphi(P, A)` above.
pphi.complex_mechanism(bc, m)

##### Best complex data

Display the best complex, i.e. the one with the highest Φ value after system irreducibility analysis, together with its cause-effect structure.

In [None]:
bc

##### ALL complexes data (!)

Display all complexes identified by PyPhi. **Warning:** long text buffer.

In [None]:
# ac

## Compress output →  `output.zip`

In [None]:
!zip -9r output.zip artifacts

_____

## References

* PyPhi: [website](https://pypi.org/project/pyphi/) | [arxiv](https://arxiv.org/abs/1712.09644) | [documentation](https://pyphi.readthedocs.io/en/latest/) | [GitHub](https://github.com/wmayner/pyphi)