In [3]:
import pandas as pd
import yaml
from mosaic_widget import MosaicWidget

# Seattle weather sample loaded over HTTP; the "date" column is parsed into
# datetimes at read time.
WEATHER_CSV_URL = "https://uwdata.github.io/mosaic-datasets/data/seattle-weather.csv"
weather = pd.read_csv(WEATHER_CSV_URL, parse_dates=["date"])

# Declarative Mosaic spec for the weather demo, assembled from named pieces.

# Shared params: "click" is declared as a single-value selection; "domain" and
# "colors" are referenced by the plots below as $domain / $colors.
weather_params = {
    "click": {"select": "single"},
    "domain": ["sun", "fog", "drizzle", "rain", "snow"],
    "colors": ["#e7ba52", "#a7a7a7", "#aec7e8", "#1f77b4", "#9467bd"],
}

# Upper view: dots from the weather table (filtered by $click), an x-interval
# selection published as $range, a highlight driven by $range, and a color
# legend that writes to $click.
temperature_view = {
    "plot": [
        {
            "mark": "dot",
            "data": {"from": "weather", "filterBy": "$click"},
            "x": {"dateMonthDay": "date"},
            "y": "temp_max",
            "fill": "weather",
            "r": "precipitation",
            "opacity": 0.7,
        },
        {"select": "intervalX", "as": "$range"},
        {"select": "highlight", "by": "$range", "fill": "#eee"},
        {"legend": "color", "as": "$click", "columns": 1},
    ],
    "xyDomain": "Fixed",
    "xTickFormat": "%b",
    "colorDomain": "$domain",
    "colorRange": "$colors",
    "rDomain": "Fixed",
    "rRange": [2, 10],
    "width": 800,
}

# Lower view: a pale unfiltered count bar per weather value as a backdrop, the
# same bars filtered by $range drawn on top, and a toggleY selection that
# writes to $click.
backdrop_bars = {
    "mark": "barX",
    "data": {"from": "weather"},
    "x": {"count": None},
    "y": "weather",
    "fill": "#f5f5f5",
}
filtered_bars = {
    "mark": "barX",
    "data": {"from": "weather", "filterBy": "$range"},
    "x": {"count": None},
    "y": "weather",
    "fill": "weather",
    "order": "weather",
}
weather_count_view = {
    "plot": [
        backdrop_bars,
        filtered_bars,
        {"select": "toggleY", "as": "$click"},
        {"select": "highlight", "by": "$click"},
    ],
    "xDomain": "Fixed",
    "yDomain": "$domain",
    "yLabel": None,
    "colorDomain": "$domain",
    "colorRange": "$colors",
    "width": 800,
}

spec = {
    "params": weather_params,
    "vconcat": [
        {"hconcat": [temperature_view]},
        weather_count_view,
    ],
}


# Render the weather spec; the widget resolves the table name "weather" used
# in the spec through this mapping.
weather_tables = {"weather": weather}
MosaicWidget(spec, data=weather_tables)

MosaicWidget(spec={'params': {'click': {'select': 'single'}, 'domain': ['sun', 'fog', 'drizzle', 'rain', 'snow…

# Word, sentence, and document metrics


In [4]:
import pandas as pd

In [7]:
# Read the vast2021.csv export and keep a handle on its free-text "message"
# column, which the tokenization cells work from.
VAST_CSV_PATH = "../datasets/raw/vast2021.csv"
df_vast = pd.read_csv(VAST_CSV_PATH)
message_col = df_vast.loc[:, "message"]

In [15]:
# Spot-check a single raw message, selected by position.
sample_position = 150
message_col.iloc[sample_position]

'Dr. Audrey McConnell Newman, internationally recognized environmental activist, traveled to Abila to speak for the POK rally'

In [12]:
import nltk

In [27]:
def create_col(func, name, messages=None):
    """Split every message with ``func`` and return one row per resulting piece.

    Parameters
    ----------
    func : callable
        Applied to each message; expected to return a list-like of pieces
        (e.g. ``list`` for characters, or a tokenizer for words/sentences).
    name : str
        Column name given to the pieces in the returned table.
    messages : pandas.Series, optional
        Text to split. Defaults to the notebook-level ``message_col`` so that
        existing two-argument calls behave exactly as before; passing a Series
        explicitly removes the dependence on hidden notebook state.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"id"`` (the index label of the source message) and
        ``name`` (one piece per row). A message for which ``func`` returns an
        empty list still contributes one row whose piece is NaN, which is how
        ``Series.explode`` treats empty lists.
    """
    if messages is None:
        # Backward-compatible fallback to the global text column.
        messages = message_col

    pieces = messages.apply(func)
    # explode() repeats the source index label for every piece; reset_index()
    # then turns that label into a regular column.
    table = pieces.explode().reset_index()
    table.columns = ["id", name]

    return table


# One long-format table per granularity: characters, words, sentences.
# NOTE(review): the NLTK tokenizers presumably need their tokenizer data
# installed in the environment - confirm on a fresh kernel.
tokenizers = nltk.tokenize
character_table = create_col(func=list, name="character")
word_table = create_col(func=tokenizers.word_tokenize, name="word")
sentence_table = create_col(func=tokenizers.sent_tokenize, name="sentence")

In [45]:
# Write each token table to its own Parquet file in the working directory.
for export_table, export_name in (
    (character_table, "character.parquet"),
    (word_table, "word.parquet"),
    (sentence_table, "sentence.parquet"),
):
    export_table.to_parquet(export_name)

In [34]:
# Two-column frame of the raw messages: the index label becomes "id" and the
# text itself is the "message" column.
raw_df = message_col.rename_axis("id").rename("message").reset_index()

In [47]:
# Columns carried over from the raw export into the main table.
MAIN_TABLE_COLUMNS = ["message", "type", "author", "date"]
save_df = df_vast[MAIN_TABLE_COLUMNS]

save_df

Unnamed: 0,message,type,author,date
0,Follow us @POK-Kronos,mbdata,POK,2014-01-23 17:00:00
1,Don't miss a moment! Follow our live coverage...,mbdata,maha_Homeland,2014-01-23 17:00:00
2,Come join us in the Park! Music tonight at Abi...,mbdata,Viktor-E,2014-01-23 17:00:00
3,POK rally to start in Abila City Park. POK lea...,mbdata,KronosStar,2014-01-23 17:00:00
4,POK rally set to take place in Abila City Park...,mbdata,AbilaPost,2014-01-23 17:00:00
...,...,...,...,...
4058,RT @AbilaPost unknown explosion heard from the...,mbdata,plasticParts,2014-01-23 21:33:10
4059,RT @CentralBulletin explosion heard at dancing...,mbdata,klingon4real,2014-01-23 21:33:45
4060,RT @KronosStar There has been an explosion fro...,mbdata,lindyT,2014-01-23 21:34:00
4061,RT @redisrad What was that? #boom,mbdata,dolls4sale,2014-01-23 21:34:00


In [49]:
# Display the sentence-level table built by create_col: one row per sentence,
# with "id" holding the index label of the message it came from.
sentence_table

Unnamed: 0,id,sentence
0,0,Follow us @POK-Kronos
1,1,Don't miss a moment!
2,1,Follow our live coverage of the POK Rally in t...
3,2,Come join us in the Park!
4,2,Music tonight at Abila City Park!
...,...,...
6788,4060,Several people are down.
6789,4060,#KronosStar #DancingDolphinFire #AFDHeroes
6790,4061,RT @redisrad What was that?
6791,4061,#boom


In [50]:
# Expose the row index as an explicit "id" column (the same key the token
# tables use). The guard keeps this cell idempotent: without it, a re-run
# would call reset_index() on the already-converted frame and stack a second
# "id" column onto save_df.
if "id" not in save_df.columns:
    # rename_axis names the index first, so the new column is called "id"
    # regardless of what the index was named before.
    save_df = save_df.rename_axis("id").reset_index()

In [55]:
# Convert the "date" column to datetime dtype before the table is written out.
parsed_dates = pd.to_datetime(save_df["date"])
save_df["date"] = parsed_dates

In [57]:
# Persist the main message table to a Parquet file in the working directory.
save_df.to_parquet("mainTable.parquet")

In [43]:
from mosaic_widget import MosaicWidget


# Table name -> DataFrame mapping handed to the widget; specs refer to tables
# by these names. character_table is not registered here.
data_map = dict(
    wordTable=word_table,
    sentenceTable=sentence_table,
    mainTable=raw_df,
)

# Horizontal bars counting rows per word in "wordTable". The sort option asks
# for the y domain ordered by descending x and limited to 20 entries.
word_count_bars = {
    "mark": "barX",
    "data": {"from": "wordTable"},
    "x": {"count": None},
    "y": "word",
    "order": "word",
    "sort": {"y": "-x", "limit": 20},
    "marginLeft": 50,
}

# Single-plot layout; no params or selections are declared for this view.
spec = {"vconcat": [{"plot": [word_count_bars]}]}


# Render the word-count spec against the registered token tables.
word_count_widget = MosaicWidget(spec, data=data_map)
word_count_widget

MosaicWidget(spec={'vconcat': [{'plot': [{'mark': 'barX', 'data': {'from': 'wordTable'}, 'x': {'count': None},…