In [3]:
from google.colab import files

# Variationist Quickstart

**Welcome! This quickstart guide walks you through the basics of how Variationist works and gives you a sample of what you can do with it.**

🕵️‍♀️ Variationist is a highly-modular, flexible, and customizable tool to analyze and explore language variation and bias in written language data. It allows researchers, from NLP practitioners to linguists and social scientists, to seamlessly investigate language use across a wide range of use cases.



## Installing Variationist

You can install Variationist either with `pip` or by cloning our GitHub repository. This guide uses `pip`.

In [1]:
# Install using pip

!pip install variationist

Collecting variationist
  Downloading variationist-0.1.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m51.2/57.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting altair==5.2.0 (from variationist)
  Downloading altair-5.2.0-py3-none-any.whl (996 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.9/996.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.17.1 (from variationist)
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji==2.10.1 (from variationist)
  Downloading emoji-2.10.1-py2.py3-

In [2]:
from variationist import Inspector, InspectorArgs, Visualizer, VisualizerArgs

Load your own dataset in `.tsv` or `.csv` format, or use a dataset that is available on Hugging Face.


In [5]:
# Open Colab's file picker and keep the returned mapping of uploaded files.
# Binding the result (instead of leaving the call as the cell's last
# expression) avoids echoing that mapping as cell output, and its keys tell
# you the names the files were actually saved under.
uploaded = files.upload()

Saving netflix.tsv to netflix (1).tsv


In [16]:
# File name of the dataset handed to the Inspector below.
# NOTE(review): the upload step reported saving the file as "netflix (1).tsv",
# so this name refers to a previously present "netflix.tsv" — confirm it is
# the file you intend to analyze.
my_dataset = "netflix.tsv"

In [20]:
# Analysis settings: inspect the "title" text against the "type" variable,
# computing the "pmi" and "stats" metrics on single tokens with English
# stopwords applied.
inspector_args = InspectorArgs(
    text_names=["title"],
    var_names=["type"],
    metrics=["pmi", "stats"],
    stopwords="en",
    n_tokens=1,
    n_cooc=1,
)

In [21]:
# Build an Inspector for the dataset with the settings above, run the
# analysis, and keep what it returns for the visualization step.
results = Inspector(
    dataset=my_dataset,
    args=inspector_args,
).inspect()

INFO: No values have been set for var_types. Defaults to nominal.
INFO: No values have been set for var_semantics. Defaults to general.
INFO: The metadata we will be using for the current analysis are:
{'text_names': ['title'], 'var_names': ['type'], 'metrics': ['pmi', 'stats'], 'var_types': ['nominal'], 'var_semantics': ['general'], 'var_subsets': None, 'var_bins': [0], 'tokenizer': 'whitespace', 'language': None, 'n_tokens': 1, 'n_cooc': 1, 'unique_cooc': False, 'cooc_window_size': 0, 'freq_cutoff': 3, 'stopwords': 'en', 'lowercase': False, 'ignore_null_var': False}
INFO: all column identifiers are treated as column names.
INFO: 'netflix.tsv' is loaded as a TSV file.
INFO: given the provided column names, we consider the first line as the header.
INFO: Tokenizing the title column...


100%|██████████| 2684/2684 [00:00<00:00, 21859.16it/s]


INFO: Currently calculating metric: 'pmi'


100%|██████████| 2/2 [00:00<00:00, 239.72it/s]

INFO: Currently calculating metric: 'stats'





In [27]:

# Chart settings: zoomable charts written to the "output" folder in both
# HTML and PNG form.
visualizer_args = VisualizerArgs(
    output_folder="output",
    zoomable=True,
    ngrams=None,
    output_formats=["html", "png"],
)

# Build the charts from the inspection results and keep the returned
# chart objects so they can be looked up by metric afterwards.
x = Visualizer(
    input_json=results,
    args=visualizer_args,
).visualize()

Reading json data...
INFO: Creating a BarChart object...
INFO: Saving it to the filepath: "output/pmi/BarChart.html".
INFO: Saving it to the filepath: "output/pmi/BarChart.png".
INFO: Creating a BarChart object...
INFO: Saving it to the filepath: "output/stats/BarChart.html".
INFO: Saving it to the filepath: "output/stats/BarChart.png".


In [31]:
# First chart object stored under the "pmi" key of the visualizer's return value.
x["pmi"][0]

<variationist.visualization.bar_chart.BarChart at 0x7f1076038760>