In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width = 5.5
fig_height = 3.5
fig_format = 'pdf'
fig_dpi = 300

# matplotlib defaults / format
# (best-effort: any failure here, including matplotlib being absent, is ignored)
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  try:
    # IPython.display.set_matplotlib_formats is deprecated since IPython 7.23;
    # the maintained implementation lives in matplotlib_inline.
    from matplotlib_inline.backend_inline import set_matplotlib_formats
  except ImportError:
    # older environments without matplotlib_inline
    from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
# Maps the on-disk file of every loaded module to its modification time and
# prints the mapping as a single JSON object.
kernel_deps = {}
for loaded in tuple(sys.modules.values()):  # iterate over a snapshot
  # sys.modules may hold objects that are not ordinary modules (some packages
  # replace their own entry), and getattr on those can misbehave; only real
  # modules are inspected.
  if isinstance(loaded, types.ModuleType):
    source_file = getattr(loaded, "__file__", None)
    if source_file:
      # drop the trailing "c"/"o" so compiled files map to their .py path
      if source_file.endswith((".pyc", ".pyo")):
        source_file = source_file[:-1]
      # modules whose file is not present on disk are left out
      if os.path.exists(source_file):
        kernel_deps[source_file] = os.stat(source_file).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition tests a non-empty string literal, so it is always true and the
# working directory is always changed to this absolute path.
# NOTE(review): the path is machine-specific and was presumably injected when
# the notebook was rendered; os.chdir raises if it does not exist — confirm
# before running in another environment.
if r'/home/cjber/drive/phd/papers/reddit-footprint':
  os.chdir(r'/home/cjber/drive/phd/papers/reddit-footprint')

# reset state
# IPython line magic (not plain Python) that clears the interactive namespace.
# NOTE(review): names defined above in this cell (imports, fig_* settings,
# kernel_deps) are presumably gone afterwards — later cells re-import what
# they need; confirm.
%reset

def ojs_define(**kwargs):
  """Publish Python values as an ``ojs-define`` script element.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry in
  a JSON document, which is displayed as
  ``<script type="ojs-define">...</script>`` with the display metadata
  ``ojs_define=True``.

  When pandas is importable, ``Series`` and ``DataFrame`` values (including
  subclasses) are converted to a plain dict mapping each column label to the
  list of that column's values. Every other value is handed to ``json.dumps``
  unchanged.

  Raises:
    TypeError: from ``json.dumps`` when a value is not JSON serialisable.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # only a failed import should trigger the legacy location
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    # isinstance (rather than an exact type comparison) so that Series /
    # DataFrame subclasses are converted too instead of failing in json.dumps
    if isinstance(v, pd.Series):
      v = pd.DataFrame(v)
    if isinstance(v, pd.DataFrame):
      # after transposing, "index" holds the column labels and "data" holds
      # one list of values per column
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  v = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(v) + '</script>'), metadata=dict(ojs_define = True))
# explicitly (re)bind the function in the module globals
globals()["ojs_define"] = ojs_define




In [2]:
import warnings

import polars as pl
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import Markdown, display

from src.common.utils import Paths, process_outs

# Install a blanket "ignore" filter so warnings are not shown in the output.
warnings.filterwarnings("ignore")

# Figure typography: small sans-serif text, rendered without LaTeX.
plt.rcParams["font.size"] = 6
plt.rcParams["text.usetex"] = False
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["DejaVu Sans"]

# process_outs() returns five objects, unpacked in this order; the cells
# below read them by these names.
(places, regions, lad, region_embeddings, lad_embeddings) = process_outs()

In [3]:
# Corpus-wide summary statistics, evaluated lazily over the raw comments file
# and collected into a single-row frame that the text below indexes with [0].
all_comments = (
    pl.scan_parquet(Paths.RAW / "comments_combined-2023_02_23.parquet")
    .select(
        [
            pl.col("score").count().alias("count"),
            pl.col("author").n_unique().alias("n_authors"),
            # Dates are formatted with the native temporal formatter rather
            # than a per-element Python lambda.
            pl.col("created_utc").min().dt.strftime("%Y-%m-%d").alias("first_utc"),
            pl.col("created_utc").max().dt.strftime("%Y-%m-%d").alias("last_utc"),
            # NOTE(review): total_words is not referenced in the text below —
            # confirm it is still needed, as it explodes every comment into words.
            pl.col("text").str.split(by=" ").explode().len().alias("total_words"),
        ]
    )
    .collect()
)

# Raw f-string: backslashes (\ref) stay literal, and doubled braces render as
# single braces in the emitted Markdown.
display(
    Markdown(
        rf"""
[Reddit](https://reddit.com) is a public discussion, news aggregation social network, and among the top 20 most visited websites in the United Kingdom. In 2020, Reddit had around 430 million active monthly users, comparable to the number of Twitter users [@murphy2019;@statista2022]. Reddit is divided into separate independent _subreddits_ each with specific topics of discussion, where _users_ may submit _posts_ which each have dedicated nested conversation threads that users can add _comments_ to. Subreddits cover a wide range of topics, and in the interest of geography, they also act as forums for the discussion of local places. The [United Kingdom subreddit](https://reddit.com/r/unitedkingdom) acts as a general hub for related topics, notably including a list of smaller and more specific related subreddits. This list provides a 'Places' section, a collection of local British subreddits, ranging in scale from country (`/r/England`), region (`/r/thenorth`, `/r/Teeside`), to cities (`/r/Manchester`) and small towns (`/r/Alnwick`). In total there are 213 subreddits that relate to 'places' within the United Kingdom^[https://www.reddit.com/r/unitedkingdom/wiki/british_subreddits]. We use the corpus that @berragan generated from a Reddit archive [@baumgartner2020]; it consists of a collection of all Reddit comments taken from each UK related subreddit, with place names identified by a custom transformer-based named entity recognition model^[https://huggingface.co/cjber/reddit-ner-place_names]. In total {all_comments['count'][0]:,} comments were extracted, submitted by {all_comments['n_authors'][0]:,} unique users, between {all_comments['first_utc'][0]} and {all_comments['last_utc'][0]}. Table \ref{{tbl-example}} gives an example entry from this geoparsed Reddit corpus.

In total our corpus consisted of {places[['word', 'easting', 'northing']].n_unique():,} unique locations, with a highly skewed distribution in mentions. Most locations were only mentioned a single time, while 'London' was mentioned in almost 300,000 comments. To reduce this skew, we sampled any location mentioned more than 5,000 times, retaining only up to 5,000 randomly sampled comments. The goal with this processing was to ensure that our generated embeddings did not simply become biased towards the word embedding for a single location, and instead capture a broader sense of an aggregate administrative region.
"""
    )
)


[Reddit](https://reddit.com) is a public discussion, news aggregation social network, and among the top 20 most visited websites in the United Kingdom. In 2020, Reddit had around 430 million active monthly users, comparable to the number of Twitter users [@murphy2019;@statista2022]. Reddit is divided into separate independent _subreddits_ each with specific topics of discussion, where _users_ may submit _posts_ which each have dedicated nested conversation threads that users can add _comments_ to. Subreddits cover a wide range of topics, and in the interest of geography, they also act as forums for the discussion of local places. The [United Kingdom subreddit](https://reddit.com/r/unitedkingdom) acts as a general hub for related topics, notably including a list of smaller and more specific related subreddits. This list provides a 'Places' section, a collection of local British subreddits, ranging in scale from country (`/r/England`), region (`/r/thenorth`, `/r/Teeside`), to cities (`/r/Manchester`) and small towns (`/r/Alnwick`). In total there are 213 subreddits that relate to 'places' within the United Kingdom^[https://www.reddit.com/r/unitedkingdom/wiki/british_subreddits]. We use the corpus that @berragan generated from a Reddit archive [@baumgartner2020]; it consists of a collection of all Reddit comments taken from each UK related subreddit, with place names identified by a custom transformer-based named entity recognition model^[https://huggingface.co/cjber/reddit-ner-place_names]. In total 8,282,331 comments were extracted, submitted by 490,535 unique users, between 2011-01-01 and 2022-04-17. Table \ref{tbl-example} gives an example entry from this geoparsed Reddit corpus.

In total our corpus consisted of 52,169 unique locations, with a highly skewed distribution in mentions. Most locations were only mentioned a single time, while 'London' was mentioned in almost 300,000 comments. To reduce this skew, we sampled any location mentioned more than 5,000 times, retaining only up to 5,000 randomly sampled comments. The goal with this processing was to ensure that our generated embeddings did not simply become biased towards the word embedding for a single location, and instead capture a broader sense of an aggregate administrative region.


In [4]:
# | output: 'asis'

variable = [
    "text",
    "",
    "word",
    "easting",
    "northing",
    "region",
    "lad",
    "author",
    "word\_count",
    "author\_count",
]
value = [
    "A Mexicana meal with extra wings ",
    "from Tex in Leytonstone.",
    "leytonstone",
    539268,
    187540,
    "London",
    "Waltham Forest",
    "t2\_eklyq",
    855,
    431,
]
desc = [
    "Comment",
    "",
    "Identified Place Name",
    "Place Name Easting",
    "Place Name Northing",
    "Administrative Region",
    "Local Authority District",
    "Anonymised Unique Author ID",
    "Total location mentions",
    "Unique authors mentioning this location",
]

print(
    pd.DataFrame({"Variable": variable, "Value": value, "Description": desc})
    .style
    .format(thousands=",")
    .hide_index()
    .to_latex(
        hrules=True,
        label="tbl-example",
        caption="Summary of comments relating to each region in our study",
        position="tb",
        position_float="centering",
        convert_css=True
    )
)

\begin{table}[tb]
\centering
\caption{Example entry from the geoparsed Reddit corpus}
\label{tbl-example}
\begin{tabular}{lll}
\toprule
Variable & Value & Description \\
\midrule
text & A Mexicana meal with extra wings  & Comment \\
 & from Tex in Leytonstone. &  \\
word & leytonstone & Identified Place Name \\
easting & 539,268 & Place Name Easting \\
northing & 187,540 & Place Name Northing \\
region & London & Administrative Region \\
lad & Waltham Forest & Local Authority District \\
author & t2\_eklyq & Anonymised Unique Author ID \\
word\_count & 855 & Total location mentions \\
author\_count & 431 & Unique authors mentioning this location \\
\bottomrule
\end{tabular}
\end{table}



In [5]:
# Number of rows in `places` per author, leaving out the "deleted" author.
places_count = places.filter(pl.col("author") != "deleted").groupby("author").count()
# 99th percentile of the per-author counts, truncated to an integer.
nnp = int(places_count.quantile(0.99)["count"][0])

# Authors strictly above that percentile, selected once and reused for both
# figures quoted in the text (previously the same filter ran twice inline).
top_authors = places_count.filter(pl.col("count") > nnp)
top_share = top_authors["count"].sum() / places_count["count"].sum()

display(
    Markdown(
        f"""
We first comment on data quality and bias in the corpus used in our paper, a common concern in large social media corpora. @berragan note that 1% of all users contribute 32% of all identified place names, which represents the top 2,079 users in the full corpus. In our subset, we similarly find that 1% of users ({len(top_authors):,}) mention {top_share:.0%} of our place names.
"""
    )
)


We first comment on data quality and bias in the corpus used in our paper, a common concern in large social media corpora. @berragan note that 1% of all users contribute 32% of all identified place names, which represents the top 2,079 users in the full corpus. In our subset, we similarly find that 1% of users (1,698) mention 29% of our place names.


In [6]:
# Number of distinct comment texts. Counting the unique values of the column
# gives the same figure as de-duplicating the frame on "text" and then
# counting its unique rows, without building the intermediate frame.
num_comments = places["text"].n_unique()
# Words per row, splitting on single spaces.
# NOTE(review): this runs over every row of `places`, so a comment that appears
# in several rows contributes once per row to the min/max/mean quoted below —
# confirm that is intended.
num_words = places["text"].str.split(" ").list.lengths()

display(
    Markdown(
        rf"""
Table \ref{{tbl-sum}} gives an overview of the number of comments, word count and number of places that were identified within each administrative region of the UK. Our study concerns a subset of the full Reddit comment dataset, subsetting each location to a maximum of 5,000 mentions, leaving a total of {num_comments:,} comments containing place names. Comments range from {num_words.min():,} to {num_words.max():,} words in length, with a mean length of {num_words.mean():,.0f}. In Table \ref{{tbl-sum}} the 'Embeddings SD' values give an approximate indication of the intra-community cohesion, lower values indicating stronger cohesion compared with the global average, meaning a higher proportion of shared semantic information.
"""
    )
)


Table \ref{tbl-sum} gives an overview of the number of comments, word count and number of places that were identified within each administrative region of the UK. Our study concerns a subset of the full Reddit comment dataset, subsetting each location to a maximum of 5,000 mentions, leaving a total of 830,770 comments containing place names. Comments range from 1 to 3,555 words in length, with a mean length of 79. In Table \ref{tbl-sum} the 'Embeddings SD' values give an approximate indication of the intra-community cohesion, lower values indicating stronger cohesion compared with the global average, meaning a higher proportion of shared semantic information.


In [7]:
# | output: asis

from paper.tables import desc_tbl

# Build the per-region summary from the place mentions and LAD embeddings,
# then print it as a centred floating LaTeX table.
# (A longtable environment is deliberately not requested.)
summary_table = desc_tbl(places, lad_embeddings)
summary_latex = summary_table.to_latex(
    position_float="centering",
    position="tb",
    caption="Summary of comments relating to each region in our study",
    label="tbl-sum",
    hrules=True,
)
print(summary_latex)

\begin{table}[tb]
\centering
\caption{Summary of comments relating to each region in our study}
\label{tbl-sum}
\begin{tabular}{lrrrrr}
\toprule
RGN21NM & Total Comments & Unique Words & Word Count & Total Places & Embeddings SD \\
\midrule
Scotland & 181,831 & 437,746 & 23,218,279 & 8,052 & 1.24 \\
South East & 107,134 & 308,491 & 11,849,441 & 5,679 & -0.89 \\
London & 206,280 & 422,036 & 23,868,430 & 5,164 & 2.14 \\
South West & 85,960 & 267,388 & 9,680,548 & 5,090 & 0.55 \\
North West & 88,789 & 259,154 & 10,650,018 & 4,893 & 0.09 \\
Yorkshire and The Humber & 68,703 & 214,932 & 7,913,376 & 4,669 & 0.74 \\
East of England & 53,374 & 202,474 & 5,718,764 & 3,614 & -1.12 \\
East Midlands & 37,521 & 145,982 & 4,353,188 & 3,078 & -0.84 \\
West Midlands & 39,390 & 168,555 & 4,824,586 & 3,029 & -0.84 \\
Wales & 32,660 & 137,766 & 4,156,235 & 2,647 & -0.59 \\
North East & 25,053 & 115,418 & 2,927,627 & 1,787 & -0.49 \\
\midrule \bfseries Total & \bfseries 830,770 & \bfseries 1,239,471 & \bf

In [8]:
#| label: fig-clusters
#| fig-cap: Average transformer vector associated with each location corpus coloured by K Means clusters where $K=5$. (A) PCA decomposed into 2 dimensions. (B) Geographic location of clusters.

# NOTE(review): process_embeddings is imported here but not used in this cell.
from paper.figures import plt_place_vectors, process_embeddings

# Draw the cluster figure described by the caption above from the LAD
# embeddings and the `regions` object, then render it.
plt_place_vectors(lad_embeddings, regions)
plt.show()

<Figure size 2400x1800 with 3 Axes>

In [9]:
#| label: fig-morans
#| fig-cap: 'Moran''s I Plot: LAD embeddings decomposed into 1 dimension and standardised against their spatial lag.'

from paper.figures import plt_morans, process_moran

# process_moran returns three objects; the first REBINDS `lad_embeddings`, so
# every later cell (e.g. the LISA figure) sees the processed object rather than
# the one produced by process_outs().
# NOTE(review): because the input name is overwritten, re-running this cell
# feeds already-processed data back into process_moran — confirm that is safe.
lad_embeddings, w, explained = process_moran(lad_embeddings)
# `w` and `explained` are only passed through to the plotting helper here.
plt_morans(lad_embeddings, w, explained)
plt.show()

  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)


  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)


  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)


  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)


  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
  rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)


<Figure size 1650x1050 with 1 Axes>

In [10]:
#| label: fig-lisa
#| fig-cap: 'Local Indicators of Spatial Auto-correlation (LISA). (A) Raw plotted 1-dim embeddings. (B) Local Moran''s I values ($Is$). (C) LISA HH and LL significant values ($p<0.05$), both are included as the value of embeddings do not convey information.'

from paper.figures import plt_lisa

# Draw the LISA figure described by the caption above. `lad_embeddings` is the
# object rebound by process_moran in the preceding cell.
# NOTE(review): the meaning of the [0, 1] argument is not visible from this
# notebook — confirm against plt_lisa.
plt_lisa(lad_embeddings, [0, 1])
plt.show()

<Figure size 3600x2400 with 10 Axes>

In [11]:
#| label: fig-similarity
#| fig-cap: Cosine similarity of embeddings for administrative regions across the UK. Higher values indicate greater cosine similarity. Regions shown in descending order by mean cosine similarity value.

from paper.figures import plt_similarity

# Draw the cosine-similarity figure described by the caption above from the
# region-level embeddings, then render it.
plt_similarity(region_embeddings)
plt.show()

<Figure size 2400x3000 with 13 Axes>

In [12]:
#| label: fig-identity
#| fig-cap: 'Zero Shot classification of each corpus into regional identities; [B]ritish, [E]nglish, [S]cottish, [W]elsh. Values show mean confidence value across each comment, lines indicate 95% confidence intervals. Descending order by [B]ritish confidence.'

from paper.figures import plt_zero_shot


# Draw the zero-shot classification figure described by the caption above;
# the helper is given the path of the processed parquet file, not a frame.
plt_zero_shot(Paths.PROCESSED / "places_zero_shot.parquet")
plt.show()

<Figure size 1650x1050 with 1 Axes>

In [13]:
# | output: asis

from paper.tables import tbl_interpret

# Build the interpretation table and print it as a centred floating LaTeX
# table. (A longtable environment is deliberately not requested.)
interpret_table = tbl_interpret()
interpret_latex = interpret_table.to_latex(
    position_float="centering",
    position="tb",
    caption="Model response to strong national preferences.",
    label="tbl-interpret",
    hrules=True,
)
print(interpret_latex)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


\begin{table}[tb]
\centering
\caption{Model response to strong national preferences.}
\label{tbl-interpret}
\begin{tabular}{llr}
\toprule
Sentence & Label & Confidence \\
\midrule
I hate Scotland. & British & 0.54 \\
I love Scotland. & Scottish & 0.97 \\
I hate Wales. & British & 0.58 \\
I love Wales. & Welsh & 0.99 \\
I hate England. & British & 0.48 \\
I love England. & English & 0.53 \\
I live in Manchester. & British & 0.48 \\
I live in Glasgow. & Scottish & 0.64 \\
\bottomrule
\end{tabular}
\end{table}

