Analysis involving existence of wiktionary entries.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

from heatmap import make_heatmap, matricize_df
import viz_helpers

# Seaborn style defaults. Relevant changes for us are default font, and no ticks for heatmaps
sns.set_theme()

pd.set_option('display.max_rows', 100)

# Default (all reddit) counts
df = viz_helpers.load_df(wikt=True)

# NB: Special-casing this because of false positive on non-English term
df.loc[
    (df.pre == 'nazi') & (df.suff == 'ster'),
    'wikt'
] = True

mat = matricize_df(df)
df.head(5)

Unnamed: 0,pre,suff,count,wikt
0,cum,wad,372.0,True
1,cum,bag,1608.0,False
2,cum,bucket,4441.0,True
3,cum,sack,212.0,False
4,cum,ball,654.0,True


In [2]:
nonzero = df['count'] > 0
haswikt = df['wikt']

print(f"{len(df):,} total compounds, {nonzero.sum():,} having at least one occurrence on Reddit. {haswikt.sum()} have Wiktionary entries.")

4,818 total compounds, 3,589 having at least one occurrence on Reddit. 385 have Wiktionary entries.


In [3]:
# Most popular terms without wiki entries
missing = df.loc[~haswikt].sort_values(by='count', ascending=False).head(10)
missing

Unnamed: 0,pre,suff,count,wikt
2067,gay,bro,12267.0,False
619,shit,rag,7295.0,False
1023,ass,bag,6883.0,False
1696,tit,fucker,6827.0,False
1928,dumb,bitch,6430.0,False
1228,dick,nose,5890.0,False
3893,fem,nazi,5617.0,False
2094,gay,ass,5127.0,False
4115,soy,boi,4699.0,False
2071,gay,boi,4532.0,False


In [4]:
import os
def pprint(df, link=True):
    xf = df.copy()
    xf['term'] = df['pre'] + df['suff']
    if link:
        xf['term'] = '[' + xf['term'] + '](' + 'https://en.wiktionary.org/wiki/' + xf['term'] + ')'
    xf['count'] = xf['count'].astype(int)
    xf['count'] = xf['count'].apply(lambda n: '{:,}'.format(n))
    cols = ['term', 'count']
    return xf.loc[:, cols].to_markdown(index=False)

md = pprint(missing)
print(md)

| term                                                  | count   |
|:------------------------------------------------------|:--------|
| [gaybro](https://en.wiktionary.org/wiki/gaybro)       | 12,267  |
| [shitrag](https://en.wiktionary.org/wiki/shitrag)     | 7,295   |
| [assbag](https://en.wiktionary.org/wiki/assbag)       | 6,883   |
| [titfucker](https://en.wiktionary.org/wiki/titfucker) | 6,827   |
| [dumbbitch](https://en.wiktionary.org/wiki/dumbbitch) | 6,430   |
| [dicknose](https://en.wiktionary.org/wiki/dicknose)   | 5,890   |
| [femnazi](https://en.wiktionary.org/wiki/femnazi)     | 5,617   |
| [gayass](https://en.wiktionary.org/wiki/gayass)       | 5,127   |
| [soyboi](https://en.wiktionary.org/wiki/soyboi)       | 4,699   |
| [gayboi](https://en.wiktionary.org/wiki/gayboi)       | 4,532   |


In [5]:
MD_DIR = os.path.expanduser('~/src/colinmorris.github.com/_includes/compound_curses/')
path = os.path.join(MD_DIR, 'missing_terms_table.md')
with open(path, 'w') as f:
    f.write(md)

In [6]:
# *Least* popular terms *with* wiki entries
rares = df.loc[haswikt].sort_values(by='count', ascending=True).head(10)
rares

Unnamed: 0,pre,suff,count,wikt
365,puke,wad,1.0,True
2142,homo,whore,2.0,True
3211,scum,breath,2.0,True
2847,slut,wad,3.0,True
1434,knob,twat,3.0,True
3835,bird,weed,5.0,True
3342,dirt,brains,6.0,True
2521,dork,weed,7.0,True
2878,slut,waffle,11.0,True
417,puke,balls,13.0,True


In [7]:
md = pprint(rares)
print(md)

path = os.path.join(MD_DIR, 'rare_wikt_terms_table.md')
with open(path, 'w') as f:
    f.write(md)

| term                                                    |   count |
|:--------------------------------------------------------|--------:|
| [pukewad](https://en.wiktionary.org/wiki/pukewad)       |       1 |
| [homowhore](https://en.wiktionary.org/wiki/homowhore)   |       2 |
| [scumbreath](https://en.wiktionary.org/wiki/scumbreath) |       2 |
| [slutwad](https://en.wiktionary.org/wiki/slutwad)       |       3 |
| [knobtwat](https://en.wiktionary.org/wiki/knobtwat)     |       3 |
| [birdweed](https://en.wiktionary.org/wiki/birdweed)     |       5 |
| [dirtbrains](https://en.wiktionary.org/wiki/dirtbrains) |       6 |
| [dorkweed](https://en.wiktionary.org/wiki/dorkweed)     |       7 |
| [slutwaffle](https://en.wiktionary.org/wiki/slutwaffle) |      11 |
| [pukeballs](https://en.wiktionary.org/wiki/pukeballs)   |      13 |
