In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# A single call applies the default seaborn theme together with the larger
# figure size; `sns.set` is only a legacy alias of `sns.set_theme`, so calling
# both in sequence was redundant.
sns.set_theme(rc={'figure.figsize': (11, 8)})

# Raw table exactly as read from disk; the filtered frame is derived from it below.
df_raw = pd.read_csv('merged_600k.csv')

def dropped_nan_views(df):
    """Return ``df`` without the rows that have a NaN in any monthly column.

    The monthly columns are the twelve columns labelled with the strings
    '1' through '12'. NaNs in other columns do not cause a row to be dropped.
    The input frame is not modified.
    """
    month_columns = list(map(str, range(1, 13)))
    return df.dropna(subset=month_columns, axis='index')

# Work on an independent copy of the filtered rows; df_raw itself stays as loaded.
df = dropped_nan_views(df_raw).copy()
print(f"Went from {len(df_raw)} to {len(df)} rows after dropping nans.")

# Preview the first rows of the filtered frame.
df.head()

Went from 600000 to 568998 rows after dropping nans.


Unnamed: 0,page_id,page_title,page_random,gap,1,2,3,4,5,6,7,8,9,10,11,12,total
0,1711095,USS_Bulmer,7.8414e-08,,121.0,82.0,89.0,74.0,73.0,59.0,79.0,68.0,70.0,67.0,62.0,71.0,915
1,126662,"Palatine_Bridge,_New_York",0.7465332,0.0,231.0,210.0,234.0,197.0,251.0,239.0,215.0,195.0,186.0,218.0,186.0,176.0,2538
2,934819,Lithium_nitrate,0.1458007,0.0,1996.0,1783.0,1902.0,1516.0,1338.0,1314.0,1108.0,1046.0,1506.0,1530.0,1441.0,1244.0,17724
3,127803,"Lattimore,_North_Carolina",0.9447616,0.0,208.0,220.0,176.0,175.0,168.0,153.0,168.0,175.0,189.0,162.0,123.0,158.0,2075
4,58768,Vicente_Aleixandre,0.1979226,0.0,841.0,860.0,1037.0,1102.0,879.0,717.0,658.0,794.0,772.0,970.0,796.0,929.0,10355


In [2]:
# Show the 50 rows with the smallest `total` value, in ascending order.
df.sort_values(by='total').head(50)

Unnamed: 0,page_id,page_title,page_random,gap,1,2,3,4,5,6,7,8,9,10,11,12,total
133268,42356525,Trichromia_phaeocrota,0.726997,3.576e-09,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3
36623,42022068,Opharus_corticea,0.308701,9.59e-10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3
583447,63572986,Wanshousi_station,0.231701,1.629e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3
6414,51230438,Yimnashana_hamulata,0.840232,1.58243e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,4
46275,30512602,Telihigala,0.891961,1.218e-09,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,4
259624,45211443,Hybolasiopsis_abnormalis,0.084192,7.051e-09,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,4
16540,47711283,Gnorimoschema_debenedictisi,0.187145,4.24e-10,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,5
175140,50439976,Pseudoneuroterus_mazandarani,0.715915,4.716447e-09,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,5
254342,30495695,Deiannewela,0.337238,6.907e-09,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,5
77191,13521162,Phenacoceratidae,0.679262,2.053e-09,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,5


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the `gap` column.
df.gap.describe()

count    5.689970e+05
mean     8.240131e-09
std      4.843563e-09
min      0.000000e+00
25%      4.033000e-09
50%      8.169000e-09
75%      1.241500e-08
max      1.677500e-08
Name: gap, dtype: float64

In [18]:
# Peek at ten gap values multiplied by 1e9, the same scaling the exported
# table applies. A fixed random_state makes the draw repeatable when the
# notebook is re-run from a fresh kernel.
(df.gap.sample(10, random_state=0) * 10**9)

582718    16.268
533221    14.812
29250      0.764
240810     6.528
436150    12.016
284867     7.751
81664      2.173
449918    12.413
457382    12.626
580565    16.206
Name: gap, dtype: float64

In [29]:
# Maybe better to export as a table?
def export_links_list(df):
    """Return the page titles in ``df`` as a wikitext bullet list of links.

    Each row contributes one line of the form ``* [[Title]]`` followed by a
    newline, with underscores in ``page_title`` replaced by spaces. An empty
    frame yields the empty string.
    """
    bullets = [
        '* [[' + row.page_title.replace('_', ' ') + ']]\n'
        for _, row in df.iterrows()
    ]
    return ''.join(bullets)

def _wiki_row(r, annotate):
    title = r['page_title'].replace('_', ' ')
    if annotate:
        link = r'{{Annotated link|' + title + r'}}'
    else:
        link = '[[' + title + ']]'
    return "|-\n| {} || {} || {:.3f} || {}".format(
            link,
            r.total,
            r.gap * 10**9,
            r'{{PAGESIZE:' + title + r'}}',
    )


def export_table(df, annotate=True):
    """Render ``df`` as wikitext: an explanatory paragraph plus a sortable table.

    Parameters:
        df: frame whose rows are passed one by one to ``_wiki_row``.
        annotate: forwarded to ``_wiki_row`` to choose the title link style.

    Returns:
        The preamble paragraph, the table header, one ``_wiki_row`` entry per
        row of ``df`` (each followed by a newline), and the closing ``|}``.
    """
    # TODO: add a note about noise. e.g. Wanshousi station.
    intro = ''.join([
        "Putative least-viewed mainspace articles (not including disambiguation pages) in 2021. ",
        'For ease of reading, the "Random gap" column has been multiplied by 1 billion. ',
        'In other words, they should all be read as being followed by e-9 in [[scientific notation]]. ',
        'The average random gap value across all articles is about 170 on this scale (i.e. 1.7e-7).\n\n',
    ])
    table_open = (
        '{| class="wikitable sortable"\n'
        '|-\n'
        '! Title !! 2021 views !! Random gap (parts per billion) !! Current size (bytes)\n'
    )
    body = ''.join(_wiki_row(row, annotate) + '\n' for _, row in df.iterrows())
    return intro + table_open + body + r'|}'
    
    

# Export the n least-viewed articles, both as a bullet list of links and as a wikitable.
n = 500
sub = df.sort_values(by='total').head(n)
# The helper defined in this cell is `export_links_list`; the previous call to
# `export_links` only worked while a stale definition lingered in the kernel.
s = export_links_list(sub)
t = export_table(sub)
# print() rather than rich display: the wikitext is meant to be copied verbatim.
print(t)

Putative least-viewed mainspace articles (not including disambiguation pages) in 2021. For ease of reading, the "Random gap" column has been multiplied by 1 billion. In other words, they should all be read as being followed by e-9 in [[scientific notation]]. The average random gap value across all articles is about 170 on this scale (i.e. 1.7e-7).

{| class="wikitable sortable"
|-
! Title !! 2021 views !! Random gap (parts per billion) !! Current size (bytes)
|-
| {{Annotated link|Trichromia phaeocrota}} || 3 || 3.576 || {{PAGESIZE:Trichromia phaeocrota}}
|-
| {{Annotated link|Opharus corticea}} || 3 || 0.959 || {{PAGESIZE:Opharus corticea}}
|-
| {{Annotated link|Wanshousi station}} || 3 || 16.290 || {{PAGESIZE:Wanshousi station}}
|-
| {{Annotated link|Yimnashana hamulata}} || 4 || 0.158 || {{PAGESIZE:Yimnashana hamulata}}
|-
| {{Annotated link|Telihigala}} || 4 || 1.218 || {{PAGESIZE:Telihigala}}
|-
| {{Annotated link|Hybolasiopsis abnormalis}} || 4 || 7.051 || {{PAGESIZE:Hybolasiopsi