In [2]:
import pandas as pd
import numpy as np
import ast
import re

In [3]:
df = pd.read_csv('/dlabdata1/lugeon/datasets/wikilinks.gz', index_col=0)

In [4]:
df.head(1)

Unnamed: 0,page_id,title,link,fullcat,cat1,cat2,cat3
0,6632119,Gladstone Pottery Museum,www.stokemuseums.org.uk/visit/gpm,"Culture.Visual arts.Architecture,Culture.Visua...",Culture,Visual arts,"Architecture,Visual arts*"


In [23]:
df = df[df.cat1.notna()]
df = df[df.cat2.notna()]
df = df[df.link.notna()]

In [29]:
def clean_url(url):
    url = re.sub(r"www.|http://|https://|-|_", '', url)
    return url.split('.')[0]

In [33]:
df.link = df.link.apply(lambda url: re.sub(r"http://|https://", '', url.strip()))

### Exploring the links

In [34]:
df.link.apply(lambda x: x.startswith('www')).value_counts()

True     353577
False    144642
Name: link, dtype: int64

In [35]:
df.link.apply(lambda x: x.startswith('http')).value_counts()

False    498216
True          3
Name: link, dtype: int64

In [38]:
df.link.apply(lambda x: '//' in x).value_counts()

False    497963
True        256
Name: link, dtype: int64

In [40]:
df[df.link.apply(lambda x: '//' in x)].head(5)

Unnamed: 0,page_id,title,link,fullcat,cat1,cat2,cat3
3707,10896890,RusNet,irc://irc.rus-net.org,"Geography.Regions.Asia.Asia*,Geography.Regions...","Culture,Geography","Internet culture,Regions","Asia,Europe"
4794,7713775,Pemberton Township High School,pemberton.schoolwires.net//site/Default.aspx?P...,"History and Society.Education,Geography.Region...","Geography,History and Society","Education,Regions",Americas
5436,1155769,SlashNET,irc://irc.slashnet.org,Culture.Internet culture,Culture,Internet culture,
9161,398847,Apache Nutch,//nutch.apache.org,"STEM.Technology,Culture.Media.Media*,Culture.M...","Culture,STEM","Computing,Media,STEM*,Technology","Media*,Software"
9308,23477385,KAJT,mms://206.192.61.163/sonlife,"Geography.Regions.Americas.North America,Cultu...","Culture,Geography","Media,Regions","Americas,Media*,Radio"


In [41]:
def is_homepage(link):
    return not('/' in link) or (link.count('/') == 1 and link.endswith('/'))

In [44]:
df.link.apply(is_homepage).value_counts()

True     399046
False     99173
Name: link, dtype: int64

In [45]:
df_hp = df[df.link.apply(is_homepage)]

### Using cat 1

In [66]:
df_hp.cat1.apply(lambda x: x.split(',')).apply(lambda x: len(x)).value_counts()

1    195590
2    161528
3     39552
4      2376
Name: cat1, dtype: int64

In [67]:
mask_cat1_unique = df_hp.cat1.apply(lambda x: x.split(',')).apply(lambda x: len(x)) == 1
df_cat1 = df_hp[mask_cat1_unique]

In [68]:
df_cat1.cat1.value_counts()

Geography              121415
Culture                 57242
History and Society     12173
STEM                     4760
Name: cat1, dtype: int64

In [69]:
df_hp.cat1.apply(lambda x: 'STEM' in x).value_counts()

False    366267
True      32779
Name: cat1, dtype: int64

In [70]:
df_hp.reset_index(inplace=True, drop=True)

In [71]:
categories = df_cat1.cat1.unique()

df_train = pd.DataFrame([])
df_test = pd.DataFrame([])

nsamples = 30_000
test_frac = 0.2

for c in categories:
    mask = df_hp.cat1.apply(lambda x: c in x)
    df_c = df_hp[mask]
    df_c = df_c.sample(nsamples)[['page_id', 'link']]
    df_c['cat1'] = [re.sub(r' ', '_', c)] * nsamples
    ix = np.arange(nsamples)
    np.random.shuffle(ix)
    sep = int(nsamples * test_frac)
    df_train = pd.concat((df_train, df_c.iloc[ix[sep:]]))
    df_test = pd.concat((df_test, df_c.iloc[ix[:sep]]))

In [72]:
df_train.head(5)

Unnamed: 0,page_id,link,cat1
390526,7305916,columbiacitypaper.com,Culture
83322,18586680,key.smtown.com,Culture
62575,19094997,www.nativefederation.org,Culture
14958,17368275,www.scandal-4.com,Culture
72569,47437477,www.pakswim.com,Culture


In [73]:
df_test.head(5)

Unnamed: 0,page_id,link,cat1
173715,3781075,www.restarts.co.uk,Culture
97519,10952024,ftisland-official.jp,Culture
79318,13940895,www.tylorstown.rfc.wales,Culture
244939,2288327,sarahdash.net,Culture
76026,40749041,www.overvaalstereo.co.za,Culture


In [74]:
df_train.shape, df_test.shape

((96000, 3), (24000, 3))

In [75]:
df_train.to_csv('/dlabdata1/lugeon/websites_wiki_30000_4cat1_train.gz', compression='gzip')
df_test.to_csv('/dlabdata1/lugeon/websites_wiki_30000_4cat1_valid.gz', compression='gzip')

### Using cat 2

In [41]:
allcat2 = pd.Series([c_ for c in df.cat2 for c_ in ast.literal_eval(c)])

In [42]:
allcat2.value_counts()

Regions                    380155
Media                      100931
Biography                   97726
Education                   53101
Business and economics      48285
STEM*                       46325
Sports                      36357
Politics and government     32568
Geographical                30846
Visual arts                 24130
Transportation              19324
Philosophy and religion     15187
Literature                  14738
Technology                  13740
Society                     13699
Engineering                 10177
Computing                    9441
Internet culture             8708
Medicine & Health            6377
Food and drink               6013
Military and warfare         4776
History                      3848
Biology                      3268
Performing arts              3086
Earth and environment        1956
Libraries & Information      1923
Space                        1897
Linguistics                   777
Physics                       572
Chemistry     

In [79]:
selected_cat2 = ['Biography', 'Business and economics', 'Education', 'Visual arts', 'Geographical', 'Politics and government', 'Sports', 'Transportation']

In [80]:
df.link.apply(lambda url: not('/' in str(url))).value_counts()

True     402278
False    100778
Name: link, dtype: int64

In [81]:
for c in selected_cat2:
    mask = df.cat2.apply(lambda x: c in x)
    df_c = df[mask]
    df_c = df_c[df_c.link.apply(lambda url: not('/' in str(url)))]
    print('{} : {} valid urls'.format(c, df_c.shape[0]))

Biography : 81330 valid urls
Business and economics : 43255 valid urls
Education : 43122 valid urls
Visual arts : 18429 valid urls
Geographical : 24289 valid urls
Politics and government : 25189 valid urls
Sports : 29908 valid urls
Transportation : 10678 valid urls


In [95]:
df_train = pd.DataFrame([])
df_test = pd.DataFrame([])

nsamples = 10_000
test_frac = 0.2

for c in selected_cat2:
    mask = df.cat2.apply(lambda x: c in x)
    df_c = df[mask]
    df_c = df_c[df_c.link.apply(lambda url: not('/' in str(url)))]
    df_c = df_c.sample(nsamples)[['page_id', 'link']]
    df_c['cat2'] = [re.sub(r' ', '_', c)] * nsamples
    ix = np.arange(nsamples)
    np.random.shuffle(ix)
    sep = int(nsamples * test_frac)
    df_train = pd.concat((df_train, df_c.iloc[ix[sep:]]))
    df_test = pd.concat((df_test, df_c.iloc[ix[:sep]]))

[5001 7350 5608 ... 2379 5090 2225]
[4106 4973 2717 ... 7893  769 8140]
[6388 4582 4443 ... 5843  641 8586]
[6925 6345 9058 ...  660 1904 3562]
[7726 1137 6345 ... 6843 5224 6945]
[1870 4038 2941 ... 6683  333 2938]
[3968  459 9344 ... 2714 7677 9583]
[3483 6638 2514 ... 5314 8686 1023]


In [83]:
df_train.shape

(64000, 3)

In [84]:
df_test.shape

(16000, 3)

In [85]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [91]:
df_train.to_csv('/dlabdata1/lugeon/websites_wiki_10_000_9cat_train.gz', compression='gzip')
df_test.to_csv('/dlabdata1/lugeon/websites_wiki_10_000_9cat_valid.gz', compression='gzip')

In [92]:
pd.read_csv('/dlabdata1/lugeon/websites_wiki_10_000_9cat_train.gz', header=0, names=['uid', 'url', 'cat0'])

Unnamed: 0,uid,url,cat0
0,53444543,vvnijnsel.nl,Biography
1,408803,www.petestark.com,Biography
2,56769455,www.ghostlightband.com,Biography
3,25293838,www.thecoalporters.com,Biography
4,41082806,senatorpeterwirth.com,Biography
...,...,...,...
63995,23121059,www.polestar.com,Transportation
63996,47528895,en.motcmpb.gov.tw,Transportation
63997,275681,www.ansett.com.au,Transportation
63998,4579741,www.vag.de,Transportation
