In [1]:
import sys
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/Colab Notebooks/markov-language-model
sys.path.append('/content/drive/MyDrive/Colab Notebooks/markov-language-model')

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/markov-language-model


In [41]:
import numpy as np
import pandas as pd
from markovmodel import MarkovModel
import requests
from bs4 import BeautifulSoup

In [3]:
get_url = 'https://www.telegraph.co.uk/boris-johnson-archive'
get_url_list = [get_url]

for i in range(2,9): get_url_list.append( get_url+'/page-'+str(i) )

In [4]:
get_url_list

['https://www.telegraph.co.uk/boris-johnson-archive',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-2',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-3',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-4',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-5',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-6',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-7',
 'https://www.telegraph.co.uk/boris-johnson-archive/page-8']

In [5]:
scraped_urls = []
prepend = 'http://www.telegraph.co.uk'

for x in get_url_list:

    page = requests.get(x)
    if page.status_code !=200: raise RuntimeError(f'requests.get failed {page.status_code}')
    soup = BeautifulSoup( page.text, 'html.parser' )
    articles = soup.find_all('article')
    for article in articles:
        links = article.find_all('a')
        for link in links:
            scraped_urls.append( prepend + link.get('href') )

In [6]:
len(scraped_urls) , len(set(scraped_urls))

(739, 276)

In [7]:
scraped_urls = list( set(scraped_urls) )
scraped_urls[:10]

['http://www.telegraph.co.uk/politics/0/simple-way-keep-law-order-make-everyone-kiss-cuddle/',
 'http://www.telegraph.co.uk/politics/0/financial-crisis-eat-spend-merry-not-end-world/',
 'http://www.telegraph.co.uk/politics/0/bbc-investigation-smearing-innocent-mans-name-real-tragedy/',
 'http://www.telegraph.co.uk/politics/0/eu-crisis-greek-austerity-diet-will-leave-feeling-fed/',
 'http://www.telegraph.co.uk/politics/0/tests-say-have-leukaemia-hang-mo-cant-right/',
 'http://www.telegraph.co.uk/politics/2019/05/12/corbyn-style-socialism-cannot-provide-resources-magnificent/',
 'http://www.telegraph.co.uk/politics/0/britain-wont-create-facebook-learn-praise-success/',
 'http://www.telegraph.co.uk/politics/0/special-relationship-one-way-street/',
 'http://www.telegraph.co.uk/politics/0/justice-put-sword-moscows-greed-corruption/',
 'http://www.telegraph.co.uk/politics/0/mick-jaggers-sir-cant-keith-richards-have-satisfaction/']

In [8]:
from tqdm.notebook import tqdm
article_list = []

# request.get() can be fairly slow on google colab
# add a tqdm progress bar to make the wait less frustrating

for url in tqdm(scraped_urls):
    page = requests.get(url)

    # it turns out some of the scraped_url's result in a 404
    # error. Rather than terminating this cell and loosing
    # all progress, print error code and url then skip

    if page.status_code !=200:
        print(f'requests.get failed {page.status_code}')
        print(f'URL {url}')
        continue

    out_string = str()

    page_soup = BeautifulSoup(page.text,'html.parser')
    article = page_soup.article
    paragraph = article.find_all('p')
    for p in paragraph:
        out_string += p.getText()

    article_list.append(out_string)


  0%|          | 0/276 [00:00<?, ?it/s]

requests.get failed 404
URL http://www.telegraph.co.uk/news/2019/04/21/dear-extinction-rebellion-aims-worthy-take-pink-boat-china-instead/


In [9]:
text = ' '.join(article_list)
len( text.split() )

282419

In [16]:
# this cell can take >45mins to execute
# (google colab pro instance Oct 2023)

model_n2 = MarkovModel(text, n=2)
model_n10 = MarkovModel(text, n=10)
model_n25 = MarkovModel(text, n=25)
model_n50 = MarkovModel(text, n=50)

In [42]:
length = 500

a,b,c,d = (model_n2.generate(length), model_n10.generate(length),
           model_n25.generate(length), model_n50.generate(length))

context_len = pd.Series(name='Context String Length', data=[2,10,25,50])
text = pd.Series(name='Generated Text', data=(a,b,c,d))

# some formatting to aid readability
out = pd.DataFrame((context_len,text)).T.style.set_properties(**{'text-align': 'left'})
out.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])

Unnamed: 0,Context String Length,Generated Text
0,2,"410 and mong, plonal reraliall we sion aboyals. Thishe sed, adic we an faimity zon the's thien’t then feet to thelfice feeph, hat hiliggerst youtax a com ans rep itinto that the agenturial (an am ahand betive-artrivers thaterbinell ned he call crest dompok and hatichatch des, twored IbnK “rusially shent mourment, is he go of acked theing the se Lonmeals and woul in thed thared ond the enits, to gesused, is It’s now theignithe rup of for onat distion, the ned whonnothe it the con. Stry hisfortyro"
1,10,"re the bodies L ion and – crash – there would beam Y been a hell of a weekend. I8 sure.InsteA as finished tryi- risks. LonA some disaster:x up artists whoh three of the brighm es provably stimula3 at has scope to take ovt aly, the democratic capitalism. It’s about ideasL out the assi! od people battlingY nly deepening tn be just a fluk$ , the trumped-up R ut Pol-U-Swerve, so h e got to Heaz an has 7 sq ft of self-storage hangars. c arse every word a po— ritish history. fluous banal& a race of c"
2,25,"head, you can physically sK le beer – “probably his greatest cont& n May that he first predicted a snowy c meron’s hand. Across the Con- -bargy that 4,500 police oC e Anglo-American toppling of Sadda' be – and on that crucial issue, wé consistently ahead of the Met Office, “ o like playing Call of Duty a8 e Tube. In the next term we mm ed for 30 centuries?It seems unlikelp f like some frenzied bacilb on the insouciant system X m reality.A villain? Let’s’ to simplify. We should take it.’ turn to the hard-pressed t"
3,50,"dn't see. My thyroid was dandy and so was my renal fd in the past few days by the number of parents who have come up to me Q accepted, because it already has been accepted – byy ?To call someone a paedophile is to place them, thesef men of power would attend his shambolic morning levee in theu ry plant and form of life, and one day the Sun will tm has ever given us.When I first became an MP our numbers had beene Berlin Wall – the ultimate triumph of simple human instincts over an evil and degene"
