### Language modelling in PyTorch

In this tutorial, we will build a language model from our downloaded sermons and use it to power our sermon bot for Twitter. PyTorch was chosen for its dynamic nature and its tight integration with the Python programming language.

In [1]:
import torch
import pandas as pd
import spacy
from tqdm import tqdm_notebook as tqdm
from ipywidgets import IntProgress

In [2]:
# Load the large English spaCy pipeline; `nlp` is the object the later
# cells use to process the sermon paragraphs.
spacy_model_name = 'en_core_web_lg'
nlp = spacy.load(spacy_model_name)

In [3]:
# Read the combined sermon CSV into a DataFrame. The path is relative to
# the notebook's working directory.
sermons_csv = 'Sermons/combined_sermons.csv'
df = pd.read_csv(sermons_csv)

In [4]:
# Split every sermon into paragraphs (separated by a blank line) so each
# paragraph can later be handled as its own short document -- tweets are
# short snippets, so a whole sermon is too coarse a unit.
MIN_PARAGRAPH_CHARS = 10  # paragraphs shorter than this, after stripping, are dropped

sermons = []  # one DataFrame of paragraphs per sermon; concatenated in a later cell

for _, row in df.iterrows():
    raw_text = row['sermon_text']
    # pandas reads an empty CSV field as NaN (a float), which has no
    # .split(); skip such rows instead of crashing the whole loop.
    if not isinstance(raw_text, str):
        continue
    # Strip first, then filter on the stripped length: filtering on the raw
    # length lets whitespace-only / whitespace-padded fragments through as
    # empty or near-empty paragraphs.
    stripped = (p.strip() for p in raw_text.split('\n\n'))
    paragraphs = [p for p in stripped if len(p) >= MIN_PARAGRAPH_CHARS]
    # Repeat the sermon-level metadata on every paragraph row.
    sermon_paragraphs = pd.DataFrame({'paragraph': paragraphs}).assign(
        author=row['author'],
        author_url=row['author_url'],
        title=row['title'],
        url=row['url'],
    )
    sermons.append(sermon_paragraphs)

In [5]:
# Stack the per-sermon paragraph frames into a single table.
# ignore_index=True gives every paragraph row a unique 0..n-1 label; without
# it each frame keeps its own 0..k labels, so index labels repeat across
# sermons and label-based lookups (.loc) become ambiguous.
sermons = pd.concat(sermons, ignore_index=True)

In [6]:
# Extract the paragraphs as a plain Python list of documents, and give each
# row of `sermons` a sequential integer id matching its position in that list.
n_docs = len(sermons)
docs = sermons['paragraph'].tolist()
docids = [*range(n_docs)]
sermons['docids'] = docids

In [8]:
# Lemmatise every paragraph: clean_docs[i] is the list of lemma strings for
# docs[i].
#
# * Only token lemmas are read, so the dependency parser and the named-entity
#   recogniser are disabled for this pass to cut the per-document cost.
# * `n_threads` is intentionally not passed: it has been a deprecated no-op
#   since spaCy 2.1 and is rejected as an unknown argument by spaCy 3.
clean_docs = []

doc_stream = nlp.pipe(docs, batch_size=10, disable=['parser', 'ner'])
# An explicit loop (rather than a comprehension) keeps the partial results in
# clean_docs if the cell is interrupted part-way through.
for doc in tqdm(doc_stream, total=len(docs)):
    lemmas = [token.lemma_ for token in doc]
    clean_docs.append(lemmas)

HBox(children=(IntProgress(value=0, max=27680), HTML(value='')))

KeyboardInterrupt: 