### English Tense Table Constructor
This notebook demonstrates how English verb tables for regular/irregular verbs can be automatically-generated using a few rules and regex. The notebook makes use of sre_yield to generate all strings matching a given expression.

In [20]:
# Imports.
import pandas as pd
import bs4
import requests
import re
import sre_yield
from nltk.corpus import cmudict
import nltk
nltk.download('cmudict')
import src.verb_inflect as verb_inflect

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Christian\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [11]:
# Global variables.
d = cmudict.dict()

In [12]:
# Function definitions.
def nsyl(word):
    return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]

In [26]:
with open('res/regular_verbs.html', 'w') as f:
    f.write(webpage)

In [27]:
# Load regular verb data.
try:
    webpage = requests.get('https://www.englishclub.com/vocabulary/regular-verbs-list.htm').text
    
except Exception:
    with open('res/regular_verbs.html', 'r') as f:
        webpage = f.read()
        
soup = bs4.BeautifulSoup(webpage, 'html.parser')
regulars = []
for body in soup.find_all('tbody'):
    for tr in body.find_all('tr'):
        for td in tr.find_all('td'):
            words = str(td)
            regulars += words.replace('<td>', ' ').replace('<br>', ' ').replace('</br>', ' ').replace('</td>', ' ').replace('(AmE)', ' ').replace('(BrE)', ' ').split()

verb_stems_regular = list(filter(lambda x: len(x) > 0, regulars))

In [28]:
# Process regular verb data.
df_regular = pd.DataFrame(index=[i for i in range(len(verb_stems_regular))], columns=['source'])
filler = [str(0) for i in range(len(df_regular.values))]
df_regular['lemma'] = verb_stems_regular
df_regular['wethey_past'] = filler
df_regular['pastpart'] = filler
df_regular['i_pres'] = filler
df_regular['wethey_pres'] = filler
df_regular['you_pres'] = filler
df_regular['hsi_pres'] = filler
df_regular['prespart'] = filler
df_regular['i_past'] = filler
df_regular['you_past'] = filler
df_regular['hsi_past'] = filler

In [29]:
# Load and process irregular verb data.
df_irregular = pd.read_csv('res/verbs_irregular.csv').drop(['german'], axis=1)
filler = [str(0) for i in range(len(df_irregular.values))]
df_irregular['i_pres'] = filler
df_irregular['wethey_pres'] = filler
df_irregular['you_pres'] = filler
df_irregular['hsi_pres'] = filler
df_irregular['prespart'] = filler
df_irregular['i_past'] = filler
df_irregular['you_past'] = filler
df_irregular['hsi_past'] = filler

In [30]:
# Establish endings for different regular inflection paradigms.
es_endings = ['ch', 'sh', 's', 'x', 'z', 'ss']
cy_endings = []
for each in sre_yield.AllStrings(r'[b-df-hj-np-tv-z]y'):
    cy_endings.append(each)

vc_endings = []
for each in sre_yield.AllStrings(r'[b-df-hj-np-tv-xz][aeiou][b-df-hj-np-tv-xz]'):
    vc_endings.append(each)
    
very_irregular = ['may', 'be', 'do', 'have'] # TODO: account for 'lie' forms

In [31]:
# Irregular verb inflections.
for i in range(len(df_irregular)):
        
	if 'kein Participle' in str(df_irregular.at[i, 'pastpart']):
		df_irregular.at[i, 'pastpart'] = '[None]'
	elif len(df_irregular.at[i, 'pastpart'].split(', ')) >= 2:
		tpl = df_irregular.at[i, 'pastpart'].split(', ')
		df_irregular.at[i, 'pastpart'] = tpl[0]
	elif len(df_irregular.at[i, 'pastpart'].split(', ')) == 1:
		df_irregular.at[i, 'pastpart'] = df_irregular.at[i, 'pastpart']

	df_irregular.at[i, 'i_pres'] = df_irregular.at[i, 'lemma']
	df_irregular.at[i, 'wethey_pres'] = df_irregular.at[i, 'lemma']
	df_irregular.at[i, 'you_pres'] = df_irregular.at[i, 'lemma']
	
	if any(df_irregular.at[i, 'lemma'].endswith(ending) for ending in es_endings):
		df_irregular.at[i, 'hsi_pres'] = df_irregular.at[i, 'lemma'] + 'es'

	elif any(df_irregular.at[i, 'lemma'].endswith(ending) for ending in cy_endings):
		df_irregular.at[i, 'hsi_pres'] = df_irregular.at[i, 'lemma'][:-1] + 'ies'

	else:
		df_irregular.at[i, 'hsi_pres'] = df_irregular.at[i, 'lemma'] + 's'

	if df_irregular.at[i, 'lemma'].endswith('ie'):
		df_irregular.at[i, 'prespart'] = df_irregular.at[i, 'lemma'][:-2] + 'ying'

	elif df_irregular.at[i, 'lemma'].endswith('e'):
		df_irregular.at[i, 'prespart'] = df_irregular.at[i, 'lemma'][:-1] + 'ing'

	else:
		try:
			if ((nsyl(df_irregular.at[i, 'lemma']) == 2) and (df_irregular.at[i, 'lemma'][-3:] in vc_endings) 
                                            and (df_irregular.at[i, 'lemma'][-1:] not in ['w', 'x'])
                                            and [i for i in d[df_irregular.at[i, 'lemma']][0] if len(str(i)) == 3][0][2] == '1'):
				df_irregular.at[i, 'prespart'] = df_irregular.at[i, 'lemma'] + 'ing'                        
			elif (nsyl(df_irregular.at[i, 'lemma']) == 1 or 2) and (df_irregular.at[i, 'lemma'][-3:] in vc_endings) and (df_irregular.at[i, 'lemma'][-1:] not in ['w', 'x']):
				df_irregular.at[i, 'prespart'] = df_irregular.at[i, 'lemma'] + df_irregular.at[i, 'lemma'][-1:] + 'ing'

			else:
				df_irregular.at[i, 'prespart'] = df_irregular.at[i, 'lemma'] + 'ing'

		except KeyError:
			df_irregular.at[i, 'prespart'] = df_irregular.at[i, 'lemma'] + 'ing' 
            
	if len(df_irregular.at[i, 'wethey_past'].split(', ')) == 3:
		tpl = df_irregular.at[i, 'wethey_past'].split(', ')
		df_irregular.at[i, 'wethey_past'] = tpl[2]
		df_irregular.at[i, 'hsi_past'] = tpl[0]
            
	elif len(df_irregular.at[i, 'wethey_past'].split(', ')) == 2:
		tpl = df_irregular.at[i, 'wethey_past'].split(', ')
		df_irregular.at[i, 'wethey_past'] = tpl[1]
		df_irregular.at[i, 'hsi_past'] = tpl[0]

	elif len(df_irregular.at[i, 'wethey_past'].split(', ')) == 1:
		df_irregular.at[i, 'hsi_past'] = df_irregular.at[i, 'wethey_past']
		
	df_irregular.at[i, 'i_past'] = df_irregular.at[i, 'wethey_past']
	df_irregular.at[i, 'you_past'] = df_irregular.at[i, 'wethey_past']
    
	if df_irregular.at[i, 'lemma'] in very_irregular:
		if df_irregular.at[i, 'lemma'] == 'be':
			df_irregular.at[i, 'i_past'] = 'was'
			df_irregular.at[i, 'i_pres'] = 'am'
			df_irregular.at[i, 'wethey_pres'] = 'are'
			df_irregular.at[i, 'you_pres'] = 'are'
			df_irregular.at[i, 'hsi_pres'] = 'is'
			df_irregular.at[i, 'prespart'] = 'being'
            
		if df_irregular.at[i, 'lemma'] == 'do':
			df_irregular.at[i, 'hsi_pres'] = 'does'
		if df_irregular.at[i, 'lemma'] == 'have':
			df_irregular.at[i, 'hsi_pres'] = 'has'
		if df_irregular.at[i, 'lemma'] ==  'may':
			df_irregular.at[i, 'hsi_pres'] = 'may'
			df_irregular.at[i, 'prespart'] = '[None]'

In [32]:
# Regular verb inflections.
for i in range(len(df_regular)):
	df_regular.at[i, 'lemma'] = verb_inflect.verb_infinitive(df_regular.at[i, 'lemma'])

	if not re.match(r'[a-z]', df_regular.at[i, 'lemma']):
		df_regular.at[i, 'lemma'] = '&&&'
	
	if df_regular.at[i, 'lemma'].endswith('ie'):
		df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'] + 'd'

	elif df_regular.at[i, 'lemma'].endswith('e'):
		df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'] + 'd'

	elif any(df_regular.at[i, 'lemma'].endswith(ending) for ending in cy_endings):
		df_regular.at[i, 'hsi_pres'] = df_regular.at[i, 'lemma'][:-1] + 'ied'
		df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'][:-1] + 'ied'

	else:
		try:
			if ((nsyl(df_regular.at[i, 'lemma']) == 2) and (df_regular.at[i, 'lemma'][-3:] in vc_endings) 
                                            and (df_regular.at[i, 'lemma'][-1:] not in ['w', 'x'])
                                            and [i for i in d[df_regular.at[i, 'lemma']][0] if len(str(i)) == 3][0][2] == '1'):
				df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'] + 'ed'
			elif (nsyl(df_regular.at[i, 'lemma']) == (1 or 2)) and (df_regular.at[i, 'lemma'][-3:] in vc_endings) and (df_regular.at[i, 'lemma'][-1:] not in ['w', 'x']):
				df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'] + df_regular.at[i, 'lemma'][-1:] + 'ed'
                
			else:
				df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'] + 'ed'
				df_regular.at[i, 'hsi_pres'] = df_regular.at[i, 'lemma'] + 'ed'

		except KeyError:
			df_regular.at[i, 'wethey_past'] = df_regular.at[i, 'lemma'] + 'ed' 

	df_regular.at[i, 'pastpart'] = df_regular.at[i, 'wethey_past']
	df_regular.at[i, 'i_pres'] = df_regular.at[i, 'lemma']
	df_regular.at[i, 'wethey_pres'] = df_regular.at[i, 'lemma']
	df_regular.at[i, 'you_pres'] = df_regular.at[i, 'lemma']
	
	if any(df_regular.at[i, 'lemma'].endswith(ending) for ending in es_endings):
		df_regular.at[i, 'hsi_pres'] = df_regular.at[i, 'lemma'] + 'es'

	elif any(df_regular.at[i, 'lemma'].endswith(ending) for ending in cy_endings):
		df_regular.at[i, 'hsi_pres'] = df_regular.at[i, 'lemma'][:-1] + 'ies'

	else:
		df_regular.at[i, 'hsi_pres'] = df_regular.at[i, 'lemma'] + 's'

	if df_regular.at[i, 'lemma'].endswith('ie'):
		df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'][:-2] + 'ying'

	elif df_regular.at[i, 'lemma'].endswith('ee'):
		df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'] + 'ing'

	elif df_regular.at[i, 'lemma'].endswith('e'):
		df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'][:-1] + 'ing'

	else:
		try:
			if ((nsyl(df_regular.at[i, 'lemma']) == 2) and (df_regular.at[i, 'lemma'][-3:] in vc_endings) 
                                            and (df_regular.at[i, 'lemma'][-1:] not in ['w', 'x'])
                                            and [i for i in d[df_regular.at[i, 'lemma']][0] if len(str(i)) == 3][0][2] == '1'):
				df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'] + 'ing'
			elif (nsyl(df_regular.at[i, 'lemma']) == (1 or 2)) and (df_regular.at[i, 'lemma'][-3:] in vc_endings) and (df_regular.at[i, 'lemma'][-1:] not in ['w', 'x']):
				df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'] + df_regular.at[i, 'lemma'][-1:] + 'ing'
                
			else:
				df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'] + 'ing'

		except KeyError:
			df_regular.at[i, 'prespart'] = df_regular.at[i, 'lemma'] + 'ing' 


	df_regular.at[i, 'i_past'] = df_regular.at[i, 'wethey_past']
	df_regular.at[i, 'you_past'] = df_regular.at[i, 'wethey_past']
	df_regular.at[i, 'hsi_past'] = df_regular.at[i, 'wethey_past']

	if df_regular.at[i, 'lemma'] == '&&&':
		df_regular = df_regular.drop(index=i)

df_regular = df_regular.drop('source', axis=1)

In [33]:
# Combine regular and irregular verb lists.
df_combined = pd.concat([df_irregular, df_regular], sort=False)

In [34]:
df_combined

Unnamed: 0,lemma,wethey_past,pastpart,i_pres,wethey_pres,you_pres,hsi_pres,prespart,i_past,you_past,hsi_past
0,alight,alit,alighted,alight,alight,alight,alights,alighting,alit,alit,alighted
1,arise,arose,arisen,arise,arise,arise,arises,arising,arose,arose,arose
2,awake,awaked,awoken,awake,awake,awake,awakes,awaking,awaked,awaked,awoke
3,be,were,been,am,are,are,is,being,was,were,was
4,bear,bore,borne,bear,bear,bear,bears,bearing,bore,bore,bore
...,...,...,...,...,...,...,...,...,...,...,...
88,visit,visited,visited,visit,visit,visit,visits,visiting,visited,visited,visited
94,yawn,yawned,yawned,yawn,yawn,yawn,yawns,yawning,yawned,yawned,yawned
95,yell,yelled,yelled,yell,yell,yell,yells,yelling,yelled,yelled,yelled
96,zip,zipped,zipped,zip,zip,zip,zips,zipping,zipped,zipped,zipped
