In [80]:
# goal - generate a list of inflected forms based on dpd.csv + declensions & conjugations.xlsx

import pandas as pd

reduce dpd to headword, stem, pattern

In [81]:
headwords_df = pd.read_csv("/home/bhikkhu/Bodhirasa/Dropbox/dpd/csvs/dpd.csv", sep="\t", dtype=str)
headwords_df.fillna("", inplace=True)

headwords_df = headwords_df[["Pāli1", "Stem", "Pattern"]]
headwords_df.to_csv("data/headwords.csv", sep="\t", index=False)
headwords_df

Unnamed: 0,Pāli1,Stem,Pattern
0,ā,-,
1,a 1,-,
2,a 2,-,
3,a 3,-,
4,a 4,-,
...,...,...,...
44807,sotindriyasaṃvara,sotindriyasaṃvar,a masc
44808,ghānindriyasaṃvara,ghānindriyasaṃvar,a masc
44809,jivhindriyasaṃvara,jivhindriyasaṃvar,a masc
44810,kāyindriyasaṃvara,kāyindriyasaṃvar,a masc


read in inflections table

In [82]:
inflection_df = pd.read_excel('/home/bhikkhu/Bodhirasa/Dropbox/dpd/inflection-generator/declensions & conjugations.xlsx', sheet_name="declensions", dtype=str)

inflection_df = inflection_df.shift(periods=2)

inflection_df.columns = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ", "AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU", "AV", "AW", "AX", "AY", "AZ", "BA", "BB", "BC", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BK", "BL", "BM", "BN", "BO", "BP", "BQ", "BR", "BS", "BT", "BU", "BV", "BW", "BX", "BY", "BZ", "CA", "CB", "CC", "CD", "CE", "CF", "CG", "CH", "CI", "CJ", "CK", "CL", "CM", "CN", "CO", "CP", "CQ", "CR", "CS", "CT", "CU", "CV", "CW", "CX", "CY", "CZ", "DA", "DB", "DC", "DD", "DE", "DF", "DG", "DH", "DI", "DJ", "DK"]
inflection_df.fillna("", inplace=True)
inflection_df.to_csv("data/inflection_df.csv", sep="\t", index=False)
inflection_df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,DB,DC,DD,DE,DF,DG,DH,DI,DJ,DK
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,a adj,masc sg,,masc pl,,fem sg,,fem pl,,neut sg,...,,,,,,,,,,
4,nom,o,masc nom sg,ā\nāse,masc nom pl,ā,fem nom sg,ā\nāyo,fem nom pl,aṃ,...,pl,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,,,,,,,,,,,...,,,,,,,,,,
261,,,,,,,,,,,...,,,,,,,,,,
262,,,,,,,,,,,...,,,,,,,,,,
263,,,,,,,,,,,...,,,,,,,,,,


read index

In [83]:
index_df = pd.read_excel('/home/bhikkhu/Bodhirasa/Dropbox/dpd/inflection-generator/declensions & conjugations.xlsx', sheet_name="index", dtype=str)
index_df.fillna("", inplace=True)
index_df_length = len(index_df)
index_df.to_csv("data/index.csv", sep="\t", index=False)
index_df

Unnamed: 0,inflection name,cell range,like,irreg,aka,aka cell range
0,a adj,A3:M12,dīgha,,,
1,ī adj,A14:M23,ātāpī,,in adj,A25:M34
2,ant adj,A36:M45,sutavant,,antu adj,A47:M56
3,u adj,A58:M67,bahu,,,
4,i adj,A69:M78,pūti,,,
...,...,...,...,...,...,...
137,pivi aor,CS157:CW161,,irreg,,
138,eti pr 2,CS163:DA176,,irreg,,
139,nāssa opt,CS178:CW182,,irreg,,
140,kayirā opt,CS184:DA188,,irreg,,


write inflection patterns to csv

In [84]:
import re

for row in range(index_df_length):
	inflection_name = index_df.iloc[row,0]
	cell_range = index_df.iloc[row,1]
	like = index_df.iloc[row,2]
	irreg = index_df.iloc[row,3]
	
	col_range_1 = re.sub("(.+?)\d*\:.+", "\\1", cell_range)
	col_range_2 = re.sub(".+\:(.[A-Z]*)\d*", "\\1", cell_range)
	row_range_1 = int(re.sub(".+?(\d{1,3}):.+", "\\1", cell_range))
	row_range_2 = int(re.sub(".+:.+?(\d{1,3})", "\\1", cell_range))

	# print (f"{inflection_name} || {cell_range} || {col_range_1}:{col_range_2} || {row_range_1}:{row_range_2}")

	inflection_df_filtered = inflection_df.loc[row_range_1:row_range_2, col_range_1:col_range_2]
	inflection_df_filtered.to_csv(f"data/df/{inflection_name}.csv", sep="\t", index=False, header=False)

generate csv for each headword

In [85]:
headwords_df_len = len(headwords_df)

for row in range(headwords_df_len): #headwords_df_len
	headword = headwords_df.iloc[row, 0]
	stem = headwords_df.iloc[row, 1]
	pattern = headwords_df.iloc[row, 2]
	
	if stem == "-":
		headword_clean = re.sub(" \d*$", "", headword)
		with open(f'data/csv/{headword}.csv', 'w') as text_file:
			text_file.write(f"ind\t{headword_clean}")
		pass
	elif stem == "!":
		headword_clean = re.sub(" \d*$", "", headword)
		with open(f'data/csv/{headword}.csv', 'w') as text_file:
			text_file.write(f"ind\t{headword_clean}")
		pass
	elif stem == "*":
		headword_clean = re.sub(" \d*$", "", headword)
		df = pd.read_csv(f"data/df/{pattern}.csv", sep="\t", header=None)
		df.iloc[0,0] = headword_clean
		df.to_csv(f"data/csv/{headword}.csv", sep="\t", header=False, index=False)
		pass
	else:
		headword_clean = re.sub(" \d*$", "", headword)
		df = pd.read_csv(f"data/df/{pattern}.csv", sep="\t", header=None)
		df.fillna("", inplace=True)
		df_rows = df.shape[0]
		df_columns = df.shape[1]

		if df_columns == 13:
			for rows in range(1, df_rows):
				for columns in range(1, df_columns, 2):
					df.iloc[rows, columns] = re.sub(r"(.+)", f"{stem}\\1", df.iloc[rows, columns])

			df.iloc[0,0] = f"{headword} {pattern}"
			df.to_csv(f"data/csv/{headword}.csv", sep="\t", index=False, header=False)

		elif df_columns == 9:
			for rows in range(2, df_rows):
				for columns in range(1, df_columns, 2):
					df.iloc[rows, columns] = re.sub(r"(.+)", f"{stem}\\1", df.iloc[rows, columns])

			df.iloc[0,0] = f"{headword} {pattern}"
			df.to_csv(f"data/csv/{headword}.csv", sep="\t", index=False, header=False)

		elif df_columns == 7:
			pass

		elif df_columns == 5 and df_rows == 10:
			for rows in range(1, df_rows):
				for columns in range(1, df_columns, 2):
					df.iloc[rows, columns] = re.sub(r"(.+)", f"{stem}\\1", df.iloc[rows, columns])

			df.iloc[0,0] = f"{headword} {pattern}"
			df.to_csv(f"data/csv/{headword}.csv", sep="\t", index=False, header=False)

		elif df_columns == 5:
			if stem == "hoti pr":
				for rows in range(2, df_rows):
					for columns in range(1, df_columns, 2):
						df.iloc[rows, columns] = re.sub(r"(.+)", f"{stem}\\1", df.iloc[rows, columns])

				df.iloc[0,0] = f"{headword} {pattern}"
				df.to_csv(f"data/csv/{headword}.csv", sep="\t", index=False, header=False)
			
			else:
				for rows in range(1, df_rows):
					for columns in range(1, df_columns, 2):
						df.iloc[rows, columns] = re.sub(r"(.+)", f"{stem}\\1", df.iloc[rows, columns])

				df.iloc[0,0] = f"{headword} {pattern}"		
				df.to_csv(f"data/csv/{headword}.csv", sep="\t", index=False, header=False)
			

		elif df_columns == 3:
			for rows in range(1,df_rows):
				for columns in range(1, df_columns, 2):
					df.iloc[rows, columns] = re.sub(r"(.+)", f"{stem}\\1", df.iloc[rows, columns])

			df.iloc[0,0] = f"{headword} {pattern}"
			df.to_csv(f"data/csv/{headword}.csv", sep="\t", index=False, header=False)
		
		else:
			print(f"{headword} ({df_rows}x{df_columns})")

FileNotFoundError: [Errno 2] No such file or directory: 'data/df/a fem.csv'

generate df of all inflections with meanings

In [86]:
dpd_df = pd.read_csv("/home/bhikkhu/Bodhirasa/Dropbox/dpd/csvs/dpd.csv", sep="\t", dtype=str)
dpd_df.fillna("", inplace=True)
dpd_df_length = len(dpd_df)


all_inflections = ""

for row in range(dpd_df_length): #dpd_df_length
	headword = dpd_df.loc[row, "Pāli1"]
	headword_clean = re.sub(" \d*$", "", headword)
	stem = dpd_df.loc[row, "Stem"]
	pattern = dpd_df.loc[row, "Pattern"]
	pos = dpd_df.loc[row, "POS"]
	metadata = dpd_df.loc[row, "Metadata"]
	meaning = dpd_df.loc[row, "Meaning IN CONTEXT"]
	

	if pos != "prefix" and pos != "abbrev" and pos != "cs" and pos != "letter" and pos != "root" and pos != "suffix" and pos != "ve" and pos != "idiom" and meaning != "" and metadata != "yes":
		# print(f"{headword=} {headword_clean=} {stem=} {pattern=} {pos=} {metadata=} {meaning=}")

		if stem == "-":
			all_inflections += headword_clean

		elif stem == "!":
			all_inflections += headword_clean

		else:
			try:
				df = pd.read_csv(f"data/csv/{headword}.csv", sep="\t")
				df.fillna("", inplace=True)

				df_rows = df.shape[0]
				df_columns = df.shape[1]
				word_inflections = ""
				
				for rows in range(0, df_rows):
					for columns in range(1, df_columns, 2):

						line = df.iloc[rows, columns]
						search_string = re.compile("\n", re.M)
						replace_string = " "
						matches = re.sub(search_string, replace_string, line)

						word_inflections += matches + " "

				all_inflections += word_inflections
			
			except:
				print(f"error: {headword}")

all_inflections_list = all_inflections.split()
all_inflections_list = list(dict.fromkeys(all_inflections_list))
all_inflections_df = pd.DataFrame(all_inflections_list)
all_inflections_df.to_csv("data/all_inflections_df.csv", index=False, header=False)


error: dvatti
error: dveti
error: kāraṇadesanā
error: paṭhamanayabhūmipariccheda
error: dutiyanayabhūmipariccheda
error: tatiyanayabhūmipariccheda
error: catutthanayabhūmipariccheda
error: pañcamanayabhūmipariccheda
error: chaṭṭhanayabhūmipariccheda
error: sattamanayabhūmipariccheda
error: aṭṭhamanayabhūmipariccheda
error: paṭhamanaya
error: dutiyanaya
error: tatiyanaya
error: catutthanaya
error: pañcamanaya
error: chaṭṭhanaya
error: sattamanaya
error: aṭṭhamanaya
error: sabbāsava
error: sabbāsavasutta
error: sabbāsavasaṃvarapariyāya
error: pahātabbāsava
error: diṭṭhisaṃyojanasaṃyutta
error: cakkhundriyasaṃvarasaṃvuta
error: sotindriyasaṃvarasaṃvuta
error: ghānindriyasaṃvarasaṃvuta
error: jivhindriyasaṃvarasaṃvuta
error: kāyindriyasaṃvarasaṃvuta
error: manindriyasaṃvarasaṃvuta
error: cakkhundriyasaṃvara
error: sotindriyasaṃvara
error: ghānindriyasaṃvara
error: jivhindriyasaṃvara
error: kāyindriyasaṃvara
error: manindriyasaṃvara
