# Example of using the group generation functions

**Note**: restart the kernel whenever the code in `CoxeterArtinGroupGeneration.py` has been updated; otherwise the notebook keeps using the previously imported version.

In [1]:
import logging
import os

import numpy as np

from CoxeterArtinGroupGeneration import getTimestamp, DataGenerator, loadDataset, readDataset, plotFrequencies, setup_logging

In [2]:
# Enable log output (optional). The level passed here is INFO, not DEBUG;
# the "[HH:MM:SS] INFO: Word size ..." progress lines in the recorded cell
# outputs below are records at this level.
setup_logging(level=logging.INFO)

In [3]:
# Configure a DataGenerator and build the train/test dataset.
# NOTE: the statements below mutate `dg` step by step; keep their order.
BR = "."    # break character handed to DataGenerator (presumably the field separator in generated names - confirm)
# timestamp for this job, passed to the generator below
timestamp = getTimestamp()  # format noted by the author: YYYY-MM-DD (not verifiable from this notebook)

# 3x3 symmetric Coxeter matrix: 1 on the diagonal, 3 everywhere else.
# The group it describes is labelled "A2_tilde" via dg.groupName below.
coxeterMatrix = np.array([
    [1, 3, 3],
    [3, 1, 3],
    [3, 3, 1],
])
dg = DataGenerator(coxeterMatrix, dataDir="datasets", timestamp=timestamp, BR=BR)
dg.groupName = "A2_tilde"
dg.mode = 'coxeter'
# NOTE(review): timestamp was already passed to the constructor above;
# this assignment may be redundant - confirm against DataGenerator.__init__.
dg.timestamp = timestamp


# Word-length range, total dataset size and train/test split.
min_wordLen = 6
max_wordLen =  22
# presumably the length every word is padded to (the recorded folder name shows "pad 22") - confirm
fixed_wordLen = max_wordLen
dg.datasetSize = 129300  # total words requested (an earlier note here read "6000 * 2", which no longer matches)
dg.train_size = 0.3      # training fraction; the recorded run reports 38790 train / 90510 test
dg.setSizes(min_wordLen, max_wordLen, fixed_wordLen)

# Build a folder name from the dataset features configured above
# (per the original note, this also updates the generator's folder path).
folderName = dg.generateFolderName()
print(f"Unique folder name for dataset:\n{folderName}")
# Generate the dataset under that folder name (a manually chosen path could be
# passed instead). Long-running: the recorded log below spans roughly 7 minutes.
trainDF, testDF = dg.makeDataset(userDatasetPath=folderName, random_state=1)

Unique folder name for dataset:
0 . A2_tilde . 'coxeter' . 6-22 . pad 22 . size 129,300 . split 30 70


[23:19:37] INFO: Word size 6  done | Time alloted 5.0000| Words Generated 6
[23:19:42] INFO: Word size 8  done | Time alloted 5.0000| Words Generated 6
[23:19:47] INFO: Word size 10 done | Time alloted 5.0000| Words Generated 42
[23:19:52] INFO: Word size 12 done | Time alloted 5.0000| Words Generated 96
[23:19:57] INFO: Word size 14 done | Time alloted 5.0000| Words Generated 366
[23:21:17] INFO: Word size 16 done | Time alloted 80.0000| Words Generated 1194
[23:22:47] INFO: Word size 18 done | Time alloted 90.0000| Words Generated 4360
[23:24:27] INFO: Word size 20 done | Time alloted 100.0000| Words Generated 14640
[23:26:17] INFO: Word size 22 done | Time alloted 109.69909954071045| Words Generated 43940


Training set size: 38790
Testing set size: 90510


# View Frequencies

In [4]:
# Load both word lists written by the generator for the current dataset.
# (`os` is imported in the top import cell.)
trivialWords = readDataset(os.path.join(dg.datasetPath, dg.trivialFile))
nontrivialWords = readDataset(os.path.join(dg.datasetPath, dg.nonTrivialFile))
print("Total Trivial Words:", len(trivialWords))
# Count what was actually loaded instead of assuming the non-trivial file
# holds exactly as many words as the trivial one.
print("Total Words:", len(trivialWords) + len(nontrivialWords))

Total Trivial Words: 64650
Total Words: 129300


In [5]:
# Draw one frequency plot per word class, trivial first.
for freqWords, freqLabel in ((trivialWords, "Trivial"), (nontrivialWords, "Non-Trivial")):
    plotFrequencies(freqWords, wordType=freqLabel)

In [9]:
# Hand-entered trivial-word counts (approximate); the last entry is the
# count for word length 22 according to the note on the final print.
trivialPerLength = (6, 6, 42, 96, 366, 1194, 4369, 14616, 43991)
totalTrivial = sum(trivialPerLength)
print(f"Possible Trivial Words (approx): {totalTrivial}")
print(f"Total Dataset Size: {totalTrivial * 2}")
old22Count = 29219  # previously observed count for length-22 words
print(trivialPerLength[-1] - old22Count)    # new 22 words - old 22 words

Possible Trivial Words (approx): 64686
Total Dataset Size: 129372
14772


# Testing 

In [None]:
# not really accurate approximation function
def f(x):
    """Evaluate a fixed degree-6 polynomial at ``x``.

    The author marks this fit as "not really accurate". It presumably
    approximates the number of trivial words as a function of word length
    (TODO confirm); the recorded output shows it is far off at x = 6.

    Terms are accumulated from the highest power down, one at a time, so
    the floating-point result matches a single left-to-right expression.
    """
    total = 0.03355 * x**6
    total = total - 2.441 * x**5
    total = total + 74.38 * x**4
    total = total - 1207.74 * x**3
    total = total + 10976.86 * x**2
    total = total - 52769.07 * x
    total = total + 104536.18
    return total
# Tabulate the fit at every even x from 6 through 24.
evenInputs = range(6, 26, 2)
for x in evenInputs:
    print(f"x: {x:<3} y: {f(x)}")

x: 6   y: 1197.4528000000282
x: 8   y: 8.503200000035577
x: 10  y: 41.48000000003958
x: 12  y: 124.99119999998948
x: 14  y: 364.8287999998429
x: 16  y: 1273.440799999924
x: 18  y: 4445.387200000288
x: 20  y: 14778.780000001017
x: 22  y: 44242.70719999942
x: 24  y: 117190.64080000023


In [None]:
def testAllotedTime(smallestWord=6, largestWord=30):
  for currWordSize in range(smallestWord, largestWord+2, 2):
    min_wordLen = currWordSize
    max_wordLen =  currWordSize
    fixed_wordLen = max_wordLen
    dg.datasetSize = 1000000  #6000 * 2
    dg.train_size = 0.3
    dg.setSizes(min_wordLen, max_wordLen, fixed_wordLen)

    # generate folder name for dataset using dataset features (updates folderPath)
    folderName = dg.generateFolderName()
    print(f"Unique folder name for dataset:\n{folderName}")
    # define directory path (defined via generation or manually)
    dg.makeDataset(userDatasetPath=folderName, random_state=1)

# Single probe: smallestWord == largestWord == 14, so only word length 14 is run.
testAllotedTime(14,14)

In [4]:
# Earlier candidate value, left disabled:
#wordSizesList = [1194]
# NOTE(review): the next assignment is a dead store - it is overwritten
# immediately by the line after it, so only [12800] is ever used.
wordSizesList = [6,6,42,96,366,1194, 4077, 12800]
wordSizesList = [12800]
def tryMaxWordSize(smallestWord, largestWord, wordSizesList):
    """Generate one single-length dataset per word length, with a per-length size.

    Word lengths run from ``smallestWord`` up to and including
    ``largestWord`` in steps of 2. The i-th length uses
    ``wordSizesList[i] * 2`` as ``dg.datasetSize`` and a 50/50 split
    (``dg.train_size = 0.5``) on the notebook-global ``dg``.

    Args:
        smallestWord: first word length to try.
        largestWord: upper bound on the word length; never exceeded, even
            when it differs in parity from ``smallestWord``.
        wordSizesList: sequence with one entry per word length in the
            range, in order. Surplus entries are ignored.

    Raises:
        IndexError: if ``wordSizesList`` has fewer entries than there are
            word lengths. Raised before any dataset is generated.

    Side effects:
        Mutates ``dg``, prints each generated folder name, and triggers
        dataset generation through ``dg.makeDataset``.
    """
    # Stop at largestWord + 1 (exclusive) so the loop cannot overshoot the
    # upper bound when the two bounds have different parity.
    wordLengths = range(smallestWord, largestWord + 1, 2)

    # Fail fast: generation can take minutes per length, so a too-short list
    # should be reported before any work is done, not midway through.
    if len(wordSizesList) < len(wordLengths):
        raise IndexError(
            f"wordSizesList has {len(wordSizesList)} entries but "
            f"{len(wordLengths)} word lengths were requested"
        )

    for currWordSize, halfSize in zip(wordLengths, wordSizesList):
        dg.datasetSize = halfSize * 2
        dg.train_size = 0.5
        dg.setSizes(currWordSize, currWordSize, currWordSize)

        # Folder name is derived from the dataset features set just above.
        folderName = dg.generateFolderName()
        print(f"Unique folder name for dataset:\n{folderName}")
        dg.makeDataset(userDatasetPath=folderName, random_state=1)

# Reference table keyed by word length. The first six values match the
# "Words Generated" counts in the recorded generation log for lengths 6-16.
# NOTE(review): nothing in the visible code reads this dict; the call below
# uses wordSizesList instead.
lengthSizeDict = {
  6: 6,
  8: 6,
  10: 42,
  12: 96,
  14: 366,
  16: 1194,
  18: 4077,
}
# Try word length 18 only, using the first entry of wordSizesList as the
# per-class count (datasetSize becomes twice that entry).
# NOTE(review): the run time for this call was never recorded here.
tryMaxWordSize(18,18, wordSizesList)

Unique folder name for dataset:
1 . A2_tilde . 'coxeter' . 18-18 . pad 18 . size 25,600 . split 50 50


[11:15:31] INFO: Word size 18 done | Time alloted 150.0000| Words Generated 4373


Training set size: 4373
Testing set size: 4373
