# Example of using the group generation functions

**Note**: Restart the kernel after editing `CoxeterArtinGroupGeneration.py`; otherwise the previously imported version of the module stays in use.

In [1]:
import logging
import os

import numpy as np

from CoxeterArtinGroupGeneration import getTimestamp, DataGenerator, loadDataset, readDataset, plotFrequencies, setup_logging

In [2]:
# Optional: hand the INFO log level to setup_logging (swap in logging.DEBUG for debug-level output).
# The "Word size ... done" progress lines printed under the generation cell are INFO messages.
setup_logging(level=logging.INFO)

In [3]:
# --- Job-wide settings ---
BR = "."                    # break character handed to DataGenerator
timestamp = getTimestamp()  # job timestamp; format: YYYY-MM-DD

# Coxeter matrix for the group: 1 on the diagonal, 3 in every off-diagonal entry.
coxeterMatrix = np.array([[1, 3, 3],
                          [3, 1, 3],
                          [3, 3, 1]])

# --- Generator setup ---
dg = DataGenerator(coxeterMatrix, dataDir="datasets", timestamp=timestamp, BR=BR)
dg.groupName = "A2_tilde"
dg.mode = 'coxeter'
dg.timestamp = timestamp

# --- Word lengths, dataset size, train/test split ---
min_wordLen, max_wordLen = 6, 22
fixed_wordLen = max_wordLen   # fixed length equals the longest word (shows up as "pad 22" in the folder name)
dg.datasetSize = 100_000
dg.train_size = 0.3           # 30% train / 70% test
dg.setSizes(min_wordLen, max_wordLen, fixed_wordLen)

# --- Build the dataset ---
# The folder name is derived from the settings above; it is then passed to
# makeDataset as the dataset path (a hand-written path could be passed instead).
folderName = dg.generateFolderName()
print(f"Unique folder name for dataset:\n{folderName}")
trainDF, testDF = dg.makeDataset(userDatasetPath=folderName, random_state=1)

Unique folder name for dataset:
0 . A2_tilde . 'coxeter' . 6-22 . pad 22 . size 100,000 . split 30 70


[14:14:48] INFO: Word size 6  done | Time alloted 30.0000| Words Generated 6
[14:15:28] INFO: Word size 8  done | Time alloted 40.0000| Words Generated 6
[14:16:18] INFO: Word size 10 done | Time alloted 50.0000| Words Generated 42
[14:17:18] INFO: Word size 12 done | Time alloted 60.0000| Words Generated 96
[14:18:28] INFO: Word size 14 done | Time alloted 70.0000| Words Generated 366
[14:19:48] INFO: Word size 16 done | Time alloted 80.0000| Words Generated 1194
[14:21:18] INFO: Word size 18 done | Time alloted 90.0000| Words Generated 4378
[14:22:58] INFO: Word size 20 done | Time alloted 100.0000| Words Generated 14693
[14:23:10] INFO: Word size 22 done | Time alloted 12.2347571849823| Words Generated 29219


Training set size: 30000
Testing set size: 70000


# View Frequencies

In [4]:
# Read the trivial and non-trivial word files back from the generated dataset folder.
# (`os` is imported in the notebook's import cell.)
trivialWords = readDataset(os.path.join(dg.datasetPath, dg.trivialFile))
nontrivialWords = readDataset(os.path.join(dg.datasetPath, dg.nonTrivialFile))
print("Total Trivial Words:", len(trivialWords))
# Count both files instead of doubling the trivial count, so the total stays
# correct even if the two classes ever differ in size.
print("Total Words:", len(trivialWords) + len(nontrivialWords))

Total Trivial Words: 50000
Total Words: 100000


In [5]:
# Plot frequencies for the two word lists read in the previous cell, one call per class;
# wordType labels which class each call is for.
plotFrequencies(trivialWords, wordType="Trivial")
plotFrequencies(nontrivialWords, wordType="Non-Trivial")

# Testing 

In [None]:
def testAllotedTime(smallestWord=6, largestWord=30):
  for currWordSize in range(smallestWord, largestWord+2, 2):
    min_wordLen = currWordSize
    max_wordLen =  currWordSize
    fixed_wordLen = max_wordLen
    dg.datasetSize = 1000000  #6000 * 2
    dg.train_size = 0.3
    dg.setSizes(min_wordLen, max_wordLen, fixed_wordLen)

    # generate folder name for dataset using dataset features (updates folderPath)
    folderName = dg.generateFolderName()
    print(f"Unique folder name for dataset:\n{folderName}")
    # define directory path (defined via generation or manually)
    dg.makeDataset(userDatasetPath=folderName, random_state=1)

# Both bounds are 14, so this runs a single iteration for word length 14.
testAllotedTime(14,14)

In [4]:
#wordSizesList = [1194]
# Full list of word counts; its first seven values equal those of lengthSizeDict
# (word lengths 6-18) defined further down in this cell.
# NOTE: this assignment is dead - it is overwritten on the very next line.
wordSizesList = [6,6,42,96,366,1194, 4077, 12800]
# Active value: a single entry, read as index 0 by the
# tryMaxWordSize(18,18, wordSizesList) call at the end of this cell.
wordSizesList = [12800]
def tryMaxWordSize(smallestWord, largestWord, wordSizesList, train_size=0.5,
                   random_state=1, generator=None):
  """Build one single-length dataset per word length with a per-length size.

  Word lengths run from smallestWord up to largestWord (inclusive) in steps
  of 2. The i-th word length is paired with wordSizesList[i]; the generator's
  datasetSize is set to twice that value before the dataset is made.

  Args:
    smallestWord: first word length to run.
    largestWord: inclusive upper bound on the word length.
    wordSizesList: sequence with at least one entry per word length visited;
      surplus trailing entries are ignored.
    train_size: value assigned to the generator's train_size for each run.
    random_state: forwarded to makeDataset.
    generator: object to drive; when None, the notebook-level `dg` is used.

  Raises:
    IndexError: if wordSizesList has fewer entries than word lengths to run.
      Raised before anything is generated, rather than part-way through.

  Returns:
    None. The generator's datasetSize and train_size attributes are
    overwritten as a side effect.
  """
  gen = dg if generator is None else generator

  # "+ 1" keeps largestWord inclusive without overshooting it when the two
  # bounds have different parity (the earlier "+ 2" would also run largestWord + 1).
  wordLengths = range(smallestWord, largestWord + 1, 2)

  # Fail fast: a short list used to raise IndexError only after earlier
  # (possibly slow) datasets had already been generated.
  if len(wordSizesList) < len(wordLengths):
    raise IndexError(
      f"wordSizesList has {len(wordSizesList)} entries but "
      f"{len(wordLengths)} word lengths were requested"
    )

  for currWordSize, wordCount in zip(wordLengths, wordSizesList):
    gen.datasetSize = wordCount * 2
    gen.train_size = train_size
    gen.setSizes(currWordSize, currWordSize, currWordSize)

    # generate folder name for dataset using dataset features
    folderName = gen.generateFolderName()
    print(f"Unique folder name for dataset:\n{folderName}")
    gen.makeDataset(userDatasetPath=folderName, random_state=random_state)

# test function
# Reference table: word length -> word count. The entries for lengths 6-16 equal the
# "Words Generated" counts logged by the dataset-generation cell earlier in this notebook.
# NOTE: nothing in this cell reads the dict; the call below passes wordSizesList.
lengthSizeDict = {
  6: 6,
  8: 6,
  10: 42,
  12: 96,
  14: 366,
  16: 1194,
  18: 4077,
}
# time: (not recorded)
# Single run for word length 18, using wordSizesList[0] as the word count.
tryMaxWordSize(18,18, wordSizesList)

Unique folder name for dataset:
1 . A2_tilde . 'coxeter' . 18-18 . pad 18 . size 25,600 . split 50 50


[11:15:31] INFO: Word size 18 done | Time alloted 150.0000| Words Generated 4373


Training set size: 4373
Testing set size: 4373
