scrub code of 'variables'
The use of `variable` has been replaced everywhere with `feature`,
and `var` similarly with `feat`. This has one public-facing change,
which is that gang effect structures now have a `features` key instead
of a `vars` key. Update `Changes`.
garfieldnate committed May 17, 2014
1 parent d6973e7 commit 3b2323a
Showing 12 changed files with 102 additions and 122 deletions.
18 changes: 9 additions & 9 deletions AM.xs
@@ -164,10 +164,10 @@ typedef struct AM_guts {
AM_SHORT *lptr[4];
AM_SUPRA *sptr[4];

- /* array ref containing number of active variables in
+ /* array ref containing number of active features in
 * each lattice (currently we use four lattices)
*/
- SV **activeVar;
+ SV **active_feats;
/* array ref containing class labels for whole data set;
* array index is data item index in data set.
*/
@@ -359,7 +359,7 @@ _xs_initialize(...)
/* $self, the AM object */
project = (HV *) SvRV(ST(0));
/* For explanations on these, see the comments on AM_guts */
- guts.activeVar = AvARRAY((AV *) SvRV(ST(1)));
+ guts.active_feats = AvARRAY((AV *) SvRV(ST(1)));
guts.classes = AvARRAY((AV *) SvRV(ST(2)));
guts.itemcontextchain = AvARRAY((AV *) SvRV(ST(3)));
guts.itemcontextchainhead = (HV *) SvRV(ST(4));
@@ -379,7 +379,7 @@ _xs_initialize(...)
*/

for (i = 0; i < 4; ++i) {
- UV v = SvUVX(guts.activeVar[i]);
+ UV v = SvUVX(guts.active_feats[i]);
Newz(0, guts.lptr[i], 1 << v, AM_SHORT);
Newz(0, guts.sptr[i], 1 << (v + 1), AM_SUPRA); /* CHANGED */
Newz(0, guts.sptr[i][0].data, 2, AM_SHORT);
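[Editor's note: the two Newz calls above size each of the four sublattices: 1 << v entries for the lattice itself and 1 << (v + 1) slots for its supracontext array. A rough Perl sketch of that arithmetic, illustration only and not part of the commit; the (2, 3, 2, 3) split is what _compute_lattice_sizes in AM.pm returns for ten features:

    for my $v (2, 3, 2, 3) {            # e.g. the split for ten active features
        my $lattice_entries = 1 << $v;        # mirrors Newz(0, lptr[i], 1 << v, ...)
        my $supra_slots     = 1 << ($v + 1);  # mirrors Newz(0, sptr[i], 1 << (v + 1), ...)
        print "v=$v: $lattice_entries lattice entries, $supra_slots supra slots\n";
    }
]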
@@ -401,7 +401,7 @@ _fillandcount(...)
UV linear_flag;
AM_GUTS *guts;
MAGIC *mg;
- AM_SHORT activeVar[4];
+ AM_SHORT active_feats[4];
AM_SHORT **lptr;
AM_SUPRA **sptr;
AM_SHORT nptr[4];/* this helps us manage the free list in sptr[i] */
@@ -437,11 +437,11 @@ _fillandcount(...)
lptr = guts->lptr;
sptr = guts->sptr;
for (chunk = 0; chunk < 4; ++chunk) {
- activeVar[chunk] = (AM_SHORT) SvUVX(guts->activeVar[chunk]);
- Zero(lptr[chunk], 1 << activeVar[chunk], AM_SHORT);
+ active_feats[chunk] = (AM_SHORT) SvUVX(guts->active_feats[chunk]);
+ Zero(lptr[chunk], 1 << active_feats[chunk], AM_SHORT);
sptr[chunk][0].next = 0;
nptr[chunk] = 1;
- for (i = 1; i < 1 << (activeVar[chunk] + 1); ++i) /* CHANGED */
+ for (i = 1; i < 1 << (active_feats[chunk] + 1); ++i) /* CHANGED */
sptr[chunk][i].next = (AM_SHORT) i + 1;
}
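[Editor's note: the loop above re-chains every supracontext slot into a free list before counting starts, with nptr[chunk] marking the first free slot. A minimal Perl sketch of the same structure, using hypothetical hash-based slots for illustration only:

    my $v = 3;                                   # active features in this chunk
    my @supra = map { { next => 0 } } 0 .. (1 << ($v + 1)) - 1;
    $supra[$_]{next} = $_ + 1 for 1 .. $#supra;  # chain the free slots forward
    my $next_free = 1;                           # nptr: next slot to hand out
]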

@@ -476,7 +476,7 @@ _fillandcount(...)
AM_SHORT *contextptr = (AM_SHORT *) HeKEY(he);
AM_SHORT class = (AM_SHORT) SvUVX(HeVAL(he));
for (chunk = 0; chunk < 4; ++chunk, ++contextptr) {
- AM_SHORT active = activeVar[chunk];
+ AM_SHORT active = active_feats[chunk];
AM_SHORT *lattice = lptr[chunk];
AM_SUPRA *supralist = sptr[chunk];
AM_SHORT nextsupra = nptr[chunk];
1 change: 1 addition & 0 deletions Changes
@@ -2,6 +2,7 @@ Revision history for AM-Algorithm

{{$NEXT}}
test_in_data in Result changed to test_in_train
+ 'vars' in gang_effects return structure changed to 'features'
3.01 2014-05-15 21:18:29 Asia/Seoul
Gang and analogical set structures changed to contain Item objects
Unique id field added to Item
9 changes: 1 addition & 8 deletions TODO.txt
@@ -46,18 +46,11 @@ Add other bigint helper methods into BigInt and test them, as well.

Instead of passing around the "activeVars" variable and skipping nulls if needed, it might make more sense to create an array containing all of the active indices, i.e. [0,1,2,4,6,7] if 3 and 5 are null and exclude nulls is on. This would make it easier to simplify things via `map` or whatever.
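[Editor's note: a minimal sketch of the index array that note proposes, illustration only; it assumes $test_feats holds a test item's features with nulls encoded as '' and $exclude_nulls is the boolean option:

    my @active_indices = $exclude_nulls
        ? grep { $test_feats->[$_] ne '' } 0 .. $#$test_feats
        : (0 .. $#$test_feats);
    # e.g. (0, 1, 2, 4, 6, 7) when features 3 and 5 are null
]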

- Scrub the code for consistent naming. "Data" should be given, or training, or exemplar. Everything is data, so why call anything "data"?
- - class (X)
- - exemplar (X)
- - spec (X)
- - data (X)
- - variables ()
-
do something better than just calling rand() for the probability/skip function.

Move the XS code to a different package so all of those variables can be stored in something private: pointers, itemcontextchainhead, etc. The work done on gangs could also be put into this package, since it requires special knowledge of the underlying structure. Maybe the Guts package?

- change activeVars to be accepted at classification time
+ change active_features to be accepted at classification time

properly destroy project or AM object on error so that illegal state is not possible even if someone catches an error and tries to continue.

72 changes: 36 additions & 36 deletions lib/Algorithm/AM.pm
@@ -55,9 +55,9 @@ sub _initialize {
my ($self) = @_;

my $train = $self->training_set;
- # compute activeVars here so that lattice space can be allocated in the
+ # compute active_feats here so that lattice space can be allocated in the
# _initialize method
- $self->{activeVars} = _compute_lattice_sizes($train->cardinality);
+ $self->{active_feats} = _compute_lattice_sizes($train->cardinality);

# sum is intitialized to a list of zeros
@{$self->{sum}} = (0.0) x ($train->num_classes + 1);
@@ -81,7 +81,7 @@ sub _initialize {
# must not be increasing the reference count
$self->{save_this} = $train->_data_classes;
$self->_xs_initialize(
- $self->{activeVars},
+ $self->{active_feats},
$self->{save_this},
$self->{itemcontextchain},
$self->{itemcontextchainhead},
@@ -104,28 +104,28 @@ sub classify {
$test_item->cardinality . ')';
}

- # num_variables is the number of active variables; if we
- # exclude nulls, then we need to subtract the number of '=' found in
- # this test item; otherwise, it's just the number of columns in a
- # single item vector
- my $num_variables = $training_set->cardinality;
+ # num_feats is the number of features to be used in classification;
+ # if we exclude nulls, then we need to subtract the number of '='
+ # found in this test item; otherwise, it's just the number of
+ # columns in a single item vector
+ my $num_feats = $training_set->cardinality;

if($self->exclude_nulls){
- $num_variables -= grep {$_ eq ''} @{
+ $num_feats -= grep {$_ eq ''} @{
$test_item->features };
}

- # recalculate the lattice sizes with new number of active variables;
- # must edit activeVars instead of assigning it a new arrayref because
+ # recalculate the lattice sizes with new number of active features;
+ # must edit active_feats instead of assigning it a new arrayref because
# the XS code only has the existing arrayref and will not be given
- # a new one. This must be done for every test item because activeVars
+ # a new one. This must be done for every test item because active_feats
# is a global that could have been edited during classification of the
# last test item.
- # TODO: pass activeVars into fill_and_count instead of doing this
+ # TODO: pass active_feats into fill_and_count instead of doing this
{
- my $lattice_sizes = _compute_lattice_sizes($num_variables);
+ my $lattice_sizes = _compute_lattice_sizes($num_feats);
for(0 .. $#$lattice_sizes){
- $self->{activeVars}->[$_] = $lattice_sizes->[$_];
+ $self->{active_feats}->[$_] = $lattice_sizes->[$_];
}
}
## $activeContexts = 1 << $activeVar;
@@ -159,7 +159,7 @@ sub classify {
my $context = _context_label(
# Note: this must be copied to prevent infinite loop;
# see todo note for _context_label
- [@{$self->{activeVars}}],
+ [@{$self->{active_feats}}],
$training_set->get_item($index)->features,
$test_item->features,
$self->exclude_nulls
@@ -198,7 +198,7 @@ sub classify {
# info.
my $result = Algorithm::AM::Result->new(
given_excluded => $given_excluded,
- cardinality => $num_variables,
+ cardinality => $num_feats,
exclude_nulls => $self->exclude_nulls,
count_method => $self->linear ? 'linear' : 'squared',
training_set => $training_set,
@@ -231,28 +231,28 @@ sub classify {
$self->{itemcontextchain},
$self->{context_to_class},
$self->{gang},
- $self->{activeVars},
+ $self->{active_feats},
$self->{contextsize}
);
return $result;
}

- # since we split the lattice in four, we have to decide which variables
- # go where. Given the number of variables being used, return an arrayref
- # containing the number of variables to be used in each of the four
+ # since we split the lattice in four, we have to decide which features
+ # go where. Given the number of features being used, return an arrayref
+ # containing the number of features to be used in each of the four
# lattices.
sub _compute_lattice_sizes {
my ($num_feats) = @_;

use integer;
- my @active_vars;
+ my @active_feats;
my $half = $num_feats / 2;
- $active_vars[0] = $half / 2;
- $active_vars[1] = $half - $active_vars[0];
+ $active_feats[0] = $half / 2;
+ $active_feats[1] = $half - $active_feats[0];
$half = $num_feats - $half;
- $active_vars[2] = $half / 2;
- $active_vars[3] = $half - $active_vars[2];
- return \@active_vars;
+ $active_feats[2] = $half / 2;
+ $active_feats[3] = $half - $active_feats[2];
+ return \@active_feats;
}
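[Editor's note: because use integer is in force, each division above truncates. A quick worked example of the split, illustration only and not part of the diff:

    my $sizes = _compute_lattice_sizes(10);
    # $half = 5; 5/2 = 2 and 5-2 = 3; then $half = 10-5 = 5 again: 2 and 3
    # so $sizes is [2, 3, 2, 3], and the four sizes always sum to 10
]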

# Create binary context labels for a training item
@@ -262,28 +262,28 @@ sub _compute_lattice_sizes {
# single scalar representing an array of 4 shorts (this
# format is used in the XS side).

- # TODO: we have to copy activeVars out of $self in order to
+ # TODO: we have to copy active_feats out of $self in order to
# iterate it. Otherwise it goes on forever. Why?
sub _context_label {
# inputs:
- # number of active variables in each lattice,
+ # number of active features in each lattice,
# training item features, test item features,
# and boolean indicating if nulls should be excluded
- my ($active_vars, $train_feats, $test_feats, $skip_nulls) = @_;
+ my ($active_feats, $train_feats, $test_feats, $skip_nulls) = @_;

- # variable index
+ # feature index
my $index = 0;
# the binary context labels for each separate lattice
my @context_list = ();

- for my $a (@$active_vars) {
+ for my $a (@$active_feats) {
# binary context label for a single sublattice
my $context = 0;
- # loop through all variables in the sublattice
- # assign 0 if variables match, 1 if they do not
+ # loop through all features in the sublattice
+ # assign 0 if features match, 1 if they do not
for ( ; $a ; --$a ) {

- # skip null variables if indicated
+ # skip null features if indicated
if($skip_nulls){
++$index while $test_feats->[$index] eq '';
}
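[Editor's note: a worked illustration of the labeling this sub performs, assuming, per the comments above, one bit per feature with 0 for a match and 1 for a mismatch; the exact bit order is not visible in this hunk:

    # one sublattice covering three features
    # train features (a b c) against test features (a b d):
    # a/a -> 0, b/b -> 0, c/d -> 1, so the sublattice label is binary 001
]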
@@ -423,7 +423,7 @@ while the parsing, printing, and statistical routines remained in C;
this was accomplished by embedding a Perl interpreter into the C code.
In 2004, the algorithm was again rewritten, this time in order to
- handle more variables and large data sets. The algorithm breaks the
+ handle more features and large data sets. The algorithm breaks the
supracontextual lattice into the direct product of four smaller ones,
which the algorithm manipulates individually before recombining.
These lattices can be manipulated in parallel when using the right
15 changes: 7 additions & 8 deletions lib/Algorithm/AM/DataSet.pm
@@ -49,10 +49,9 @@ sub new {
return $self;
}

- # check the project path and the options for validity
- # Return an option hash to initialize $self with, containing the
- # project path object, number of variables, and field_sep and var_sep,
- # which are used to parse lines of data
+ # check the options for validity
+ # Return an option hash to initialize $self with
+ # For now only 'cardinality' is allowed/required.
sub _check_opts {
my (%opts) = @_;

@@ -142,7 +141,7 @@ sub add_item {

if($self->cardinality != $item->cardinality){
croak 'Expected ' . $self->cardinality .
- ' variables, but found ' . (scalar $item->cardinality) .
+ ' features, but found ' . (scalar $item->cardinality) .
' in ' . (join ' ', @{$item->features}) .
' (' . $item->comment . ')';
}
@@ -266,12 +265,12 @@ sub dataset_from_file {## no critic (RequireArgUnpacking)
if($format eq 'commas'){
# class/features/comment separated by a comma
$field_sep = qr{\s*,\s*};
- # variables separated by space
+ # features separated by space
$feature_sep = qr{\s+};
}elsif($format eq 'nocommas'){
# class/features/comment separated by space
$field_sep = qr{\s+};
- # no seps for variables; each is a single character
+ # no seps for features; each is a single character
$feature_sep = qr{};
}else{
croak "Unknown value $format for format parameter " .
@@ -330,7 +329,7 @@ sub _read_data_sub {
}

my @data_vars = split /$feature_sep/, $feats;
- # set unknown variables to ''
+ # set null features to ''
@data_vars = map {$_ eq $null ? '' : $_} @data_vars;

return Algorithm::AM::DataSet::Item->new(
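[Editor's note: a quick illustration of the null mapping above, assuming '=' as the null marker, the character the classify comments earlier in this commit say is excluded:

    my $null = '=';
    my @data_vars = qw(a = b);
    @data_vars = map { $_ eq $null ? '' : $_ } @data_vars;   # ('a', '', 'b')
]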
