Sorry, not an issue. I've commented distance.c so that everyone can see how it works. Enjoy, and feel free to correct/improve it! #5

GoogleCodeExporter · 2015-09-28T17:29:30Z

If, after making needed corrections, this could be added to the source code, I 
think future users would appreciate this. Thanks. --Gregg Williams

--- begin distance.c ---
//  Copyright 2013 Google Inc. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>

const long long max_size = 2000;         // max length of strings
const long long N = 40;                  // number of closest words that will 
be shown
const long long max_w = 50;              // max length of vocabulary entries

int main(int argc, char **argv) {
  FILE *f;
  char st1[max_size];
  char bestw[N][max_size];
  char file_name[max_size], st[100][max_size];
  float dist, len, bestd[N], vec[max_size];
  long long words, size, a, b, c, d, cn, bi[100];

  char ch;
  float *M;
  char *vocab;
  if (argc < 2) {
    printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
    return 0;
  }
  strcpy(file_name, argv[1]);
  f = fopen(file_name, "rb");
  if (f == NULL) {
    printf("Input file not found\n");
    return -1;
  }
  // words = number of words in file
  fscanf(f, "%lld", &words);

  // size = number of floating-point values associated with each word in the "dictionary"
  fscanf(f, "%lld", &size);

  // vocab points to a list of all the words in the "dictionary". Words are stored in fixed-width substrings;
  //    each word is allotted max_w bytes.
  vocab = (char *)malloc((long long)words * max_w * sizeof(char));

  // SUMMARY: M contains 'size' (an integer) floats for each word in the "dictionary".
  // M points to a vector of (words * size) floats, stored linearly. Floats 0 through (size - 1) correspond
  //     to word 0, floats size through (2 * size - 1) correspond to word 1, etc.
  M = (float *)malloc((long long)words * (long long)size * sizeof(float));
  if (M == NULL) {
    printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
    return -1;
  }

  for (b = 0; b < words; b++) {

    // Reads one entry from input file f, which corresponds to one word of the "dictionary" of words contained in f;
    // this word is stored in the specific substring of vocab reserved for it.
    // UNSURE WHAT PURPOSE SERVED by the char pointed to by &ch--maybe a NUL char denoting end-of-string?
    fscanf(f, "%s%c", &vocab[b * max_w], &ch);

    // Reads 'size' floats, corresponding to the word with index b, into array M.
    for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);

    // len = sqrt (sum of [each entry in M] ** 2 ) -- a normalizing factor
    len = 0;
    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
    len = sqrt(len);

    // Each entry in M is normalized by a factor of 'len'.
    for (a = 0; a < size; a++) M[a + b * size] /= len;
  }
  fclose(f);

  // **********************************************
  // ****  beginning of user-interaction loop  ****
  // **********************************************
  while (1) {
    for (a = 0; a < N; a++) bestd[a] = 0;
    for (a = 0; a < N; a++) bestw[a][0] = 0;
    printf("Enter word or sentence (EXIT to break): ");

    // st1 contains a input text from stdin (usually the console)
    a = 0;
    while (1) {
      st1[a] = fgetc(stdin);
      if ((st1[a] == '\n') || (a >= max_size - 1)) {
        st1[a] = 0;
        break;
      }
      a++;
    }

    // End program loop if input text = "EXIT".
    if (!strcmp(st1, "EXIT")) break;

    // st[0] is a zero-terminated character array representing the input text.
    cn = 0;
    b = 0;
    c = 0;
    while (1) {
      st[cn][b] = st1[c];
      b++;
      c++;
      st[cn][b] = 0;
      if (st1[c] == 0) break;
      if (st1[c] == ' ') {
        cn++;
        b = 0;
        c++;
      }
    }
    cn++;
    // cn = number of words (separated by a space) in the input text
    // st = an array of strings: st[0][] is the first word of the input text; st[1][] is the second word, etc.

    // This loop either finds each word within the input text in the 'vocab' string, or it signals 
    //     b = -1 if at least one word is not found. If a word is found, b is the index to it in 'vocab'.
    // bi[0] = the index of the first word in the input text; bi[1] = the index of the second word, etc.;
    //     bi[k] = -1 signals no more words--i.e., there are k words in the input text.
    // For each word in the input text, the word and its position in the "dictionary" is printed.
    for (a = 0; a < cn; a++) {
      // 
      for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
      if (b == words) b = -1;
      bi[a] = b;
      printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
      if (b == -1) {
        printf("Out of dictionary word!\n");
        break;
      }
    }

    // Ff input text is not found, restart user-interaction loop.
    if (b == -1) continue;

    // Reminder:
    //     st points to the words in the input text (there are cn of them)
    //     bi[k] is the index of word st[k] within the 'vocab' string
    //     M gives the precalculated "cosine distance" value for the word at st[k][].

    // The code below finds and prints the N "closest" words to the input and their 
    //     "similarity" values (which are always < 1.0)--larger values are "closer".
    printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
    // vec contains the 'size' floating-point values associated with the input text.
    // NOTE: if the input text contains multiple words, the value of each element in vec is 
    //       the SUM of the corresponding float values for each of the words in the input text.
    for (a = 0; a < size; a++) vec[a] = 0;
    for (b = 0; b < cn; b++) {
      if (bi[b] == -1) continue;
      // vec contains the 'size' vectors associated with the bi[b]-th word in the "dictionary".
      for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
    }

    // len = sqrt (sum of the squares of each vector element within vec)
    // Each element in vec is normalized by dividing it by 'len'.
    len = 0;
    for (a = 0; a < size; a++) len += vec[a] * vec[a];
    len = sqrt(len);
    for (a = 0; a < size; a++) vec[a] /= len;

    // Arrays bestd and bestw are associated with the list of the N words that are "closest"
    //     to the word(s) in the input text
    // For an index i, bestw[i][] points to the word in that slot,
    //                 bestd[i]   points to the word's "distance" value.
    for (a = 0; a < N; a++) bestd[a] = 0;
    for (a = 0; a < N; a++) bestw[a][0] = 0;

    // For each word in "dictionary"....    (in loop, c is the index of the word being tested)
    for (c = 0; c < words; c++) {
        // a is set to 1 if any of the words in the input text is the word being tested.
      a = 0;
      for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
      if (a == 1) continue;
      // The following executes only if the word being tested is NOT in the input text.
      dist = 0;
      // dist = sum of the 'size' the float values associated with the word being tested
      for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
      // for each of the N slots that will eventually hold the N "closest" words...
      for (a = 0; a < N; a++) {
          // if the "distance" of word c is greater than the "distance" of the current slot (slot a),
          //     move all the bestd and bestw entries one entry closer to the end of the list (losing
          //     the "worst" entry) and insert the bestd and bestw entries for the current word (c)
          //     into the current slot (a).
        if (dist > bestd[a]) {
          for (d = N - 1; d > a; d--) {
            bestd[d] = bestd[d - 1];
            strcpy(bestw[d], bestw[d - 1]);
          }
          bestd[a] = dist;
          strcpy(bestw[a], &vocab[c * max_w]);
          break;
        }
      }
    }
    // From "best" to "worst", print each word and its "distance" value.
    for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
  }
  return 0;
}
---  end distance.c  ---

Original issue reported on code.google.com by GreggInCA@gmail.com on 22 Aug 2013 at 6:40

The text was updated successfully, but these errors were encountered:

GoogleCodeExporter added Priority-Medium Type-Defect auto-migrated labels Sep 28, 2015

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Sorry, not an issue. I've commented distance.c so that everyone can see how it works. Enjoy, and feel free to correct/improve it! #5

Sorry, not an issue. I've commented distance.c so that everyone can see how it works. Enjoy, and feel free to correct/improve it! #5

GoogleCodeExporter commented Sep 28, 2015

Sorry, not an issue. I've commented distance.c so that everyone can see how it works. Enjoy, and feel free to correct/improve it! #5

Sorry, not an issue. I've commented distance.c so that everyone can see how it works. Enjoy, and feel free to correct/improve it! #5

Comments

GoogleCodeExporter commented Sep 28, 2015