-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Closed
Labels
Description
by serge.hulne:
Here is the exact same program written in Python, in D and in Go. It is used to sort a relatively large text file which is made by concatenating Shakespeare plays (downloaded from Gutenberg.com). The program extracts words from said text using a trivial tokenizer (the same algorithm in all three cases), stores them in a hash map to keep only one exemplar of each word and increments a counter every time a word which is already in said map is detected again, hence counting the frequency of occurrence of words in the text. The pairs (frequency, word) are stored in an array which is sorted according to frequency in order to display a list of the words appearing most frequently in the text. The problem ----------- The Go version uses about 90 megabytes of RAM whereas the two other versions use about 8 megabytes of RAM to do the exact same job. So, basically Go seems to use up 10 times more RAM than Python (or D) for that kind of task. Here are the sources: --------------------- 1) Python: ---------- #!/usr/bin/env python def read_lines(fname): words_count = {} f = file (fname, "r") l_cnt = 0 w_cnt = 0 words_array = [] char_cpt = 0 inword = False for l in f: l_cnt+=1 for c in l: char_cpt += 1 if not c.isspace(): if inword == False: buf = "" buf += c inword = True w_cnt +=1 #end if else: buf += c #end else else: if inword ==True: #print "buf = %s" % buf if not buf in words_count: words_count[buf]=0 else: words_count[buf]+=1 inword = False buf = "" #end if #end else #end for c #end for l for key in words_count: words_array.append((words_count[key], key)) words_array.sort(reverse=True) for item in words_array: print "(%s,%s)" % (item[0], item[1]) print "lines= %d" % l_cnt print "words = %d" % w_cnt if __name__ == "__main__": read_lines("../../shakespeare.txt") Go language: ----------- package main import ( "fmt" "os" "bufio" "unicode" "sort" ) //--- type int_word_array []int_word // Methods required by sort.Interface to sort structures of the type int_word. 
func (s int_word_array) Len() int { return len(s) } func (s int_word_array) Less(i, j int) bool { return s[i].cpt > s[j].cpt } //(reverse sort) func (s int_word_array) Swap(i, j int) { s[i], s[j] = s[j], s[i] } type int_word struct { cpt int word string } //--- func main() int { words_map := map[string]int{} l_cnt := 0 w_cnt := 0 cpt_chars := 0 inword := false buf := "" f, err := os.Open("../shakespeare.txt", os.O_RDONLY, 0666) //f, err := os.Open("hamlet.txt", os.O_RDONLY, 0666) if err != nil { fmt.Printf("\nError => %s\n\n", err) os.Exit(1) } reader := bufio.NewReader(f) //Buffered reader for { c, _ ,err := reader.ReadRune() cpt_chars++ if err != os.EOF && err == nil { if c == '\n' { l_cnt++ } if unicode.IsSpace(c) == false { if inword == false { buf = "" buf += string(c) inword = true w_cnt++ } else { buf += string(c) } } else if inword == true { if _, ok := words_map[buf]; ok { words_map[buf]++ } else { words_map[buf] = 1 } //fmt.Printf("buf = (%s)\n", buf) inword = false buf = "" } } else { //EOF detected if err == os.EOF { break } } //end if (err=nil) } // end for (main loop) //--- var words_map_size int = len(words_map) var int_words int_word_array int_words = make(int_word_array, words_map_size) var iw int_word int_words_index := 0 for word, cpt := range words_map {å //fmt.Printf("%d =\t\t%s\n", cpt, word) iw.cpt = cpt iw.word = word int_words[int_words_index] = iw int_words_index++ } sort.Sort(int_words) for _, item := range int_words[0:100] { fmt.Printf("(%d,%s)\n", item.cpt, item.word) } //--- fmt.Printf("\nlines = %d, words = %d, chars = %d\n", l_cnt, w_cnt, cpt_chars) return 0 } -------------- Serge Hulne