done with phase 1

chriskelvinlee · Dec 7, 2011 · 1e5dc44 · 1e5dc44
1 parent ce917e6
commit 1e5dc44
Show file tree

Hide file tree

Showing 13 changed files with 256 additions and 98 deletions.
diff --git a/determine.py b/determine.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+determine.py
+
+Created by Christopher K. Lee on 2011-12-07.
+Copyright (c) 2011 __MyCompanyName__. All rights reserved.
+"""
+import time
+
+def determineAnswer(results, choices, correct):
+    """Pick an answer by majority vote over the score dicts and time the work.
+
+    results -- iterable of dicts mapping a key to a numeric score; empty
+               dicts are skipped.  (Presumably one dict per scoring
+               function, keyed by answer choice -- verify against caller.)
+    choices -- list of candidate answers; a key only earns a vote when it
+               equals one of these.
+    correct -- index into choices of the expected answer.
+
+    Returns [[flag, answer_index], confidence, de_time]:
+      flag         -- 1 if answer_index == correct, else 0
+      answer_index -- index of the choice with the most votes (the first
+                      such index on a tie, per list.index)
+      confidence   -- running sum of (winning value / in-between value),
+                      accumulated over every non-empty dict (see below)
+      de_time      -- wall-clock seconds spent in this function
+
+    Side effect: whenever a dict's minimum value is non-zero, its values are
+    overwritten in place with value/minimum, so the caller's dicts change.
+    """
+    determine_start_time = time.time()
+    # One vote counter per choice.
+    # NOTE(review): fixed at three slots -- a vote for choices[3] or beyond
+    # would raise IndexError; confirm callers always pass three choices.
+    points = [0, 0, 0]
+    confidence = 0
+
+    # Loop through all score dict results
+    for r in results:
+        # Score set cannot be empty
+        if ( r != {}): 
+
+            # Max and min values  
+            highestChoice = ""
+            a1 = min(r.values())
+            z1 = max(r.values())
+
+            # Iterate through dict: vote for every key holding the max value
+            # (ties each get a vote), then normalize that entry by the min.
+            for key, val in r.iteritems():
+                if val == z1:                    # If value is max
+                    for c in choices:            
+                        if key == c:            # Max key is choice
+                            points[choices.index(c)] += 1      # Add point value  
+                            highestChoice = key          
+                t = float(val)
+                # A zero minimum would divide by zero, so such dicts are left
+                # un-normalized (values keep their original type).
+                if a1 != 0:
+                    r[key] = t/a1                    # Normalize             
+
+            # Min and max of the (possibly normalized) values
+            a2 = min(r.values())
+            z2 = max(r.values())
+
+            # Multiplier of 1st vs 2nd: only values that are neither the min
+            # nor the max contribute, so a dict with fewer than three
+            # distinct values adds nothing to confidence.
+            # NOTE(review): if no max-valued key matched a choice,
+            # highestChoice is still "" and r[highestChoice] raises KeyError
+            # (unless "" happens to be a key).
+            # NOTE(review): when a1 == 0 the values were not converted to
+            # float; if they are ints, `/` truncates under Python 2.
+            for val in r.itervalues():
+                if val != a2 and val != z2:
+                    confidence += r[highestChoice]/val
+
+    determine_stop_time = time.time()
+    de_time = (determine_stop_time - determine_start_time)
+
+    # Report 1/0 for correct/incorrect together with the chosen index,
+    # the confidence and the elapsed time
+    answer_index = points.index(max(points))
+    if answer_index == correct:
+        return [[1, answer_index], confidence, de_time]
+    else:
+        return [[0, answer_index], confidence, de_time]
diff --git a/old/starter_code.py b/old/starter_code.py
@@ -478,7 +478,7 @@ def getSynonyms(keyword):
 
 """
 
-compute_score(queryphrase="Whose favorite place to swim is in his money bin", answers=["Scrooge McDuck", "Richie Rich", "Ebenezer Scrooge"], output="results/query3.txt")
+#compute_score(queryphrase="Whose favorite place to swim is in his money bin", answers=["Scrooge McDuck", "Richie Rich", "Ebenezer Scrooge"], output="results/query3.txt")
 
 """Results, with answer choices broken into fragments, with scores of 1 for fragment, 10 for whole answer, times 2 if near keywords:
 ['Whose', 'bin', 'favorite'] [WANT TO INCLUDE PLACE, SWIM]

diff --git a/output.py b/output.py
@@ -7,23 +7,37 @@
 Copyright (c) 2011 __MyCompanyName__. All rights reserved.
 """
 
+def output(text, choices, correct, nltk_data, results, answer, conf,
+            nltk_time, ai_time, de_time, currentCount, outputSet):
 
-    f = open('query{ }.txt'.format(QUERYNUM), 'w')
+    url         = nltk_time[0]
+    keyword_q   = nltk_time[1]
+    keyword_a   = nltk_time[2]
+    token_q     = nltk_time[3]
+    token_url   = nltk_time[4]
+    instances   = nltk_time[5]
+    score1      = ai_time[0]
+    score2      = ai_time[1]
+    score3      = ai_time[2]
+    score4      = ai_time[3]
+    decide      = de_time
+    nltk        = keyword_q+keyword_a+token_q+token_url
+    ai          = url+instances+score1+score2+score3+score4+decide
+    total       = nltk+ai
+
+
 
     # write to file
-    f = open(output, 'w')
-    print >>f, queryphrase
-    print >>f, keywords
-    print >>f, weightedquestionkeywords        
-    print >>f, weightedanswerkeywords
-    print >>f, score_result1
-    print >>f, score_result2
-    print >>f, score_result3
-    print >>f, score_result4
+    f = open('results/query{}.txt'.format(currentCount), 'w')
+    print >>f, text
+    print >>f, choices
+    print >>f, correct
+    print >>f, "Answer:\t\t%d"          % answer[1]
+    print >>f, "Confidence:\t%d"        % conf
     print >>f, '\n#####'
     print >>f, "Total Time:\t\t%f"       % total
-    print >>f, "NLTK Time:\t\t%f"        % (keyword_q+keyword_a+token_q+token_url)
-    print >>f, "AI Time:\t\t%f"          % (url+instances+score1+score2+score3+score4)
+    print >>f, "NLTK Time:\t\t%f"        % nltk
+    print >>f, "AI Time:\t\t%f"          % ai
     print >>f, "URL Time:\t\t%f"         % url
     print >>f, "Keyword (Q) Time:\t%f"   % keyword_q
     print >>f, "Keyword (A) Time:\t%f"   % keyword_a
@@ -34,5 +48,15 @@
     print >>f, "Score2 Time:\t\t%f"   % score2
     print >>f, "Score3 Time:\t\t%f"   % score3
     print >>f, "Score4 Time:\t\t%f"   % score4
-    print >>f, '#####'
+    print >>f, '#####\n'
+    print >>f, "Keywords (Q):\t\t%s"    % str(nltk_data[0]) 
+    print >>f, "Keywords (Qw):\t\t%s"   % str(nltk_data[1])
+    print >>f, "Keywords (Aw):\t\t%s"   % str(nltk_data[2]) 
+    print >>f, "Tokens (Query):\t%s"   % str(nltk_data[3]) 
+    print >>f, "Tokens (Combo):\t%s"   % str(nltk_data[4])
+    print >>f, "score_result1:\t%s"   % str(results[0]) 
+    print >>f, "score_result2:\t%s"   % str(results[1])
+    print >>f, "score_result3:\t%s"   % str(results[2])
+    print >>f, "score_result4:\t%s"   % str(results[3])
+
     f.close()
diff --git a/questions.py b/questions.py
@@ -5,3 +5,17 @@
 ["What is each member of a winning Super Bowl team given", ["a bronzed jersey", "a green jacket", "a ring"], 2],
 ["How much time does it take for the space shuttle to get into space", ["about 10 minutes", "4 hours", "2 days"], 0],
 ]
+
+
+"""debug code for interpreter
+queryphrase="What is each member of a winning Super Bowl team given"
+answers=["a bronzed jersey", "a green jacket", "a ring"]
+urls = getGoogleLinks(queryphrase, 1)
+keywords = getSimpleQuestionKeywords(queryphrase)
+weightedquestionkeywords = getWeightedQuestionKeywords(queryphrase)
+weightedanswerkeywords = getAnswerKeywords(answers)
+querytokens = nltk.word_tokenize(queryphrase)
+combinedtokens = getTokens(urls)
+instances = getInstances(keywords, combinedtokens)
+
+"""
diff --git a/results/prequery1.txt b/results/prequery1.txt
@@ -0,0 +1,23 @@
+Which of these describes the tail of a healthy platypus
+['platypus', 'tail', 'describes', 'healthy']
+{'healthy': (66, 'JJ'), 'tail': (24, 'NN'), 'describes': (26, 'NNS'), 'platypus': (0, 'NN')}
+{'short': (619, 'JJ'), 'long': (1285, 'RB'), 'fat': (80, 'JJ'), 'strong': (569, 'JJ'), 'squishy': (0, 'JJ'), 'pinkish': (0, 'NN')}
+{}
+{'fat and strong': 31, 'short and pinkish': 21, 'long and squishy': 61}
+{'fat and strong': 87436, 'short and pinkish': 43, 'long and squishy': 41256}
+{'fat and strong': 35894, 'short and pinkish': 0, 'long and squishy': 0}
+#####
+Total Time:		94.278013
+NLTK Time:		92.599376
+AI Time:		1.678637
+URL Time:		0.658323
+Keyword (Q) Time:	26.682972
+Keyword (A) Time:	21.475645
+Tokens (Q) Time:	0.000251
+Tokens (URL) Time:	44.440508
+Map Instance Time:	0.023398
+Score1 Time:		0.128347
+Score2 Time:		0.101162
+Score3 Time:		0.341729
+Score4 Time:		0.425678
+#####
diff --git a/results/prequery2.txt b/results/prequery2.txt
@@ -0,0 +1,24 @@
+Which sport do players use a stick to cradle the ball
+['sport', 'players', 'stick', 'ball']
+{'players': (33, 'NNS'), 'sport': (20, 'NN'), 'stick': (59, 'NN'), 'ball': (109, 'NN')}
+{'lacrosse': (0, 'NN'), 'field': (432, 'NN'), 'hockey': (1, 'NN'), 'ice': (56, 'NN')}
+{'lacrosse': 156, 'field hockey': 19}
+{'lacrosse': 156, 'ice hockey': 24, 'field hockey': 114}
+{'lacrosse': 276910, 'ice hockey': 5484, 'field hockey': 71631}
+{'lacrosse': 105350, 'ice hockey': 184708, 'field hockey': 210210}
+
+#####
+Total Time:		71.615634
+NLTK Time:		70.140234
+AI Time:		1.475400
+URL Time:		0.645202
+Keyword (Q) Time:	29.591298
+Keyword (A) Time:	26.574528
+Tokens (Q) Time:	0.000405
+Tokens (URL) Time:	13.974003
+Map Instance Time:	0.007040
+Score1 Time:		0.032136
+Score2 Time:		0.019258
+Score3 Time:		0.151532
+Score4 Time:		0.620232
+#####
diff --git a/results/query3.txt → results/prequery3.txt b/results/query3.txt → results/prequery3.txt
diff --git a/results/query1.txt b/results/query1.txt
diff --git a/results/query2.txt b/results/query2.txt
diff --git a/scoring.py b/scoring.py
@@ -7,19 +7,18 @@
 
 
 
-keywords, weightedquestionkeywords, weightedanswerkeywords, querytokens, combinedtokens, instances
-
-def score(answers, nd, scoringFunction):
+def score(choices, nd, scoringFunction):
+    # Unpack the NLTK data list `nd` and pass the pieces to scoringFunction,
+    # returning whatever it returns.
+    # Indices used: 0 keywords, 1 weighted question keywords,
+    # 2 weighted answer keywords, 4 combined tokens, 5 instances
+    # (index 3 is not read here).
+    answers                     = choices
     keywords                    = nd[0]
     combinedtokens              = nd[4]
     instances                   = nd[5]
-    weightedquestionkeywords    = nd[2]
-    weightedanswerkeywords      = nd[1]
-    scoringFunction(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords)
+    weightedquestionkeywords    = nd[1]
+    weightedanswerkeywords      = nd[2]
+    return scoringFunction(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords)
 
 
 # Use all scores
-def useAllWeights(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords):
+def useAllScores(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords):
     time1_start_time = time.time()
     weights1 = getSimpleAnswerPhraseScores(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords)
     time1_stop_time = time.time()
@@ -29,7 +28,7 @@ def useAllWeights(answers, keywords, combinedtokens, instances, weightedquestion
     time2_stop_time = time.time()   
 
     time3_start_time = time.time()
-    weights3 = getWeightedQuestionKeywordScores(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords)
+    weights3 =  getWeightedQuestionKeywordScores(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords)
     time3_stop_time = time.time()  
 
     time4_start_time = time.time()       
@@ -108,7 +107,7 @@ def getSimpleAnswerKeywordScores(answers, keywords, combinedtokens, instances, w
 #########
 
 def getWeightedQuestionKeywordScores(answers, keywords, combinedtokens, instances, weightedquestionkeywords, weightedanswerkeywords):
-    rangevalue = 50
+    rangevalue = getRangeValue()
     scores = {}
     tokenrange = findrange(len(combinedtokens))
     for answer in answers:
@@ -147,7 +146,7 @@ def getWeightedQuestionKeywordScores(answers, keywords, combinedtokens, instance
     return scores
 
 #########
-## Score 3 ##   
+## Score 4 ##   
 # use function below to score, using question keywords and answer keywords
 #
 #########

diff --git a/test.py b/test.py
@@ -2,52 +2,53 @@
 from trivialpursuitfunctions import *
 from scoring import *
 from output import *
-from decide import *
+from determine import *
 
 #What other options should test take in?
-def test( questions = tp_Questions, scoringFunction = useAllWeights ):
-    numberCorrect = 0;
+def runQuery( questions = tp_Questions, scoringFunction = useAllScores, outputSet = 1 ):
+    """Run each question through parse -> score -> determine and report.
+
+    questions       -- sequence of [text, choices, correct_index] entries
+    scoringFunction -- passed through to score()
+    outputSet       -- forwarded unchanged to output(); not used here
+
+    For every question a report is written via output() and a
+    "<text>: Correct|Incorrect" line is printed; the final line printed
+    is "<numberCorrect>/<number of questions>".  Nothing is returned.
+    """
+    numberCorrect = 0
+    currentCount = 0
     for question in questions:
+        # 1-based position of this question; also passed to output()
+        currentCount += 1
+        print "*****"
+        print "Processing Query %d of %d " % (currentCount, len(questions))
+
         # Read in question, choices, and correct answer
         text = question[0]
         choices = question[1]
         correct = question[2]
 
+        print "Processing NLTK..."
         # Parse urls, questions, answers and generate keywords
-        raw = NLTK_parse( queryphrase=text, answers=choices )
+        raw = NLTK_parse(queryphrase=text, answers=choices)
         nltk_data = raw[0]          # array of size 6
         nltk_time = raw[1]          # array of size 6
-
+
+        print "AI & Scoring..."
         # Get answer weight with scoring function(s)
-        weights = score(answers, nltk_data, scoringFunction)
-        candidate = weights[0]      # array of size 4
+        weights = score(choices, nltk_data, scoringFunction)
+        results = weights[0]      # array of size 4
         ai_time = weights[1]        # array of size 4
+
+        print "Determining Answer..."
+        # Normalize to determine answer
+        # determineAnswer returns [[flag, answer_index], confidence, de_time]
+        correctness = determineAnswer(results, choices, correct)
+        answer  = correctness[0]
+        conf    = correctness[1]
+        de_time = correctness[2]
+
+        # Write this query's report (inputs, answer, timings) via output()
+        output(text, choices, correct, nltk_data, results, answer, conf,
+            nltk_time, ai_time, de_time, currentCount, outputSet)
+        print "*****"
 
-        # Determine correct results
+        # Print results live
         bld = text + ": "
-        if getHighestResult(result, choices) == correct:
+        if answer[0] == 1:
             numberCorrect += 1
             bld += "Correct"
         else:
             bld += "Incorrect"
         print bld
 
-        # Output results        
-
     print str(numberCorrect) + "/" + str(len(questions))
-
-
-def getHighestResult(result, choices):
-    highestConfidence = 0
-    highestChoice = ""
-    for r in result:
-        if result[r] > highestConfidence:
-            highestChoice = r
-            highestConfidence = result[r]
-
-    index = 0
-    for c in choices:
-        if c == highestChoice:
-            break
-        index += 1
-    return index
diff --git a/trivialpursuitfunctions.py b/trivialpursuitfunctions.py
@@ -108,7 +108,7 @@ def NLTK_parse(queryphrase="", answers=[], urls=[], scoringFunction = getSimpleA
     # Get urls
     url_start_time = time.time()
     if urls == []:
-        urls = getGoogleLinks(queryphrase, 3) # may want to change this number
+        urls = getGoogleLinks(queryphrase, 1) # may want to change this number
     url_stop_time = time.time()        
 
     # Get question keywords
@@ -147,10 +147,10 @@ def NLTK_parse(queryphrase="", answers=[], urls=[], scoringFunction = getSimpleA
     keyword_a = (keyword_a_stop_time - keyword_a_start_time)
     token_q =   (tokens_q_stop_time - tokens_q_start_time)
     token_url = (tokens_url_stop_time - tokens_url_start_time)
-    instances = (instances_stop_time - instances_start_time)
+    instances_t = (instances_stop_time - instances_start_time)
 
     NLTK = [keywords, weightedquestionkeywords, weightedanswerkeywords, querytokens, combinedtokens, instances]
-    TIME = [url, keyword_q, keyword_a, token_q, token_url, instances]
+    TIME = [url, keyword_q, keyword_a, token_q, token_url, instances_t]
 
     return [NLTK, TIME]
 

diff --git a/weights.py b/weights.py
@@ -18,7 +18,6 @@ def calculateAnswerKeywordWeight(answertoken, weightedanswerkeywords):
 def calculateDistanceWeight(distance):
     # Weight is 50 at distance 0 and falls off as 50/(distance + 1).
     # NOTE(review): the rest of this commit is Python 2 (print statements,
     # iteritems); if `distance` is an int the `/` truncates, so any distance
     # of 50 or more would weigh 0 -- confirm that is intended.
     return (50 / (distance + 1)) # optimize
 
-
 def calculateInstanceScore(answertoken, keywords, distances, weightedquestionkeywords, weightedanswerkeywords, full):
     newscore = 0
     for keyword in keywords:
@@ -28,4 +27,7 @@ def calculateInstanceScore(answertoken, keywords, distances, weightedquestionkey
                     newscore += calculateQuestionKeywordWeight(keyword, weightedquestionkeywords) * 10 * calculateDistanceWeight(distance) # optimize
                 else:
                     newscore += calculateQuestionKeywordWeight(keyword, weightedquestionkeywords) * calculateAnswerKeywordWeight(answertoken, weightedanswerkeywords) * calculateDistanceWeight(distance) # optimize
-    return newscore
+    return newscore
+
+def getRangeValue():
+    # Single source for the range value (50) that
+    # getWeightedQuestionKeywordScores used to hard-code as `rangevalue`.
+    return 50