Permalink
Browse files

add graph

  • Loading branch information...
1 parent d7ff20d commit 1f3656faef895fa6f9f64cadb576c1baa3bd44f5 drill committed Mar 17, 2013
Showing with 55,147 additions and 57,543 deletions.
  1. +0 −5 .gitignore
  2. +0 −2 README.md
  3. +4 −0 advxxx.txt
  4. +259 −0 baseline.py
  5. +61 −0 cal_ngd.py
  6. +83 −0 cal_strength.py
  7. +115 −11 check.py
  8. +1 −2 evaluate.py
  9. +26 −0 fitMERGE.py
  10. +95 −0 graph.py
  11. +0 −21 neg.txt
  12. +1 −2 oov.txt
  13. BIN paper.gpickle
  14. +10,823 −11,748 phrase.txt
  15. +13,450 −14,129 phrase2.txt
  16. +1,980 −1,980 phraseINline.txt
  17. +1,978 −1,978 phraseINline2.txt
  18. +0 −10 pos.txt
  19. +10,780 −11,702 pos_phrase.txt
  20. +7,729 −8,038 pos_tagged.txt
  21. +4,336 −4,446 preprocess-neg.txt
  22. +3,320 −3,414 preprocess-pos.txt
  23. +29 −27 preprocess.py
  24. +2 −2 tagger.sh
  25. +75 −26 themain.py
View
@@ -1,13 +1,8 @@
*.pyc
*.doc
-*.txt~
-*.sh~
*~
*.txt
senti*
neg_*
pos_*
-maybeADV.txt
-1.txt
-2.txt
preprocess-*
View
@@ -1,3 +1 @@
-testing ................
-
sentiNN.txt: include words which act as NN and each strength is zero.
View
@@ -71,3 +71,7 @@
真 1.1
一般 0.5
没法 -1
+由衷 1.2
+不予 -1
+一再 1.1
+不许 -1
View
@@ -0,0 +1,259 @@
+# -*- coding:utf-8 -*-
+from preprocess import *
+from evaluate import *
+from check import *
+import time
+
def loadSENTI(path):
    """Load 'word strength' pairs from *path* into the module-level sentiDICT.

    Each useful line has exactly two whitespace-separated columns: a word
    and a float strength.  Blank lines and lines with the wrong column
    count are skipped; a non-numeric strength is reported and skipped.
    """
    global sentiDICT
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(path) as fo:
        for line in fo:
            line = line.strip()
            if not line:
                continue
            li = line.split()
            if len(li) != 2:
                continue
            try:
                sentiDICT[li[0]] = float(li[1])
            except ValueError:
                # narrowed from a bare 'except': only a bad number is expected
                print("type error, not number %s" % line)
    print("Length of sentiment lexicon in %s is %s " % (fo.name, len(sentiDICT)))
+
def loadLEXICON(path):
    """Read a phrase lexicon where each line is '<phrase words...> <strength>'.

    The last whitespace-separated token is the float strength; everything
    before it, joined with single spaces, is the phrase key.  Returns the
    resulting {phrase: strength} dict.  Malformed strengths are reported
    and skipped.
    """
    lexicon = {}
    with open(path) as fo:
        for line in fo:
            line = line.strip()
            if not line:
                continue
            li = line.split()
            try:
                lexicon[' '.join(li[0:-1])] = float(li[-1])
            except ValueError:
                # narrowed from a bare 'except': only a bad number is expected
                print("type error, not number %s" % line)
    print("Length of sentiment lexicon in %s is %s " % (fo.name, len(lexicon)))
    return lexicon
+
+
oov = set()  # words looked up but missing from sentiDICT, collected for reporting


def _baseSTRENGTH(word):
    """Look up *word* in the module-level sentiDICT; an unknown word is
    recorded in 'oov' and contributes a neutral strength of 0."""
    strength = sentiDICT.get(word)
    if strength is None:
        oov.add(word)
        strength = 0
    return strength


def calPHRASEstrength(nonLINEAR, phrase, advDICT):
    """Compute the sentiment strength of a whitespace-tokenised *phrase*.

    Resolution order:
      * 1 token  -> plain sentiDICT lookup of that word.
      * any multi-token phrase found verbatim in *nonLINEAR* -> that value.
      * 2 tokens -> head-word strength, then the modifier: 'shift' moves
        the value 4 toward the opposite sign, '不太' by 5, and any other
        adverb in *advDICT* multiplies by its weight.
      * 3 tokens -> head-word strength scaled by the middle adverb; then a
        leading negation word ('shift', '', '没有') shifts by 4, otherwise
        a leading adverb multiplies.
      * longer   -> head-word strength multiplied by every preceding
        adverb (right to left); a negative result is amplified by 1.5.

    An empty *phrase* returns 0 rather than None: callers stringify the
    result, so a sentinel None would leak into the output file.  Unknown
    head words count as 0 and are added to the module-level 'oov' set.
    """
    if not phrase:
        return 0
    li = phrase.split()
    if len(li) == 1:
        strength = _baseSTRENGTH(li[0])
    elif nonLINEAR.get(' '.join(li)):
        # whole phrase is a known non-compositional ("nonlinear") entry
        strength = nonLINEAR.get(' '.join(li))
    elif len(li) == 2:
        strength = _baseSTRENGTH(li[1])
        if li[0] == 'shift' and strength:
            strength = strength - 4 if strength > 0 else strength + 4
        elif li[0] == "不太" and strength:
            strength = strength - 5 if strength > 0 else strength + 5
        elif advDICT.get(li[0]):
            strength *= advDICT.get(li[0])
    elif len(li) == 3:
        strength = _baseSTRENGTH(li[2])
        if advDICT.get(li[1]):
            strength *= advDICT.get(li[1])
        if li[0] in ['shift', '', '没有']:
            # negation-style shift: move the value 4 toward the other sign
            if strength > 0:
                strength -= 4
            elif strength < 0:
                strength += 4
        elif advDICT.get(li[0]):
            strength *= advDICT.get(li[0])
    else:
        strength = _baseSTRENGTH(li[-1])
        for i in range(len(li) - 2, -1, -1):
            if advDICT.get(li[i]):
                strength *= advDICT.get(li[i])
        if strength < 0:
            # amplify negatives in long phrases (comment out for droppoint)
            strength = strength * 1.5
    return strength
+
+
def readFILEasDICT(path):
    """Build a {word: float strength} dict from a two-column whitespace file.

    Blank lines and lines without exactly two columns are skipped; a
    non-numeric second column is reported and skipped.
    """
    result = {}  # renamed: the original shadowed the builtin 'dict'
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(path) as fo:
        for line in fo:
            line = line.strip()
            if not line:
                continue
            li = line.split()
            if len(li) != 2:
                continue
            try:
                result[li[0]] = float(li[1])
            except ValueError:
                print("type error, not number")
    print("the length of dictionary built from file is %s" % len(result))
    return result
+
+
def calALL(nonLINEAR, advDICTfilePATH, inputPATH, outputPATH):
    """Score every phrase group in *inputPATH* and write one '|'-joined
    line of strengths per group to *outputPATH*.

    Groups are separated by a literal '----------' line; a literal 'SUM'
    line becomes the marker 's'; every other line (including blanks) is
    scored with calPHRASEstrength.  Adverb weights are loaded once from
    *advDICTfilePATH*.  NOTE: a trailing group not terminated by
    '----------' is dropped, matching the original behaviour.
    """
    advDICT = readFILEasDICT(advDICTfilePATH)
    group = []  # renamed: the original shadowed the builtin 'list'
    # 'with' closes both handles; the original leaked the input handle.
    with open(inputPATH) as fo, open(outputPATH, 'w') as fw:
        for line in fo:
            line = line.strip()
            if line == '----------':
                fw.write("|".join(group) + "\n")
                group = []
            elif line == 'SUM':
                group.append('s')
            else:
                group.append(str(calPHRASEstrength(nonLINEAR, line, advDICT)))
+
def statistics(phraseNUMBERseqs):
    """Tally sentiment orientations over *phraseNUMBERseqs* (one review
    per line) by mixing two scoring methods.

    findSENTIdroppoint and commonSENTI each score a line.  The common
    method wins when the two agree in sign or when the drop-point score
    is 0; the drop-point score wins when the common score is 0; on a sign
    conflict the positive score wins.  Orientations are counted via
    calORIENTATION and accuracy is reported against the fixed 2000-review
    test set (hence the hard-coded 2000.0).

    Returns the list of misclassified line numbers — currently always
    empty because the collection code is disabled.
    """
    errorLIST = []
    counts = {1: 0, 0: 0, -1: 0}  # renamed: original shadowed builtin 'dict'
    with open(phraseNUMBERseqs) as myFILE:
        for line in myFILE:
            line = line.strip()
            strength1 = findSENTIdroppoint(line)
            strength2 = commonSENTI(line)
            if strength1 * strength2 > 0:
                strength = strength2
            elif strength1 == 0:
                strength = strength2
            elif strength2 == 0:
                strength = strength1
            elif strength1 > 0 and strength2 < 0:
                strength = strength1
            elif strength1 < 0 and strength2 > 0:
                strength = strength2
            else:
                # defensive fallback; unreachable given the branches above
                strength = (strength1 + strength2) / 2
            counts[calORIENTATION(strength)] += 1
    print(counts)
    print("my union method:")
    print("the correct percentage is %s" % (counts[-1] / 2000.0))
    return errorLIST
+
+
def statistics2(phraseNUMBERseqs):
    """Count-based classification over *phraseNUMBERseqs*.

    Each line holds '|'-separated phrase strengths; the line's score is
    (+1 per positive, -1 per negative) and its orientation, from
    calORIENTATION, is tallied.  Accuracy is reported against the fixed
    2000-review test set.

    BUG FIX: the original 'except: pass' left 'value' holding the
    previous element's number, so every non-numeric element (the 's' SUM
    marker) was counted again with a stale value — and raised NameError
    when it appeared first.  Non-numeric elements are now skipped.
    """
    counts = {1: 0, 0: 0, -1: 0}  # renamed: original shadowed builtin 'dict'
    with open(phraseNUMBERseqs) as myFILE:
        for line in myFILE:
            line = line.strip()
            total = 0  # renamed: original shadowed builtin 'sum'
            for ele in line.split('|'):
                try:
                    value = float(ele)
                except ValueError:
                    continue  # skip the 's' marker / malformed entries
                if value > 0:
                    total += 1
                elif value < 0:
                    total -= 1
            counts[calORIENTATION(total)] += 1
    print(''' *****count stat:''')
    print(counts)
    print("the correct percentage is %s" % (counts[-1] / 2000.0))
+
+
if __name__ == '__main__':
    # Driver: score one review data set end to end and report accuracy
    # with both the union method and the count method.
    print("starts %s" % time.asctime())
    print('''
**notice : the preprocess 163 line , if segmenter is changed!
''')
    # --- active configuration: negative-review files ---
    taggedFILE = './neg_tagged.txt'
    phraseFILE = './neg_phrase.txt'
    parsedFILE = './neg_parsed_format.txt'
    finalPHRASE = './phrase2.txt'
    phraseNUMBERseqs = './phraseINline2.txt'

    # one-off preprocessing steps; re-enable when the raw inputs change
    #preprocess("preprocess-neg.txt")
    #segANDpos("preprocess-neg.txt")
    #reformPARSED('neg_parsed.txt',parsedFILE)

    # --- alternative configuration: positive-review files ---
##    taggedFILE='./pos_tagged.txt'
##    phraseFILE='./pos_phrase.txt'
##    parsedFILE='./pos_parsed_format.txt'
##    finalPHRASE='./phrase.txt'
##    phraseNUMBERseqs='./phraseINline.txt'
##    preprocess("preprocess-pos.txt")
##    segANDpos("preprocess-pos.txt")
    #reformPARSED('pos_parsed.txt',parsedFILE)

    # --- alternative configuration: notebook-review files ---
##    taggedFILE='./neg_tagged.txt'
##    phraseFILE='./neg_phrasenb.txt'
##    parsedFILE='./neg_parsed_formatnb.txt'
##    finalPHRASE='./phrase2nb.txt'
##    phraseNUMBERseqs='./phraseINline2nb.txt'
##    preprocess("preprocess-neg.txt")
##    segANDpos("preprocess-neg.txt")
##    reformPARSED('neg_parsednb.txt',parsedFILE)
##    taggedFILE='./pos_tagged.txt'
##    phraseFILE='./pos_phrasenb.txt'
##    parsedFILE='./pos_parsed_formatnb.txt'
##    finalPHRASE='./phrasenb.txt'
##    phraseNUMBERseqs='./phraseINlinenb.txt'
##    preprocess("preprocess-pos.txt")
##    segANDpos("preprocess-pos.txt")
##    reformPARSED('pos_parsednb.txt',parsedFILE)

    sentiDICT = {}  # module-level lexicon read by loadSENTI / calPHRASEstrength
    #loadSENTI('./sentiment2.txt')
    loadSENTI('./mySTRENGTH.txt')  ### sync two files
    findPHRASE(taggedFILE, parsedFILE, phraseFILE)
    filterPHRASE(phraseFILE, finalPHRASE)
    nonLINEAR = loadLEXICON('./nonlinear.txt')
    calALL(nonLINEAR, 'advxxx.txt', finalPHRASE, phraseNUMBERseqs)

    # union-method evaluation (errorLIST collection is currently disabled)
    errorLIST = statistics(phraseNUMBERseqs)
    # count-method evaluation
    statistics2(phraseNUMBERseqs)
    #writeERROR('preprocess-neg.txt',errorLIST)
    recordOOV(oov)
    print('finished %s' % time.asctime())
+
+
+
+
+
+
+
View
@@ -0,0 +1,61 @@
import math
import time

# Assumed total number of indexed pages for the NGD formula
# (the measured value was 1204070806; rounded up by the original author).
N = 10000000000


def _loadCOUNTS(path):
    """Read 'word count' pairs from *path*; return {word: count_str}."""
    counts = {}  # renamed: the original shadowed the builtin 'dict'
    values = []
    with open(path) as fo:
        for line in fo:
            parts = line.strip().split()
            counts[parts[0]] = parts[1]
            values.append(int(parts[1]))
    print("the max indexed: %s" % max(values))
    print("there are %s query items " % len(counts))
    return counts


def main():
    """Compute the Normalised Google Distance for every word pair in
    ./merge.txt using the single-word counts in ./single.txt, writing
    'w1 w2---ngd' lines to ./ngd.txt.  Words missing a count are
    collected in 'lost' and printed at the end.

    BUG FIX: the original bare 'except' blocks left Fx/Fy/Fxy holding
    the PREVIOUS pair's value when a word or co-occurrence count was
    missing, silently emitting a wrong distance (or raising NameError on
    the first miss).  Such pairs are now skipped entirely.
    """
    print("starts %s" % time.asctime())
    counts = _loadCOUNTS("./single.txt")
    cnt = 0
    total = 0
    lost = set()
    with open('./merge.txt') as fo3, open("./ngd.txt", 'w') as fw:
        for line in fo3:
            line = line.strip()
            parts = line.split()
            total += 1
            a, b = parts[0], parts[1]
            skip = False
            if a not in counts:
                lost.add(a)
                skip = True
            if b not in counts:
                lost.add(b)
                skip = True
            try:
                Fxy = int(parts[2])
            except (IndexError, ValueError):
                print("miss fxy:: %s" % line)
                skip = True
            if skip:
                continue
            Fx = int(counts[a])
            Fy = int(counts[b])
            logx = math.log(Fx)
            logy = math.log(Fy)
            logxy = math.log(Fxy)
            ngd = (max(logx, logy) - logxy) / float(math.log(N) - min(logx, logy))
            if ngd > 1:
                cnt += 1
                ngd = 1  # clamp: NGD is nominally in [0, 1]
            fw.write(' '.join(parts[0:2]) + '---' + str(ngd) + "\n")
    print("times that ngd bigger than one: %s" % cnt)
    print("the radio is : %s" % (float(cnt) / total))
    for word in lost:
        print(word)
    print("end.: %s" % time.asctime())


if __name__ == '__main__':
    main()
Oops, something went wrong.

0 comments on commit 1f3656f

Please sign in to comment.