Commit: add graph
drill committed Mar 17, 2013
1 parent d7ff20d commit 1f3656f
Showing 25 changed files with 55,147 additions and 57,543 deletions.
5 changes: 0 additions & 5 deletions .gitignore
@@ -1,13 +1,8 @@
*.pyc
*.doc
*.txt~
*.sh~
*~
*.txt
senti*
neg_*
pos_*
maybeADV.txt
1.txt
2.txt
preprocess-*
2 changes: 0 additions & 2 deletions README.md
@@ -1,3 +1 @@
testing ................

sentiNN.txt: contains words that act as NN (nouns); each has a strength of zero.
4 changes: 4 additions & 0 deletions advxxx.txt
Expand Up @@ -71,3 +71,7 @@
真 1.1
一般 0.5
没法 -1
由衷 1.2
不予 -1
一再 1.1
不许 -1
259 changes: 259 additions & 0 deletions baseline.py
@@ -0,0 +1,259 @@
# -*- coding:utf-8 -*-
from preprocess import *
from evaluate import *
from check import *
import time
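## Baseline sentiment pipeline: load a word-level sentiment lexicon, score each
## extracted phrase, and aggregate the phrase scores into a per-review polarity
## (see statistics/statistics2 below).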

def loadSENTI(path):
    global sentiDICT
    with open(path) as fo:
        for line in fo:
            line = line.strip()
            if line:
                li = line.split()
                if len(li) == 2:
                    try:
                        sentiDICT[li[0]] = float(li[1])
                    except ValueError:
                        print "type error, not a number:", line
        print "Length of sentiment lexicon in %s is %s" % (fo.name, len(sentiDICT))

def loadLEXICON(path):
    lexicon = {}
    with open(path) as fo:
        for line in fo:
            line = line.strip()
            if line:
                li = line.split()
                try:
                    lexicon[' '.join(li[0:-1])] = float(li[-1])
                except ValueError:
                    print "type error, not a number:", line
        print "Length of sentiment lexicon in %s is %s" % (fo.name, len(lexicon))
    return lexicon


oov= set()
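## Phrase scoring, as implemented below:
##   - 1 word: direct lexicon lookup (unknown words are recorded in `oov` with strength 0)
##   - whole phrase found in the non-linear lexicon: use that value directly
##   - 2 words: modifier + sentiment word; 'shift' moves the score 4 points toward the
##     opposite polarity, "不太" ("not very") moves it 5, other adverbs multiply the
##     score by their weight from advDICT
##   - 3 words: adverb weight on the middle word, then shift(4) for 'shift'/'没'/'没有'
##     (negators), otherwise another adverb weight on the first word
##   - longer: the last word's score multiplied by every preceding adverb weight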
def calPHRASEstrength(nonLINEAR, phrase, advDICT):
    ### OOV words are collected in the global `oov` set
    if not phrase:
        return 0  # TODO: should arguably return None so callers can tell "empty" from "neutral"
    li = phrase.split()
    if len(li) == 1:
        strength = sentiDICT.get(li[0])
        if strength is None:
            oov.add(li[0]); strength = 0
    elif nonLINEAR.get(' '.join(li)):
        strength = nonLINEAR.get(' '.join(li))
    elif len(li) == 2:
        strength = sentiDICT.get(li[1])
        if strength is None:
            oov.add(li[1])
            strength = 0
        if li[0] == 'shift' and strength:
            strength = strength - 4 if strength > 0 else strength + 4
        elif li[0] == "不太" and strength:
            strength = strength - 5 if strength > 0 else strength + 5
        elif advDICT.get(li[0]):
            strength *= advDICT.get(li[0])

    elif len(li) == 3:
        strength = sentiDICT.get(li[2])
        if strength is None:
            oov.add(li[2])
            strength = 0
        if advDICT.get(li[1]):
            strength *= advDICT.get(li[1])
        ## apply SHIFT(4) for negators
        if li[0] in ['shift', '没', '没有']:
            if strength > 0:
                strength -= 4
            elif strength < 0:
                strength += 4
        else:
            if advDICT.get(li[0]):
                strength *= advDICT.get(li[0])
    else:
        length = len(li)
        strength = sentiDICT.get(li[length - 1])
        if strength is None:
            oov.add(li[length - 1])
            strength = 0
        for i in range(length - 2, -1, -1):
            if advDICT.get(li[i]):
                strength *= advDICT.get(li[i])
if strength < 0:
strength = strength*1.5

    ## negative scores are amplified by 1.5; if using the droppoint method, comment out the two lines above
return strength


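## readFILEasDICT: parse a two-column "token weight" file into a dict
## (used below to load the adverb-strength lexicon, e.g. advxxx.txt).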
def readFILEasDICT(path):
    d = {}
    with open(path) as fo:
        for line in fo:
            line = line.strip()
            if line:
                li = line.split()
                if len(li) == 2:
                    try:
                        d[li[0]] = float(li[1])
                    except ValueError:
                        print "type error, not a number:", line
    print "the length of the dictionary built from the file is %s" % (len(d))
    return d


def calALL(nonLINEAR, advDICTfilePATH, inputPATH, outputPATH):
    advDICT = readFILEasDICT(advDICTfilePATH)
    fo = open(inputPATH)
    fw = open(outputPATH, 'w')
    strengths = []
    for line in fo:
        line = line.strip()
        if line != '----------':
            if line == 'SUM':
                strengths.append('s')
            else:
                strengths.append(str(calPHRASEstrength(nonLINEAR, line, advDICT)))
        else:
            # '----------' marks the end of one review: flush its phrase scores
            fw.write("|".join(strengths) + "\n")
            strengths = []
    fo.close()
    fw.close()

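## statistics: combine two per-review scores, findSENTIdroppoint and commonSENTI.
## If they agree in sign, commonSENTI wins; if one is zero, the other is used; if they
## disagree in sign, the positive one is taken. The /2000.0 below appears to assume a
## test set of 2000 reviews.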
def statistics(phraseNUMBERseqs):
    errorLIST = []
    counts = {1: 0, 0: 0, -1: 0}
    with open(phraseNUMBERseqs) as myFILE:
        for num, line in enumerate(myFILE, 1):
            line = line.strip()
            # mix two methods: droppoint and common sentiment
            strength1 = findSENTIdroppoint(line)
            strength2 = commonSENTI(line)
            if strength1 * strength2 > 0:
                strength = strength2
            elif strength1 == 0:
                strength = strength2
            elif strength2 == 0:
                strength = strength1
            else:
                # opposite signs: take the positive score
                if strength1 > 0 and strength2 < 0:
                    strength = strength1
                elif strength1 < 0 and strength2 > 0:
                    strength = strength2
                else:
                    strength = (strength1 + strength2) / 2
            ## if strength > 0:
            ##     errorLIST.append(num)
            counts[calORIENTATION(strength)] += 1
    print counts
    print "my union method:"
    print "the correct percentage is %s" % (counts[-1] / 2000.0)
    return errorLIST


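## statistics2: simpler count-based aggregation; each review is classified by the sign
## of (#positive phrases - #negative phrases).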
def statistics2(phraseNUMBERseqs):
    counts = {1: 0, 0: 0, -1: 0}
    with open(phraseNUMBERseqs) as myFILE:
        for num, line in enumerate(myFILE, 1):
            line = line.strip()
            eles = line.split('|')
            total = 0
            for i in eles:
                try:
                    value = float(i)
                except ValueError:
                    continue  # skip non-numeric markers such as 's' (SUM)
                if value > 0:
                    total += 1
                elif value < 0:
                    total -= 1
            counts[calORIENTATION(total)] += 1
    print "***** count stat:"
    print counts
    print "the correct percentage is %s" % (counts[-1] / 2000.0)


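## Driver (negative corpus by default). Rough pipeline: preprocess -> segment/POS-tag ->
## reformat the parse -> findPHRASE -> filterPHRASE -> calALL (score phrases) ->
## statistics / statistics2 (aggregate); several earlier steps are commented out,
## presumably because their intermediate files already exist.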
if __name__ == '__main__':
    print "starts", time.asctime()
    print '''
    ** notice: check line 163 of preprocess.py if the segmenter is changed!
    '''
taggedFILE='./neg_tagged.txt'
phraseFILE='./neg_phrase.txt'
parsedFILE='./neg_parsed_format.txt'
finalPHRASE='./phrase2.txt'
phraseNUMBERseqs='./phraseINline2.txt'

#preprocess("preprocess-neg.txt")
#segANDpos("preprocess-neg.txt")
#reformPARSED('neg_parsed.txt',parsedFILE)

## taggedFILE='./pos_tagged.txt'
## phraseFILE='./pos_phrase.txt'
## parsedFILE='./pos_parsed_format.txt'
## finalPHRASE='./phrase.txt'
## phraseNUMBERseqs='./phraseINline.txt'

## preprocess("preprocess-pos.txt")
## segANDpos("preprocess-pos.txt")
#reformPARSED('pos_parsed.txt',parsedFILE)


### notebook block
## taggedFILE='./neg_tagged.txt'
## phraseFILE='./neg_phrasenb.txt'
## parsedFILE='./neg_parsed_formatnb.txt'
## finalPHRASE='./phrase2nb.txt'
## phraseNUMBERseqs='./phraseINline2nb.txt'

## preprocess("preprocess-neg.txt")
## segANDpos("preprocess-neg.txt")
## reformPARSED('neg_parsednb.txt',parsedFILE)

## taggedFILE='./pos_tagged.txt'
## phraseFILE='./pos_phrasenb.txt'
## parsedFILE='./pos_parsed_formatnb.txt'
## finalPHRASE='./phrasenb.txt'
## phraseNUMBERseqs='./phraseINlinenb.txt'

## preprocess("preprocess-pos.txt")
## segANDpos("preprocess-pos.txt")
## reformPARSED('pos_parsednb.txt',parsedFILE)



sentiDICT = {}
#loadSENTI('./sentiment2.txt')
    loadSENTI('./mySTRENGTH.txt')  ### keep this lexicon in sync with sentiment2.txt above
findPHRASE(taggedFILE,parsedFILE,phraseFILE)
filterPHRASE(phraseFILE,finalPHRASE)
nonLINEAR = loadLEXICON('./nonlinear.txt')
calALL(nonLINEAR,'advxxx.txt',finalPHRASE,phraseNUMBERseqs)


##### apply count method
errorLIST = statistics(phraseNUMBERseqs)


    ### count-based statistics: number of pos/neg phrases per review
statistics2(phraseNUMBERseqs)
#writeERROR('preprocess-neg.txt',errorLIST)
recordOOV(oov)
print 'finished',time.asctime()







61 changes: 61 additions & 0 deletions cal_ngd.py
@@ -0,0 +1,61 @@
import math,time
print "starts",time.asctime()

fo2=open("./single.txt")
fo3=open('./merge.txt')
fw = open("./ngd.txt",'w')

freq = {}
counts = []
for line in fo2:
    line = line.strip()
    parts = line.split()
    freq[parts[0]] = parts[1]
    counts.append(int(parts[1]))
counts.sort()
print "the max single-word count:", counts[-1]
print "there are %s query items" % len(freq)


N=10000000000
## 1204070806
cnt=0
total=0
lost = set()
for line in fo3:
    line = line.strip()
    parts = line.split()
    total += 1
    a, b = parts[0], parts[1]
    try:
        Fx = int(freq.get(a))
    except (TypeError, ValueError):
        #print "miss a", a
        lost.add(a)
        continue
    try:
        Fy = int(freq.get(b))
    except (TypeError, ValueError):
        #print "miss b", b
        lost.add(b)
        continue
    try:
        Fxy = int(parts[2])
    except (IndexError, ValueError):
        print "miss fxy::", line
        continue
    logx = math.log(Fx); logy = math.log(Fy); logxy = math.log(Fxy)
    ngd = (max(logx, logy) - logxy) / float(math.log(N) - min(logx, logy))
    if ngd > 1:
        cnt += 1
        ngd = 1  ## cap NGD at 1
    fw.write(' '.join(parts[0:2]) + '---' + str(ngd) + "\n")

print "times that ngd bigger than one:",cnt
print "the radio is :", float(cnt)/total
fw.close()

for i in lost:
print i

print "end.:",time.asctime()


