add graph

commit 1f3656faef895fa6f9f64cadb576c1baa3bd44f5 1 parent d7ff20d
drill authored
5 .gitignore
@@ -1,13 +1,8 @@
*.pyc
*.doc
-*.txt~
-*.sh~
*~
*.txt
senti*
neg_*
pos_*
-maybeADV.txt
-1.txt
-2.txt
preprocess-*
2  README.md
@@ -1,3 +1 @@
-testing ................
-
sentiNN.txt: includes words which act as NN; each strength is zero.
4 advxxx.txt
@@ -71,3 +71,7 @@
真 1.1
一般 0.5
没法 -1
+由衷 1.2
+不予 -1
+一再 1.1
+不许 -1
259 baseline.py
@@ -0,0 +1,259 @@
+# -*- coding:utf-8 -*-
+from preprocess import *
+from evaluate import *
+from check import *
+import time
+
+def loadSENTI(path):
+ fo = open(path)
+ global sentiDICT
+ for line in fo:
+ line =line.strip()
+ if line:
+ li= line.split()
+ if len(li)==2:
+ try:
+ sentiDICT[li[0]]= float(li[1])
+ except:
+ print "type error, not number",line
+ print "Length of sentiment lexion in %s is %s " %(fo.name,len(sentiDICT))
+
+def loadLEXICON(path):
+ with open(path) as fo:
+ lexicon = {}
+ for line in fo:
+ line =line.strip()
+ if line:
+ li= line.split()
+ try:
+ lexicon[' '.join(li[0:-1])]= float(li[-1])
+ except:
+ print "type error, not number",line
+ print "Length of sentiment lexion in %s is %s " %(fo.name,len(lexicon))
+ return lexicon
+
+
+oov= set()
+def calPHRASEstrength(nonLINEAR,phrase,advDICT):
+ ### should return None if OOV
+ if not phrase:
+ return 0 ################# FIXME: should this return None?
+ li = phrase.split()
+ if len(li) ==1:
+ strength= sentiDICT.get(li[0])
+ if strength is None:
+ oov.add(li[0]);strength = 0
+ elif nonLINEAR.get(' '.join(li)):
+ strength = nonLINEAR.get(' '.join(li))
+ elif len(li)==2:
+ strength = sentiDICT.get(li[1])
+ if strength is None:
+ oov.add(li[1])
+ strength = 0
+ if li[0] == 'shift' and strength:
+ strength = strength - 4 if strength>0 else strength + 4
+ elif li[0]=="不太" and strength:
+ strength = strength - 5 if strength>0 else strength + 5
+ elif advDICT.get(li[0]):
+ strength*= advDICT.get(li[0])
+
+
+ elif len(li)==3:
+ strength= sentiDICT.get(li[2])
+ if strength is None:
+ oov.add(li[2])
+ strength = 0
+ if advDICT.get(li[1]):
+ strength*=advDICT.get(li[1])
+ ## DO SHIFT(4)
+ if li[0] in ['shift','','没有']:
+ if strength>0:
+ strength-=4
+ elif strength<0:
+ strength+=4
+ else:
+ if advDICT.get(li[0]):
+ strength*= advDICT.get(li[0])
+
+ else:
+ length = len(li)
+ strength= sentiDICT.get(li[length-1])
+ if strength is None:
+ oov.add(li[length-1])
+ strength = 0
+ for i in range(length-2,-1,-1):
+ if advDICT.get(li[i]):
+ strength*=advDICT.get(li[i])
+ if strength < 0:
+ strength = strength*1.5
+
+## if using droppoint, comment out the two lines above
+ return strength
+
+
+def readFILEasDICT(path):
+ dict={}
+ fo = open(path)
+ for line in fo:
+ line=line.strip()
+ if line:
+ li=line.split()
+ if len(li)==2:
+ try:
+ dict[li[0]]=float(li[1])
+ except:
+ print "type error, not number"
+ print "the length of dictionary builded from file is %s" %(len(dict))
+ return dict
+
+
+def calALL(nonLINEAR,advDICTfilePATH,inputPATH,outputPATH):
+ fo = open(inputPATH)
+ fw = open(outputPATH,'w')
+ advDICT = readFILEasDICT(advDICTfilePATH)
+ list=[]
+ for line in fo:
+ line=line.strip()
+ if line!='----------':
+ if line =='SUM':
+ list.append('s')
+ else:
+ list.append(str(calPHRASEstrength(nonLINEAR,line,advDICT)))
+ else:
+ fw.write("|".join(list)+"\n")
+ list=[]
+ fw.close()
+
+def statistics(phraseNUMBERseqs):
+ errorLIST = []
+ dict ={1:0,0:0,-1:0}
+ with open(phraseNUMBERseqs) as myFILE:
+ for num, line in enumerate(myFILE, 1):
+ line=line.strip()
+ strength1 = findSENTIdroppoint(line)
+ #strength = commonSENTI(line)
+ ''' mix two methods '''
+ strength2 = commonSENTI(line)
+ if strength1 * strength2 > 0:
+ strength = strength2
+ elif strength1 == 0:
+ strength = strength2
+ elif strength2 == 0:
+ strength = strength1
+ else:
+
+ if strength1 >0 and strength2 < 0: #######
+ strength = strength1
+ elif strength1 < 0 and strength2 > 0:
+ strength = strength2
+ else:
+ strength = (strength1+strength2)/2
+ #####strength = strength2
+## if strength > 0:
+## errorLIST.append(num)
+ dict[calORIENTATION(strength)]+=1
+ print dict
+ print "my union method:"
+ print "the correct percentage is %s" %(dict[-1]/2000.0)
+ return errorLIST
+
+
+def statistics2(phraseNUMBERseqs):
+ dict ={1:0,0:0,-1:0}
+ with open(phraseNUMBERseqs) as myFILE:
+ for num, line in enumerate(myFILE, 1):
+ line=line.strip()
+ eles = line.split('|')
+ sum = 0
+ for i in eles:
+ try:
+ value = float(i)
+ except:
+ pass
+ if value > 0:
+ sum+=1
+ elif value < 0:
+ sum-=1
+
+
+
+ dict[calORIENTATION(sum)]+=1
+ print ''' *****count stat:'''
+ print dict
+ print "the correct percentage is %s" %(dict[-1]/2000.0)
+
+
+if __name__ == '__main__':
+ print "starts",time.asctime()
+ print '''
+**notice: revisit preprocess line 163 if the segmenter is changed!
+'''
+ taggedFILE='./neg_tagged.txt'
+ phraseFILE='./neg_phrase.txt'
+ parsedFILE='./neg_parsed_format.txt'
+ finalPHRASE='./phrase2.txt'
+ phraseNUMBERseqs='./phraseINline2.txt'
+
+ #preprocess("preprocess-neg.txt")
+ #segANDpos("preprocess-neg.txt")
+ #reformPARSED('neg_parsed.txt',parsedFILE)
+
+## taggedFILE='./pos_tagged.txt'
+## phraseFILE='./pos_phrase.txt'
+## parsedFILE='./pos_parsed_format.txt'
+## finalPHRASE='./phrase.txt'
+## phraseNUMBERseqs='./phraseINline.txt'
+
+## preprocess("preprocess-pos.txt")
+## segANDpos("preprocess-pos.txt")
+ #reformPARSED('pos_parsed.txt',parsedFILE)
+
+
+### notebook block
+## taggedFILE='./neg_tagged.txt'
+## phraseFILE='./neg_phrasenb.txt'
+## parsedFILE='./neg_parsed_formatnb.txt'
+## finalPHRASE='./phrase2nb.txt'
+## phraseNUMBERseqs='./phraseINline2nb.txt'
+
+## preprocess("preprocess-neg.txt")
+## segANDpos("preprocess-neg.txt")
+## reformPARSED('neg_parsednb.txt',parsedFILE)
+
+## taggedFILE='./pos_tagged.txt'
+## phraseFILE='./pos_phrasenb.txt'
+## parsedFILE='./pos_parsed_formatnb.txt'
+## finalPHRASE='./phrasenb.txt'
+## phraseNUMBERseqs='./phraseINlinenb.txt'
+
+## preprocess("preprocess-pos.txt")
+## segANDpos("preprocess-pos.txt")
+## reformPARSED('pos_parsednb.txt',parsedFILE)
+
+
+
+ sentiDICT = {}
+ #loadSENTI('./sentiment2.txt')
+ loadSENTI('./mySTRENGTH.txt') ### sync two files
+ findPHRASE(taggedFILE,parsedFILE,phraseFILE)
+ filterPHRASE(phraseFILE,finalPHRASE)
+ nonLINEAR = loadLEXICON('./nonlinear.txt')
+ calALL(nonLINEAR,'advxxx.txt',finalPHRASE,phraseNUMBERseqs)
+
+
+ ##### apply count method
+ errorLIST = statistics(phraseNUMBERseqs)
+
+
+ ### count the number of pos/neg predictions
+ statistics2(phraseNUMBERseqs)
+ #writeERROR('preprocess-neg.txt',errorLIST)
+ recordOOV(oov)
+ print 'finished',time.asctime()
+
+
+
+
+
+
+
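A note on the scoring logic above: calPHRASEstrength takes the head word's lexicon strength and then applies the modifiers in front of it. A 'shift' marker moves the strength toward the opposite pole by a constant 4 (5 for 不太), a known adverb multiplies it by its weight, and in phrases of four or more words a negative result is further scaled by 1.5. A condensed sketch of that scheme, with hypothetical lexicon entries (the real ones are loaded from mySTRENGTH.txt and advxxx.txt):

    senti = {'good': 2.0, 'bad': -2.0}   # hypothetical sentiment strengths
    adv = {'very': 1.5}                  # hypothetical adverb weights

    def phrase_strength(words):
        s = senti.get(words[-1], 0)      # head word carries the base strength
        for w in reversed(words[:-1]):
            if w == 'shift' and s:       # negation: shift toward the opposite pole by 4
                s = s - 4 if s > 0 else s + 4
            elif w in adv:               # intensifier: scale by the adverb weight
                s *= adv[w]
        return s

    print(phrase_strength(['very', 'good']))    # 3.0
    print(phrase_strength(['shift', 'good']))   # -2.0

The sketch omits the OOV bookkeeping (the oov set) and the nonLINEAR lookup for multiword entries.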
61 cal_ngd.py
@@ -0,0 +1,61 @@
+import math,time
+print "starts",time.asctime()
+
+fo2=open("./single.txt")
+fo3=open('./merge.txt')
+fw = open("./ngd.txt",'w')
+
+dict = {}
+list=[]
+for line in fo2:
+ line=line.strip()
+ parts=line.split()
+ dict[parts[0]]=parts[1]
+ list.append(int(parts[1]))
+list.sort()
+print "the max indexed:",list[-1]
+print "there are %s query items " % len(dict)
+
+
+N=10000000000
+## 1204070806
+cnt=0
+total=0
+lost = set()
+for line in fo3:
+ line=line.strip()
+ parts=line.split()
+ total+=1
+ a,b= parts[0],parts[1]
+ try:
+ Fx=int(dict.get(a));
+ except:
+ #print "miss a",a
+ lost.add(a)
+ try:
+ Fy=int(dict.get(b));
+ except:
+ #print "miss b",b
+ lost.add(b)
+ try:
+ Fxy=int(parts[2])
+ except:
+ print "miss fxy::",line
+ logx=math.log(Fx);logy=math.log(Fy);logxy=math.log(Fxy)
+ ngd=(max(logx,logy)-logxy) / float(math.log(N)-min(logx,logy))
+ if ngd >1:
+ cnt+=1
+ ngd= 1 ##### clamp distances above 1
+ fw.write(' '.join(parts[0:2])+'---'+str(ngd)+"\n")
+
+print "times that ngd bigger than one:",cnt
+print "the radio is :", float(cnt)/total
+fw.close()
+
+for i in lost:
+ print i
+
+print "end.:",time.asctime()
+
+
+
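The quantity computed above is the Normalized Google Distance: NGD(x,y) = (max(log Fx, log Fy) - log Fxy) / (log N - min(log Fx, log Fy)), with N hard-coded to 10^10 as the assumed index size. A worked example with made-up hit counts:

    import math

    # Worked example of the NGD formula from cal_ngd.py, with made-up counts.
    N = 10000000000                    # assumed index size, as in the script
    Fx, Fy, Fxy = 50000, 20000, 5000   # hypothetical f(x), f(y), f(x,y)

    logx, logy, logxy = math.log(Fx), math.log(Fy), math.log(Fxy)
    ngd = (max(logx, logy) - logxy) / float(math.log(N) - min(logx, logy))
    print(ngd)   # ~0.175: small distance, i.e. strongly associated terms

Values above 1, which the script clamps, arise when a pair co-occurs far less often than its individual counts would suggest.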
83 cal_strength.py
@@ -0,0 +1,83 @@
+import networkx as nx
+import math
+
+#####
+'''note: this file mixes unicode and utf-8 strings'''
+#####
+
+def loadSENTI(path):
+ fo = open(path)
+ sentiDICT = {}
+ for line in fo:
+ line =line.strip()
+ if line:
+ li= line.split()
+ if len(li)==2:
+ try:
+ sentiDICT[li[0]]= float(li[1])
+ except:
+ print "type error, not number",line
+ print "Length of sentiment lexion in %s is %s " %(fo.name,len(sentiDICT))
+ return sentiDICT
+
+DG=nx.read_gpickle("paper.gpickle")
+nn= DG.number_of_nodes()
+print "the number of nodes:",nn
+
+cnt_sole=0
+for n in DG:
+ if not DG.predecessors(n):
+ cnt_sole+=1
+print "sole nodes:",cnt_sole
+
+#init the strength
+senti = loadSENTI('./sentiment2.txt')
+for n in DG:
+ DG.node[n]['s']=senti.get(n.encode('utf8'))
+
+#iter
+for i in xrange(100):
+ li = []
+ for n in DG:
+ if DG.in_degree(n)==0:
+ DG.node[n]['s1']=DG.node[n]['s']
+ else:
+ DG.node[n]['s1']=0
+ neighbors=DG.predecessors(n)
+ for nb in neighbors:
+ try:
+ w=DG[nb][n]['weight'] / float(DG.out_degree(nb,weight='weight'))
+ if senti.get(n.encode('utf8'))*senti.get(nb.encode('utf8'))<0:
+ w*=-1
+ DG.node[n]['s1']+= w*DG.node[nb]['s']
+ except:
+ print n.encode('utf8'),nb.encode('utf8')
+ ## no ngd
+ try:
+ DG.node[n]['s1']+=DG.node[n]['s']
+ except:
+ pass
+ li.append(DG.node[n]['s1'])
+
+ #normalize the strength: method
+ li.sort(reverse=True)
+ a = []
+ print "the mormalized factor :",li[0]
+ for n in DG:
+ #DG.node[n]['s1']=DG.node[n]['s1']/math.sqrt(sums)
+ DG.node[n]['s1']=DG.node[n]['s1']/li[0]
+ try:
+ a.append(abs(DG.node[n]['s1']-DG.node[n]['s']))
+ except:
+ pass
+ DG.node[n]['s']=DG.node[n]['s1']
+
+ a.sort(reverse=True)
+ if a[0] < 0.02:
+ print "iter %s th stop" % i
+ break
+
+fw=open("mySTRENGTH.txt","w")
+for n in DG:
+ fw.write(n.encode('utf8')+" "+str(DG.node[n]['s'])+"\n")
+fw.close()
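The loop above is essentially label propagation: each node's new strength is its seed strength plus a weighted vote from its predecessors, where a predecessor's vote is the edge weight normalized by its weighted out-degree (sign-flipped when the seed polarities disagree), and strengths are renormalized by the round's maximum until the largest change falls below 0.02. A minimal sketch of that scheme on a toy two-node graph, omitting the polarity flip (the real graph is read from paper.gpickle):

    import networkx as nx

    # Minimal sketch of the propagation loop in cal_strength.py (toy data).
    G = nx.DiGraph()
    G.add_edge('happy', 'glad', weight=0.8)   # hypothetical association weight
    seed = {'happy': 1.0, 'glad': 0.0}        # seed sentiment strengths

    s = dict(seed)
    for i in range(100):
        s1 = {}
        for n in G:
            total = seed.get(n, 0.0)          # keep the node's own seed strength
            for p in G.predecessors(n):
                w = G[p][n]['weight'] / float(G.out_degree(p, weight='weight'))
                total += w * s[p]             # vote from the predecessor
            s1[n] = total
        top = max(abs(v) for v in s1.values()) or 1.0
        s1 = dict((n, v / top) for n, v in s1.items())   # normalize by the max
        if max(abs(s1[n] - s[n]) for n in G) < 0.02:
            break                             # converged
        s = s1
    print(s1)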
126 check.py
@@ -2,6 +2,7 @@
import re
import os
from shutil import copyfile
+from preprocess import *
'''tools'''
def checkoutPHRASE(path):
@@ -149,9 +150,9 @@ def checkLABELED(path1,path2):
if len(kv)==2:
try:
test = float(kv[1])
+ dict1[kv[0]]=kv[1]
except:
- print kv[0]
- dict1[kv[0]]=kv[1]
+ print kv[0],"value not float."
print "dict1 length is %s" %len(dict1)
@@ -162,10 +163,9 @@ def checkLABELED(path1,path2):
if len(kv)==2:
try:
test = float(kv[1])
+ dict2[kv[0]]=kv[1]
except:
print kv[0]
-
- dict2[kv[0]]=kv[1]
print "dict2 length is %s" %len(dict2)
a = set(dict1.keys())
@@ -202,6 +202,72 @@ def checkLABELED(path1,path2):
fw.close()
+def adjust(path1,path2):
+ dict1={}
+ dict2={}
+ fo1 = open(path1) #'./1.txt'
+ fo2 = open(path2) #'./2.txt'
+ fw=open('./senti2.txt','w')
+ for line in fo1:
+ line=line.strip()
+ if line:
+ kv=line.split()
+ if len(kv)==2:
+ try:
+ test = float(kv[1])
+ dict1[kv[0]]=kv[1]
+ except:
+ print kv[0],"value not float."
+ print "dict1 length is %s" %len(dict1)
+
+
+ for line in fo2:
+ line=line.strip()
+ if line:
+ kv=line.split()
+ if len(kv)==2:
+ try:
+ test = float(kv[1])
+ dict2[kv[0]]=kv[1]
+ except:
+ print kv[0]
+ print "dict2 length is %s" %len(dict2)
+
+
+ for i in dict1.keys():
+
+ if dict1.get(i)==dict2.get(i):
+ continue
+ else:
+ if dict1.get(i) and dict2.get(i):
+ if abs(float(dict1.get(i))-float(dict2.get(i)))<=1:
+ avg= (float(dict1.get(i))+float(dict2.get(i)))/2.0
+ dict1[i]=str(avg)
+ else:
+ print i," ",dict1.get(i)," ",dict2.get(i)
+ r=raw_input("type a value:")
+ dict1[i]=r
+
+ for i in dict1.keys():
+ fw.write(i+" "+dict1.get(i)+"\n")
+ fw.close()
+
+import re
+hanLAB = re.compile(ur'[\u4e00-\u9fa5]+ -?[\d.]+')
+def splitLABEL(path,path1):
+ fo = open(path)
+ fw = open(path1,'w')
+ for line in fo:
+ line=line.strip()
+ li = hanLAB.findall(line.decode('utf8'))
+ if li:
+ print len(li)
+ for j in li:
+ fw.write(j.encode('utf8')+"\n")
+ fw.close()
+
+
+
def doOOV():
fo1 = open('./oov.txt')
fo2 = open('./oov1.txt')
@@ -386,8 +452,9 @@ def loadASPECTsenti(path):
dic[' '.join(li[0:2])]=li[2]
return dic
-sen = re.compile(ur'\u3002|\uff0e|\uff01|\uff1f|\?|!|\.')
+### define a regex
+sen = re.compile(ur'\u3002|\uff0e|\uff01|\uff1f|\?|!|\.')
# split more than 80 tokens
def splitLONG(i):
sentenceLI= []
@@ -404,8 +471,8 @@ def splitLONG(i):
if len(k.split())<81:
sentenceLI.append(k)
else:
- print "can't split this one:",k
- r=raw_input("input man made sentences:")
+ print "this sentence can't be split:\n",k
+ r=raw_input("input man-made sentence:")
list2 = r.split('\\n')
for l in list2:
sentenceLI.append(l)
@@ -422,7 +489,7 @@ def getSENTENCE(path):
if line=='-- -- -- -- --':
fw.write(line+'\n')
else:
- li = sen.split(line.decode('utf8'))
+ li = sen.split(line.decode('utf8')) #### split the seged file
for i in li:
i = i.strip().encode('utf8')
if i:
@@ -431,7 +498,8 @@ def getSENTENCE(path):
else:
list2 = splitLONG(i)
for s in list2:
- fw.write(s+'\n')
+ if s:
+ fw.write(s+'\n')
fw.close()
copyfile('sentences.txt',path+'.backup')
os.rename('sentences.txt',path)
@@ -472,6 +540,38 @@ def reformPARSED(path,path2):
fw.write(' '.join(li)+'\n')
li = []
fw.close()
+
+
+def checkDICT(path,path2):
+ a = file2set(path)
+ print len(a)
+
+ b = file2set(path2)
+ print len(b)
+ with open(path2) as fo:
+ for line in fo:
+ line=line.strip()
+ if line and (line not in a):
+ print line
+
+def diffTWOfile(path1,path2):
+ a = set()
+ b= set()
+ with open(path1) as org,open(path2) as dec:
+ for line in org:
+ if line:
+ if line.strip().split()[0] in a:
+ print line.strip().split()[0]
+ a.add(line.strip().split()[0])
+
+ for line in dec:
+ if line:
+ b.add(line.strip().split()[0])
+ print len(a),len(b)
+ c= a-b
+ for i in c:
+ print i
+
if __name__ == '__main__':
## processADVSS('./advss.txt')
@@ -483,6 +583,10 @@ def reformPARSED(path,path2):
#print loadASPECTsenti('./aspectDICT.txt')
## getSENTENCE('./neg_seged.txt')
## statSENTENCES('./neg_seged.txt')
- statPARSED('./neg_parsed.txt')
- reformPARSED('neg_parsed.txt','neg_parsed_format.txt')
+## statPARSED('./pos_parsed.txt')
+## reformPARSED('pos_parsed.txt','pos_parsed_format.txt')
+ #checkDICT('positive.txt','NTUSD_positive_simplified.txt')
+ #adjust('sentiment.txt','lexicon2.txt')
+ #splitLABEL("senti2.txt","senti3.txt")
+ diffTWOfile("sentiment2.txt","mySTRENGTH.txt")
pass
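The splitLABEL helper added above leans on the hanLAB pattern to pull '<hanzi> <strength>' pairs out of a labeled line. A quick demonstration on a made-up input (Python 2, matching the repo):

    # -*- coding:utf-8 -*-
    import re

    # Demonstration of the hanLAB pattern from check.py on a made-up line.
    hanLAB = re.compile(ur'[\u4e00-\u9fa5]+ -?[\d.]+')
    line = u'真 1.1 没法 -1 一般 0.5'
    for m in hanLAB.findall(line):
        print m.encode('utf8')
    # prints: 真 1.1 / 没法 -1 / 一般 0.5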
3  evaluate.py
@@ -27,8 +27,7 @@ def findSENTIdroppoint(sentence):
try:
return float(li[0])
except:
- return 0
- #print li
+ return 0
begin = float(li[0]);end = float(li[-1])
if abs(begin)>abs(end):
return begin
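For context, the function patched above implements the drop-point heuristic: a sentence's strength is taken from whichever end of its phrase-strength sequence has the larger magnitude. A toy rendering, based only on the lines visible in this hunk:

    # Toy rendering of the drop-point heuristic in findSENTIdroppoint.
    def droppoint(strengths):
        begin, end = strengths[0], strengths[-1]
        return begin if abs(begin) > abs(end) else end

    print(droppoint([2.0, 0.5, -3.0]))   # -3.0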
26 fitMERGE.py
@@ -0,0 +1,26 @@
+import math,time
+print "starts",time.asctime()
+
+
+fo3=open('./merge.txt')
+fw = open("./merge1.txt",'w')
+
+total = 0
+for line in fo3:
+ line=line.strip()
+ if line:
+ if line.find("+")!=-1:
+ print line
+ total+=1
+ else:
+ fw.write(line+"\n")
+
+
+fw.close()
+
+
+
+print "end.:",time.asctime()
+
+
+
95 graph.py
@@ -0,0 +1,95 @@
+import networkx as nx
+import matplotlib.pyplot as plt
+### create the graph
+
+DG=nx.DiGraph()
+fo = open('./ngd.txt')
+dict = {}
+for line in fo:
+ line=line.strip()
+ parts=line.split('---')
+ dict[parts[0]]=parts[1]
+print len(dict)
+###fo.seek(0)
+# seek(0) would move the file cursor back to the beginning
+
+## load sentiment lexicon
+######
+with open('./sentiment2.txt') as fo1:
+ lexicon = {}
+ for line in fo1:
+ line = line.strip()
+ if line:
+ kv= line.split()
+ if len(kv)==2:
+ lexicon[kv[0]]=kv[1]
+
+def loadSENTI(path):
+ fo = open(path)
+ sentiDICT = {}
+ for line in fo:
+ line =line.strip()
+ if line:
+ li= line.split()
+ if len(li)==2:
+ try:
+ sentiDICT[li[0]]= float(li[1])
+ except:
+ print "type error, not number",line
+ print "Length of sentiment lexion in %s is %s " %(fo.name,len(sentiDICT))
+ return sentiDICT
+
+
+'''
+ the construction of the graph is not quite right yet
+'''
+#**************** change the input file here!
+### phrase2.txt is the alternative input
+#with open('phrase2.txt') as fp:
+cnt1 =0
+### phrase12.txt is the merge of phrase.txt and phrase2.txt
+with open('phrase12.txt') as fp:
+ li = []
+ todo= set()
+ for line in fp:
+ line=line.strip()
+ if line:
+ if line!='----------':
+ li.append(line)
+ else:
+ ## iterate over adjacent phrases of a sentence
+ for i in range(len(li)-1):
+ if all(x==1 for x in (len(li[i].split()),len(li[i+1].split()))) and all(x!='SUM' and x!='-4' for x in(li[i],li[i+1])):
+ fir,sec=li[i],li[i+1]
+ if fir == sec:
+ continue
+ if dict.get(' '.join(li[i:i+2])):
+ ngd = float(dict.get(' '.join(li[i:i+2])))
+ elif dict.get(' '.join(li[i:i+2])) is None:
+ try:
+ ngd = float(dict.get(li[i+1]+' '+li[i]))
+ except:
+ #print li[i],' ',li[i+1],"miss it"
+ todo.add(' '.join(li[i:i+2]))
+ cnt1+=1
+ association= 1 - ngd
+ DG.add_edge(fir.decode('utf8'),sec.decode('utf8'),weight=association)
+ DG.add_edge(sec.decode('utf8'),fir.decode('utf8'),weight=association)
+ li = []
+
+
+ already = set()
+ for n in DG:
+ already.add(n.encode('utf8'))
+
+ for i in lexicon.keys():
+ if i not in already:
+ DG.add_node(i.decode('utf8'))
+print "oov length of NGD:",len(todo)
+
+print "cnt of graph edges is ",cnt1
+fw = open("todo.txt",'w')
+for i in todo:
+ fw.write(i+'\n')
+fw.close()
+nx.write_gpickle(DG,"paper.gpickle")
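Each adjacent pair becomes two directed edges whose weight is the association 1 - NGD, so a smaller distance yields a stronger link. A toy illustration with hypothetical node names and a hypothetical NGD value:

    import networkx as nx

    # Toy illustration of the edge construction in graph.py.
    DG = nx.DiGraph()
    ngd = 0.3                  # hypothetical distance from ngd.txt
    association = 1 - ngd
    DG.add_edge(u'word_a', u'word_b', weight=association)
    DG.add_edge(u'word_b', u'word_a', weight=association)
    print(DG[u'word_a'][u'word_b']['weight'])   # 0.7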
21 neg.txt
@@ -17,7 +17,6 @@
手印
厚重
害我
-突兀
忽悠
无法
@@ -352,7 +351,6 @@
繁难
魂不附体
张狂
-象征性
喝斥
画饼充饥
心灰意懒
@@ -400,8 +398,6 @@
柔肠百结
得意忘形
暴跳如雷
-大跌眼镜
-表示坏情感
不名一文
反对称
乱虐并生
@@ -951,7 +947,6 @@
效率很差
哩溜歪斜
平淡无味
-包涵
素不相识
二手
低贱
@@ -1918,7 +1913,6 @@
寒碜
不衫不履
衣衫不整
-依稀
平平淡淡
祸不单行
凄苦
@@ -2418,7 +2412,6 @@
将就
哑巴
-退房
犹犹豫豫
穷奢极侈
恍恍惚惚
@@ -3102,7 +3095,6 @@
弥天
一星
二星
-浓
七零八落
花里胡梢
讥刺
@@ -3168,7 +3160,6 @@
一下子爆发
一下子爆发的一连串
一巴掌
-一再
一再叮嘱
一拳
一般杀人罪
@@ -3179,11 +3170,9 @@
一掴
一蹶不振
人事不省
-人为
人为的
入迷
入迷的人
-刀刃
刁难
刻苦
力尽
@@ -3643,7 +3632,6 @@
不规则
不规则的
不规则的事物
-不许
不胜任
不胜任的
不喜欢
@@ -4742,7 +4730,6 @@
没有经验的
没有观察到
没有观察到的
-没到
没受
没受教育
没受教育的
@@ -4786,7 +4773,6 @@
赤字
赤贫
赤贫者
-赤裸裸
赤裸裸的
走失
走失的
@@ -8416,7 +8402,6 @@
离婚
离乡
离乡背井
-离开
离开正道
离开正道的
离间
@@ -8624,7 +8609,6 @@
权宜的
聋的
听不清
-听任
袭取
袭击
颤抖
@@ -9123,7 +9107,6 @@
劣根哭
劣根性
吐一口痰
-吐痰
吃不下
吃屎
吃饱太闲
@@ -9239,7 +9222,6 @@
老谋深算
肉麻
肉脚
-自以为
自以为了不起
自以为是
自以为清高
@@ -10605,18 +10587,15 @@ HOLD不住
叉腰肌
-毛边
花屏
退货
霸王
-套
滞后
傻比
国产机
-晃悠
空空
3  oov.txt
@@ -1,4 +1,3 @@
-如期
有够
K
-细小
+动手
BIN  paper.gpickle
Binary file not shown
22,571 phrase.txt
10,823 additions, 11,748 deletions not shown
27,579 phrase2.txt
13,450 additions, 14,129 deletions not shown
3,960 phraseINline.txt
1,980 additions, 1,980 deletions not shown
3,956 phraseINline2.txt
1,978 additions, 1,978 deletions not shown
10 pos.txt
@@ -504,7 +504,6 @@
过硬
清幽幽
醇浓
-照旧
波光粼粼
一级
淡泊
@@ -876,7 +875,6 @@
忠信
妍丽
无害
-风光
清口
高质量
一刀两断
@@ -927,7 +925,6 @@
一见钟情
止于至善
纯粹
-按时
人和
有头有尾
高性能
@@ -1095,7 +1092,6 @@
自不待言
阳刚
舒缓
-响应
念念不忘
四平八稳
@@ -3160,8 +3156,6 @@
价值千金
合口
威武
-蔫不出溜
-真真
明净
悲喜交集
魅人
@@ -3740,7 +3734,6 @@
光荣的
光辉
光辉的
-光泽
先见之明
全力
全力的
@@ -3755,7 +3748,6 @@
同情心
同意
同意给予
-同感
名声
名誉
合乎
@@ -4938,7 +4930,6 @@
无罪的
无过失
无过失的
-无疑
无疑的
无疑问
无疑问的
@@ -5219,7 +5210,6 @@
满意
满意地说
满意的
-满怀
满怀敬畏
满怀敬畏的
尽情
22,482 pos_phrase.txt
10,780 additions, 11,702 deletions not shown
15,767 pos_tagged.txt
7,729 additions, 8,038 deletions not shown
8,782 preprocess-neg.txt
4,336 additions, 4,446 deletions not shown
6,734 preprocess-pos.txt
3,320 additions, 3,414 deletions not shown
56 preprocess.py
@@ -11,19 +11,10 @@
quote2 = re.compile(ur'\u201c.+?\u201d')
period = re.compile(ur'\u3002{2,}')
han = re.compile(ur'[\u4e00-\u9fa5]+')
-tag = re.compile( '#\w{1,3}')
-rmword = re.compile( '\w{1,3}')
+tag = re.compile('#\w{1,3}')
+rmword = re.compile('\w{1,3}')
-def rmBLANK(path,writeTO):
- fo = open(path)
- fw = open(writeTO,'w')
- for line in fo:
- line = line.strip()
- if line:
- fw.write(line+'\n')
- fw.close()
-
-def readPHRASE(path):
+def extractADV(path):
fo = open(path)
fw=open('./maybeADV.txt','w')
li=[]
@@ -85,9 +76,8 @@ def preprocess(path):
line = applyPAT(period,line,1,' 无语 ')
## remove intensional verb and something unsure
- lineCOPY = line
+ lineCOPY = line
lineCOPY = lineCOPY.replace('','\n').replace(',','\n').replace('','\n')
- ## sentence split issue
clauses = lineCOPY.split('\n')
for i in clauses:
for j in ivLIST:
@@ -112,12 +102,12 @@ def segANDpos(input):
subprocess.call("./tagger.sh "+arg1, shell=True)
print "pos tagger finished."
-def parseLINE(line):
+'''def parseLINE(line):
p = re.compile( '#\w{1,3}')
fw=open('/home/googcheng/parser/line.txt','w')
fw.write(p.sub('',line))
fw.close()
- subprocess.call("./parse.sh", shell=True)
+ subprocess.call("./parse.sh", shell=True)'''
def sentiment():
dict_list=[]
@@ -126,14 +116,21 @@ def sentiment():
fo2 = open('./pos.txt')
for line in fo1:
line=line.strip()
- if line not in exclude:
+ if line:
dict_list.append(line)
for line in fo2:
line=line.strip()
- if line not in exclude:
+ if line:
dict_list.append(line)
- print "there is %s words in sentiment dictionary" % len({}.fromkeys(dict_list,1))
+ print "prior length:",len(dict_list)
+ for i in exclude:
+ try:
+ dict_list.remove(i)
+ except:
+ pass
+ print "there is %s words in pos&&neg dictionary" % len({}.fromkeys(dict_list,1))
+ print 'test.point:',{}.fromkeys(dict_list,1).get('K')
return {}.fromkeys(dict_list,1)
def getLABEL(element):
@@ -159,15 +156,15 @@ def doNO(ylist,string,i,phraseLIST,dict):
if len(pair)==2:
pair.remove(key)
if not dict.get(pair[0]):
- phraseLIST.append('-没有');lb=i
+ #phraseLIST.append('-没有');lb=i
+ phraseLIST.append('没有');lb=i
else:
## not adjacent
- #print ele
return ele.split(',')[1][:-1]
def findPHRASE(taggedFILE,parsedFILE,phraseFILE):
dict = sentiment()
- advSET = file2set('./sentiADV.txt') ##read sentiment words which act as advs
+ #advSET = file2set('./sentiADV.txt') ##read sentiment words which act as advs
nnSET = file2set('./sentiNN.txt');vvSET=file2set('./sentiVV.txt')
adSET = file2set('./sentiAD.txt')
sumLIST = file2list('./summary.txt')
@@ -295,15 +292,17 @@ def findPHRASE(taggedFILE,parsedFILE,phraseFILE):
if aspect.get(' '.join(pair)):
phraseLIST.append(aspect.get(' '.join(pair)));lb=i
else:
- print "aspect miss:",' '.join(pair)
+ ### default
+ phraseLIST.append(list[i])
if label=="CD":
phraseLIST.append(list[i]);lb=i
else:
if label=='VV':
+ ### TODO
try:
if ''.join(list[i-3:i])=='不#AD会#VV再#AD':
- phraseLIST.append('-4');lb=i
+ phraseLIST.append('-4');lb=i ### add a const
except:
pass
if seger=='':
@@ -311,7 +310,7 @@ def findPHRASE(taggedFILE,parsedFILE,phraseFILE):
if ele:
ele2 = ele.split(',')[0][4:]
farSENTI2.append(ele2)
- ## must factored,,wait to effect
+ ## must be factored
for p in phraseLIST:
if tag.sub('',p) in farSENTI:
fw.write('shift '+p.split('-')[0]+'\n')
@@ -340,7 +339,9 @@ def filterPHRASE(phraseFILE,filteredFILE):
else:
li= line.split('#')
if len(li)==1:
- print "smth error in filter"
+ #print "smth error in filter"
+ #print ' '.join(li)
+ fw.write(li[0]+'\n')
elif len(li) ==2:
fw.write(li[0]+'\n')
@@ -397,7 +398,8 @@ def filterPHRASE(phraseFILE,filteredFILE):
if __name__ == '__main__':
#preprocess("preprocess-neg.txt")
- findPHRASE('neg_tagged.txt','neg_parsed_format.txt','neg_phrase.txt')
+ extractADV('phrase2.txt')
+ #findPHRASE('neg_tagged.txt','neg_parsed_format.txt','neg_phrase.txt')
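A side note: sentiment() above deduplicates its word list with the old {}.fromkeys(list, 1) idiom, where the resulting dict collapses duplicates, its length gives the vocabulary size, and .get() doubles as the membership test used for the 'K' check. A toy demonstration:

    # The dedup idiom used in sentiment() (toy data).
    words = ['good', 'bad', 'good']
    vocab = {}.fromkeys(words, 1)
    print(len(vocab))        # 2: duplicates collapsed
    print(vocab.get('bad'))  # 1: membership test, as with 'K' in the script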
4 tagger.sh
@@ -1,6 +1,6 @@
-mv $1_seged.txt /home/googcheng/pos
+mv $1_seged.txt ~/pos
cd ~/pos
./stanford-postagger.sh models/chinese-nodistsim.tagger $1_seged.txt > $1_tagged.txt
-mv $1_tagged.txt ~/py/paper2/
+cp $1_tagged.txt ~/py/paper2/
101 themain.py
@@ -1,3 +1,4 @@
+# -*- coding:utf-8 -*-
from preprocess import *
from evaluate import *
from check import *
@@ -27,15 +28,16 @@ def loadLEXICON(path):
try:
lexicon[' '.join(li[0:-1])]= float(li[-1])
except:
- print "type error, not number",line
+ print "type error, not number",line
print "Length of sentiment lexion in %s is %s " %(fo.name,len(lexicon))
return lexicon
oov= set()
def calPHRASEstrength(nonLINEAR,phrase,advDICT):
+ ### should return None if OOV
if not phrase:
- return 0
+ return 0 ################# FIXME: should this return None?
li = phrase.split()
if len(li) ==1:
strength= sentiDICT.get(li[0])
@@ -48,12 +50,13 @@ def calPHRASEstrength(nonLINEAR,phrase,advDICT):
if strength is None:
oov.add(li[1])
strength = 0
-## if li[0] in ['不','shift','没','没有']: ## shift
-## strength = strength - 4 if strength>0 else strength + 4
- if advDICT.get(li[0]):
- strength*= advDICT.get(li[0])
+ if li[0] == 'shift' and strength:
+ strength = strength - 4 if strength>0 else strength + 4
elif li[0]=="不太" and strength:
strength = strength - 5 if strength>0 else strength + 5
+ elif advDICT.get(li[0]):
+ strength*= advDICT.get(li[0])
+
elif len(li)==3:
strength= sentiDICT.get(li[2])
@@ -81,9 +84,10 @@ def calPHRASEstrength(nonLINEAR,phrase,advDICT):
for i in range(length-2,-1,-1):
if advDICT.get(li[i]):
strength*=advDICT.get(li[i])
-## if strength < 0:
-## strength = strength*1.5
-## if mine,comment two lines above
+ if strength < 0:
+ strength = strength*1.5
+
+## if using droppoint, comment out the two lines above
return strength
@@ -120,20 +124,34 @@ def calALL(nonLINEAR,advDICTfilePATH,inputPATH,outputPATH):
list=[]
fw.close()
-
def statistics(phraseNUMBERseqs):
errorLIST = []
dict ={1:0,0:0,-1:0}
with open(phraseNUMBERseqs) as myFILE:
for num, line in enumerate(myFILE, 1):
line=line.strip()
- strength = findSENTIdroppoint(line)
- #strength = commonSENTI(line)
- if strength > 0:
- errorLIST.append(num)
+ #strength = findSENTIdroppoint(line)
+ strength = commonSENTI(line)
+## if strength1 * strength2 > 0:
+## strength = strength2
+## elif strength1 == 0:
+## strength = strength2
+## elif strength2 == 0:
+## strength = strength1
+## else:
+##
+## if strength1 >0 and strength2 < 0: #######
+## strength = strength1
+## elif strength1 < 0 and strength2 > 0:
+## strength = strength2
+## else:
+## strength = (strength1+strength2)/2
+ #####strength = strength2
+## if strength > 0:
+## errorLIST.append(num)
dict[calORIENTATION(strength)]+=1
print dict
- print "the correct percentage is %s" %(dict[-1]/2000.0)
+ print "the correct percentage is %s" %(dict[1]/2000.0)
return errorLIST
if __name__ == '__main__':
@@ -141,22 +159,54 @@ def statistics(phraseNUMBERseqs):
print '''
**notice: revisit preprocess line 163 if the segmenter is changed!
'''
- taggedFILE='./neg_tagged.txt'
- phraseFILE='./neg_phrase.txt'
- finalPHRASE='./phrase2.txt'
- phraseNUMBERseqs='./phraseINline2.txt'
+## taggedFILE='./neg_tagged.txt'
+## phraseFILE='./neg_phrase.txt'
+## parsedFILE='./neg_parsed_format.txt'
+## finalPHRASE='./phrase2.txt'
+## phraseNUMBERseqs='./phraseINline2.txt'
+##
+## preprocess("preprocess-neg.txt")
+## segANDpos("preprocess-neg.txt")
+## reformPARSED('neg_parsed.txt',parsedFILE)
-## taggedFILE='./pos_tagged.txt'
-## phraseFILE='./pos_phrase.txt'
-## finalPHRASE='./phrase.txt'
-## phraseNUMBERseqs='./phraseINline.txt'
+ taggedFILE='./pos_tagged.txt'
+ phraseFILE='./pos_phrase.txt'
+ parsedFILE='./pos_parsed_format.txt'
+ finalPHRASE='./phrase.txt'
+ phraseNUMBERseqs='./phraseINline.txt'
+
+## preprocess("preprocess-pos.txt")
+## segANDpos("preprocess-pos.txt")
+## reformPARSED('pos_parsed.txt',parsedFILE)
+
+
+ ''' '''
+ ### notebook block
+## taggedFILE='./neg_tagged.txt'
+## phraseFILE='./neg_phrasenb.txt'
+## parsedFILE='./neg_parsed_formatnb.txt'
+## finalPHRASE='./phrase2nb.txt'
+## phraseNUMBERseqs='./phraseINline2nb.txt'
+
## preprocess("preprocess-neg.txt")
## segANDpos("preprocess-neg.txt")
+## reformPARSED('neg_parsednb.txt',parsedFILE)
+
+## taggedFILE='./pos_tagged.txt'
+## phraseFILE='./pos_phrasenb.txt'
+## parsedFILE='./pos_parsed_formatnb.txt'
+## finalPHRASE='./phrasenb.txt'
+## phraseNUMBERseqs='./phraseINlinenb.txt'
+
+## preprocess("preprocess-pos.txt")
+## segANDpos("preprocess-pos.txt")
+## reformPARSED('pos_parsednb.txt',parsedFILE)
sentiDICT = {}
- loadSENTI('./sentiment.txt')
- findPHRASE('neg_tagged.txt','neg_parsed_format.txt','neg_phrase.txt')
+ #loadSENTI('./sentiment2.txt')
+ loadSENTI('./mySTRENGTH.txt') ### sync two files
+ findPHRASE(taggedFILE,parsedFILE,phraseFILE)
filterPHRASE(phraseFILE,finalPHRASE)
nonLINEAR = loadLEXICON('./nonlinear.txt')
calALL(nonLINEAR,'advxxx.txt',finalPHRASE,phraseNUMBERseqs)
@@ -164,7 +214,6 @@ def statistics(phraseNUMBERseqs):
#writeERROR('preprocess-neg.txt',errorLIST)
recordOOV(oov)
print 'finished',time.asctime()
-