In [1]:
import feedparser
import re

feedlist=['http://today.reuters.com/rss/topNews',
          'http://today.reuters.com/rss/domesticNews',
          'http://today.reuters.com/rss/worldNews',
          'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
          'http://news.google.com/?output=rss',
          'http://feeds.salon.com/salon/news',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
          'http://rss.cnn.com/rss/edition.rss',
          'http://rss.cnn.com/rss/edition_world.rss',
          'http://rss.cnn.com/rss/edition_us.rss']

In [2]:
def stripHTML(h):
    p=''
    s=0
    
    for c in h:
        if c=='<':
            s=1
        elif c == '>':
            s=0
            p+=' '
        elif s==0:
            p+=c
    
    return p

In [3]:
def separatewords(text):
    splitter = re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if len(s)>3]

In [4]:
def getarticlewords():
    allwords = {}
    articlewords = []
    articletitles = []
    ec = 0
    
    for feed in feedlist:
        f = feedparser.parse(feed)
        
        for e in f.entries:
            if e.title in articletitles:
                continue
            
            txt = e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words = separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)
            
            for word in words:
                allwords.setdefault(word , 0)
                allwords[word] += 1
                articlewords[ec].setdefault(word , 0)
                articlewords[ec][word] += 1
            ec += 1
            
    return allwords  , articlewords , articletitles

In [5]:
#构造成矩阵
def makematrix(allw , articlew):
    wordvec = []
    
    for w , c in allw.items():
        if c>3 and c<len(articlew)*0.6:
            wordvec.append(w)
            
    l1 = [[(word in f and f[word] or 0) for word in wordvec] 
         for f in articlew]
    
    return l1 , wordvec

In [6]:
allw , artw , artt = getarticlewords()

In [9]:
wordmatrix , wordvec = makematrix(allw , artw)

In [12]:
artt

[u"North Korea warns of 'abyss of doom' if 'old lunatic' Trump remains president",
 u'Brazile says CNN\u20ac&trade;s Tapper betrayed her after debate flap',
 u'Nasty notes that spurred Air Force general\u20ac&trade;s lecture written by &acirc;\u20ac&tilde;victim&acirc;\u20ac&trade;',
 u'Woman killed \u20ac&tilde;sugar daddy&acirc;\u20ac&trade; when relationship soured: prosecutors',
 u'Exclusive: Undercover teams record grisly remote Faroe Islands whale slaughter',
 u'Wrestler had 15 drinks a day, slept with 10,000 women, documentary reveals',
 u'What critter has been found hidden in Van Gogh painting?',
 u"Germany must legally recognize 'third gender' from birth, top court rules",
 u'Lightning strike instantly kills two surfers',
 u"'Super mom' Sherri Papini filmed running to safety the morning she was found",
 u'Ex-news anchor whose reporter girlfriend was killed on live TV wins election',
 u"Tom Fitton: Mueller probe is 'fruit of the poison tree,' should be shut down",
 u'Pence resp

In [13]:
wordmatrix

[[0],
 [1],
 [2],
 [1],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0]]

In [6]:
def wordmatrixfeatures(x):
    return [wordvec[w] for w in range(len(x)) if x[w]>0]

In [7]:
from numpy import *

In [8]:
def difcost(a , b):
    dif = 0
    
    for i in range(shape(a)[0]):
        for j in range(shape(a)[1]):
            dif += pow(a[i,j]-b[i,j] , 2)
    
    return dif

In [9]:
def factorize(v , pc=10 , iter_ = 50):
    ic = shape(v)[0]
    fc = shape(v)[1]
    
    w = matrix([[random.random() for j in range(pc)] for i in range(ic)])
    h = matrix([[random.random() for i in range(fc)] for i in range(pc)])
    
    for i in range(iter_):
        wh = w*h
        
        cost = difcost(v , wh)
        
        if i%10 == 0:
            print(cost)
        
        if cost == 0:
            break
        
        hn = (transpose(w)*v)
        hd = (transpose(w)*w*h)
        
        h = matrix(array(h)*array(hn)/array(hd))
        
        wn = (v*transpose(h))
        wd = (w*h*transpose(h))
        
        w = matrix(array(w)*array(wn)/array(wd))
    
    return w , h

In [20]:
l1 = [[1,2,3],[4,5,6]]
m1 = matrix(l1)

m2=matrix([[1,2],[3,4],[5,6]])


In [21]:
w , h = factorize(m1*m2 , pc = 3 , iter_=100)

7516.381880715774
0.2496424756487585
0.1452219990324857
0.08562946465731655
0.050948208734387705
0.03049986879497617
0.018336390474338653
0.011056885032405381
0.006681680512550269
0.004044058972053215


In [27]:
w

matrix([[0.1986195 , 0.04891173, 0.58601606],
        [0.52625237, 0.28558996, 1.07536792]])

In [28]:
h

matrix([[10.21180522,  7.67731926],
        [35.21206604, 58.41777306],
        [31.20221152, 40.2545605 ]])

In [22]:
w*h

matrix([[22.03554401, 27.97199898],
        [48.98405673, 64.01220021]])

In [24]:
m1*m2

matrix([[22, 28],
        [49, 64]])

In [25]:
v = matrix(wordmatrix)

In [26]:
weights , feat = factorize(v , pc = 20 , iter_=50)

699.6976441388856
nan
nan
nan
nan


  


In [10]:
def showfeatures(w , h , titles,  wordvec , out='features.txt'):
    outfile = open(out , 'w')
    pc , wc = shape(h)
    toppatterns = [[] for i in range(len(titles))]
    patternnames = []
    
    for i in range(pc):
        slist = []
        for j in range(wc):
            slist.append((h[i,j] , wordvec[j]))
        
        slist.sort()
        slist.reverse()
        
        n=[s[1] for s in slist[0:6]]
        outfile.write(str(n)+'\n')
        patternnames.append(n)
        
        flist = []
        
        for j in range(len(titles)):
            flist.append((w[j,i] , titles[j]))
            toppatterns[j].append((w[j,i] , i , titles[j]))
        
        flist.sort()
        flist.reverse()
    
        for f in flist[0:3]:
            outfile.write(str(f) + '\n')
        
        outfile.write('\n')
    
    outfile.close()
    
    return toppatterns , patternnames

In [30]:
topp , pn = showfeatures(weights , feat , artt , wordvec)

In [11]:
def showarticles(titles , toppatterns , patternnames , out='articles.txt'):
    outfile = open(out , 'w')
    
    for j in range(len(titles)):
        outfile.write(titles[j].encode('utf8')+'\n')
        
        toppatterns[j].sort()
        toppatterns[j].reverse()
        
        for i  in range(3):
            outfile.write(str(toppatterns[j][i][0])+' '+
                         str(patternnames[toppatterns[j][i][1]])+'\n')
            
        outfile.write('\n')
    
    outfile.close()

In [33]:
showarticles(artt , topp , pn)

In [13]:
import urllib2


In [14]:
tickers = ['YHOO','AVP','BIIB','BP','CL','CVX',
           'DNA','EXPE','GOOG','PG','XOM','AMGN']

shortest = 300
prices = {}
datas = None

In [15]:
for t in tickers:
    rows = urllib2.urlopen('http://ichart.finance.yahoo.com/table.csv?s=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996&ignore=.csv'%t).readlines()
    prices[t] = [float(r.split(',')[5]) for r in rows[1:] if r.strip() != '']
    
    if len(prices[t])<shortest:
        shortest = len(prices[t])
    
    if not datas:
        dates = [r.split(',')[0] for r in rows[1:] if r.strip() != '']

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [None]:
l1 = [[prices[tickers[i]][j] for i in range(len(tickers))] 
     for j in range(shortest)]

In [None]:
w , h = factorize(matrix(l1) , pc=5)