Skip to content

Commit

Permalink
updated regression to reflect what is in the book
Browse files Browse the repository at this point in the history
  • Loading branch information
pbharrin committed Dec 23, 2011
1 parent 522d3a6 commit 53764f2
Showing 1 changed file with 65 additions and 42 deletions.
107 changes: 65 additions & 42 deletions Ch08/regression.py
Expand Up @@ -96,10 +96,10 @@ def stageWise(xArr,yArr,eps=0.01,numIt=100):
yMat = yMat - yMean #can also regularize ys but will get smaller coef
xMat = regularize(xMat)
m,n=shape(xMat)
returnMat = zeros((numIt,n)) #testing code remove
#returnMat = zeros((numIt,n)) #testing code remove
ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
for i in range(numIt):#could change this to while loop
#print ws.T
for i in range(numIt):
print ws.T
lowestError = inf;
for j in range(n):
for sign in [-1,1]:
Expand All @@ -111,46 +111,69 @@ def stageWise(xArr,yArr,eps=0.01,numIt=100):
lowestError = rssE
wsMax = wsTest
ws = wsMax.copy()
returnMat[i,:]=ws.T
return returnMat
#returnMat[i,:]=ws.T
#return returnMat

def scrapePage(inFile, outFile, yr, numPce, origPrc):
    """Parse a saved HTML results page and append one line per sold item.

    The page is searched for <table> elements carrying an attribute
    r="1", r="2", ... in sequence; scanning stops at the first index
    for which no table is found.

    For every row whose fourth <td> contains at least one <span>, a
    tab-separated line "yr, numPce, newFlag, origPrc, price" is appended
    to outFile and a summary is printed.  Rows without such a <span> are
    reported as unsold and not written.

    inFile  -- path of the HTML file to read
    outFile -- path of the text file to append to (opened in 'a' mode)
    yr, numPce, origPrc -- values copied verbatim into every output line
    """
    from BeautifulSoup import BeautifulSoup

    # Read the whole page up front so the input handle is released
    # before any parsing or writing happens.
    with open(inFile) as fr:
        soup = BeautifulSoup(fr.read())

    # 'with' guarantees the output file is closed even if a malformed
    # row raises part-way through the scan.
    with open(outFile, 'a') as fw:
        i = 1
        currentRow = soup.findAll('table', r="%d" % i)
        while currentRow:
            row = currentRow[0]
            title = row.findAll('a')[1].text
            lwrTitle = title.lower()
            # the title mentioning 'new' or 'nisb' marks the item as new
            if ('new' in lwrTitle) or ('nisb' in lwrTitle):
                newFlag = 1.0
            else:
                newFlag = 0.0
            cells = row.findAll('td')
            soldUnicde = cells[3].findAll('span')
            if len(soldUnicde) == 0:
                print("item #%d did not sell" % i)
            else:
                soldPrice = cells[4]
                priceStr = soldPrice.text
                priceStr = priceStr.replace('$', '')  # strips out $
                priceStr = priceStr.replace(',', '')  # strips out ,
                if len(soldPrice) > 1:
                    # strips out Free Shipping
                    priceStr = priceStr.replace('Free shipping', '')
                print("%s\t%d\t%s" % (priceStr, newFlag, title))
                fw.write("%d\t%d\t%d\t%f\t%s\n"
                         % (yr, numPce, newFlag, origPrc, priceStr))
            i += 1
            currentRow = soup.findAll('table', r="%d" % i)
#def scrapePage(inFile,outFile,yr,numPce,origPrc):
# from BeautifulSoup import BeautifulSoup
# fr = open(inFile); fw=open(outFile,'a') #a is append mode writing
# soup = BeautifulSoup(fr.read())
# i=1
# currentRow = soup.findAll('table', r="%d" % i)
# while(len(currentRow)!=0):
# title = currentRow[0].findAll('a')[1].text
# lwrTitle = title.lower()
# if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
# newFlag = 1.0
# else:
# newFlag = 0.0
# soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
# if len(soldUnicde)==0:
# print "item #%d did not sell" % i
# else:
# soldPrice = currentRow[0].findAll('td')[4]
# priceStr = soldPrice.text
# priceStr = priceStr.replace('$','') #strips out $
# priceStr = priceStr.replace(',','') #strips out ,
# if len(soldPrice)>1:
# priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping
# print "%s\t%d\t%s" % (priceStr,newFlag,title)
# fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
# i += 1
# currentRow = soup.findAll('table', r="%d" % i)
# fw.close()

def setDataCollect():
    """Run scrapePage on each saved set page, appending results to out.txt."""
    # Each entry supplies scrapePage's (inFile, yr, numPce, origPrc)
    # arguments; the output file is the same for all of them.
    savedPages = (
        ('setHtml/lego8288.html', 2006, 800, 49.99),
        ('setHtml/lego10030.html', 2002, 3096, 269.99),
        ('setHtml/lego10179.html', 2007, 5195, 499.99),
        ('setHtml/lego10181.html', 2007, 3428, 199.99),
        ('setHtml/lego10189.html', 2008, 5922, 299.99),
        ('setHtml/lego10196.html', 2009, 3263, 249.99),
    )
    for htmlPath, year, pieces, listPrice in savedPages:
        scrapePage(htmlPath, 'out.txt', year, pieces, listPrice)
from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the Google Shopping search API for one set and collect prices.

    Sleeps 10 seconds, requests "lego <setNum>" from the API, and walks
    every inventory entry of every returned item.  For each entry whose
    price exceeds half of origPrc, a row is printed and

        [yr, numPce, newFlag, origPrc]  is appended to retX
        the selling price               is appended to retY

    where newFlag is 1 if the item's product condition is 'new', else 0.
    Both lists are modified in place; nothing is returned.

    An item that cannot be processed (for example because an expected
    key is missing) is reported and skipped; rows already appended for
    that item are kept.
    """
    sleep(10)  # fixed pause before every request
    # NOTE(review): the API key is hard-coded in source; consider loading
    # it from configuration or the environment instead.
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    try:
        retDict = json.loads(pg.read())
    finally:
        pg.close()  # release the connection even if the payload is bad
    # A response without an 'items' key is treated as "no results"
    # rather than raising KeyError.
    for i, currItem in enumerate(retDict.get('items', [])):
        try:
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else:
                newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                # ignore listings priced at or below half the original price
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f"
                          % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except Exception:
            # Best-effort per item, but unlike a bare 'except:' this lets
            # KeyboardInterrupt and SystemExit propagate.
            print('problem with item %d' % i)

def setDataCollect(retX, retY):
    """Call searchForSet once per set, accumulating into retX and retY."""
    # Each entry supplies searchForSet's (setNum, yr, numPce, origPrc)
    # arguments; retX and retY are passed through unchanged.
    setTable = (
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    )
    for setId, year, pieces, listPrice in setTable:
        searchForSet(retX, retY, setId, year, pieces, listPrice)

def crossValidation(xArr,yArr,numVal=10):
m = len(yArr)
Expand Down Expand Up @@ -186,4 +209,4 @@ def crossValidation(xArr,yArr,numVal=10):
meanX = mean(xMat,0); varX = var(xMat,0)
unReg = bestWeights/varX
print "the best model from Ridge Regression is:\n",unReg
print "with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat)
print "with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat)

0 comments on commit 53764f2

Please sign in to comment.