py3: merge pattern3 fork (thanks @hayd)
Tom De Smedt committed May 27, 2015
1 parent 519b7ba commit a34c683
Showing 33 changed files with 454 additions and 433 deletions.
8 changes: 4 additions & 4 deletions examples/01-web/01-google.py
@@ -36,7 +36,7 @@
 # Google is very fast but you can only get up to 100 (10x10) results per query.
 for i in range(1, 2):
     for result in engine.search(q, start=i, count=10, type=SEARCH, cached=True):
-        print plaintext(result.text) # plaintext() removes all HTML formatting.
-        print result.url
-        print result.date
-        print
+        print(plaintext(result.text)) # plaintext() removes all HTML formatting.
+        print(result.url)
+        print(result.date)
+        print("")
14 changes: 7 additions & 7 deletions examples/01-web/02-google-translate.py
@@ -7,16 +7,16 @@

 # This example demonstrates the Google Translate API.
 # It will only work with a license key, since it is a paid service.
-# In the Google API console (https://code.google.com/apis/console/),
+# In the Google API console (https://code.google.com/apis/console/),
 # activate Translate API.

-g = Google(license=None) # Enter your license key.
+g = Google(license=None) # Enter your license key.
 q = "Your mother was a hamster and your father smelled of elderberries!" # en
 # "Ihre Mutter war ein Hamster und euer Vater roch nach Holunderbeeren!" # de
-print q
-print plaintext(g.translate(q, input="en", output="de")) # fr, de, nl, es, cs, ja, ...
-print
+print(q)
+print(plaintext(g.translate(q, input="en", output="de"))) # es, fr, sv, ja, ...
+print("")

 q = "C'est un lapin, lapin de bois, un cadeau."
-print q
-print g.identify(q) # (language, confidence)
+print(q)
+print(g.identify(q)) # (language, confidence)
18 changes: 9 additions & 9 deletions examples/01-web/03-bing.py
@@ -12,7 +12,7 @@
 # The pattern.web module uses a test account by default,
 # with 5000 free queries per month shared by all Pattern users.
 # If this limit is exceeded, SearchEngineLimitError is raised.
-# You should obtain your own license key at:
+# You should obtain your own license key at:
 # https://datamarket.azure.com/account/
 engine = Bing(license=None, language="en")

@@ -22,7 +22,7 @@
 # When you execute a query,
 # the script will halt until all results are downloaded.
 # In apps with an infinite main loop (e.g., GUI, game),
-# it is often more useful if the app keeps on running
+# it is often more useful if the app keeps on running
 # while the search is executed in the background.
 # This can be achieved with the asynchronous() function.
 # It takes any function and that function's arguments and keyword arguments:
@@ -32,19 +32,19 @@
 # In real-life you would have an app.update() or similar
 # in which you can check request.done every now and then.
 while not request.done:
-    time.sleep(0.01)
-    print ".",
+    time.sleep(0.1)
+    print(".")

-print
-print
+print("")
+print("")

 # An error occurred in engine.search(), raise it.
 if request.error:
     raise request.error

 # Retrieve the list of search results.
 for result in request.value:
-    print result.text
-    print result.url
-    print
+    print(result.text)
+    print(result.url)
+    print("")

18 changes: 9 additions & 9 deletions examples/01-web/04-twitter.py
@@ -26,14 +26,14 @@
 # because a query is instant when it is executed the second time.
 prev = None
 for i in range(2):
-    print i
+    print(i)
     for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
-        print
-        print tweet.text
-        print tweet.author
-        print tweet.date
-        print hashtags(tweet.text) # Keywords in tweets start with a "#".
-        print
+        print("")
+        print(tweet.text)
+        print(tweet.author)
+        print(tweet.date)
+        print(hashtags(tweet.text)) # Keywords in tweets start with a "#".
+        print("")
         # Only add the tweet to the table if it doesn't already exist.
         if len(table) == 0 or tweet.id not in index:
             table.append([tweet.id, tweet.text])
@@ -44,8 +44,8 @@
 # Create a .csv in pattern/examples/01-web/
 table.save(pd("cool.csv"))

-print "Total results:", len(table)
-print
+print("Total results: %s" % len(table))
+print("")

 # Print all the rows in the table.
 # Since it is stored as a CSV-file it grows comfortably each time the script runs.
8 changes: 4 additions & 4 deletions examples/01-web/05-twitter-stream.py
@@ -14,15 +14,15 @@
 stream = Twitter().stream("I hate", timeout=30)

 #while True:
-for i in range(100):
-    print i
+for i in range(10):
+    print(i)
     # Poll Twitter to see if there are new tweets.
     stream.update()
     # The stream is a list of buffered tweets so far,
     # with the latest tweet at the end of the list.
     for tweet in reversed(stream):
-        print tweet.text
-        print tweet.language
+        print(tweet.text)
+        print(tweet.language)
     # Clear the buffer every so often.
     stream.clear()
     # Wait awhile between polls.
12 changes: 6 additions & 6 deletions examples/01-web/06-feed.py
@@ -14,19 +14,19 @@
 engine = Newsfeed()

 for result in engine.search(CNN, cached=True):
-    print result.title.upper()
-    print plaintext(result.text) # Remove HTML formatting.
-    print result.url
-    print result.date
-    print
+    print(result.title.upper())
+    print(plaintext(result.text)) # Remove HTML formatting.
+    print(result.url)
+    print(result.date)
+    print("")

 # News item URL's lead to the page with the full article.
 # This page can have any kind of formatting.
 # There is no default way to read it.
 # But we could just download the source HTML and convert it to plain text:

 #html = URL(result.url).download()
-#print plaintext(html)
+#print(plaintext(html))

 # The resulting text may contain a lot of garbage.
 # A better way is to use a DOM parser to select the HTML elements we want.
24 changes: 12 additions & 12 deletions examples/01-web/07-wikipedia.py
@@ -14,22 +14,22 @@
 # instead of a list of results.
 article = engine.search("alice in wonderland", cached=True, timeout=30)

-print article.title # Article title (may differ from the search query).
-print
-print article.languages["fr"] # Article in French, can be retrieved with Wikipedia(language="fr").
-print article.links[:10], "..." # List of linked Wikipedia articles.
-print article.external[:5], "..." # List of external URL's.
-print
+print(article.title) # Article title (may differ from the search query).
+print("")
+print(article.languages["fr"]) # Article in French, can be retrieved with Wikipedia(language="fr").
+print(article.links[:10]) # List of linked Wikipedia articles.
+print(article.external[:5]) # List of external URL's.
+print("")

-#print article.source # The full article content as HTML.
-#print article.string # The full article content, plain text with HTML tags stripped.
+#print(article.source) # The full article content as HTML.
+#print(article.string) # The full article content, plain text with HTML tags stripped.

 # An article is made up of different sections with a title.
 # WikipediaArticle.sections is a list of WikipediaSection objects.
 # Each section has a title + content and can have a linked parent section or child sections.
 for s in article.sections:
-    print s.title.upper()
-    print
-    print s.content # = ArticleSection.string, minus the title.
-    print
+    print(s.title.upper())
+    print("")
+    print(s.content) # = ArticleSection.string, minus the title.
+    print("")

14 changes: 7 additions & 7 deletions examples/01-web/08-wiktionary.py
@@ -8,7 +8,7 @@
 # The classifier is small (80KB) and fast.

 w = Wiktionary(language="en")
-f = csv() # csv() is a short alias for Datasheet().
+f = csv() # csv() is a short alias for Datasheet().

 # Collect male and female given names from Wiktionary.
 # Store the data as (name, gender)-rows in a CSV-file.
@@ -22,7 +22,7 @@
             if not name.startswith("Appendix:"):
                 f.append((name, gender[0]))
         f.save(pd("given-names.csv"))
-        print ch, gender
+        print(ch, gender)

 # Create a classifier that predicts gender based on name.

@@ -42,15 +42,15 @@ def vector(self, name):
         """
         v = chngrams(name, n=2)
         v = count(v)
-        v[name[-2:]+"$"] = 1
+        v[name[-2:] + "$"] = 1
         v[len(name)] = 1
         return v

 data = csv(pd("given-names.csv"))

 # Test average (accuracy, precision, recall, F-score, standard deviation).

-print kfoldcv(GenderByName, data, folds=3) # (0.81, 0.79, 0.77, 0.78, 0.00)
+print(kfoldcv(GenderByName, data, folds=3)) # (0.81, 0.79, 0.77, 0.78, 0.00)

 # Train and save the classifier in the current folder.
 # With final=True, discards the original training data (= smaller file).
@@ -76,13 +76,13 @@ def vector(self, name):
         "Leia",
         "Flash",
         "Barbarella"):
-    print name, g.classify(name)
+    print(name, g.classify(name))

 # In the example above, Arwen and Jabba are misclassified.
 # We can of course improve the classifier by hand:

 #g.train("Arwen", gender="f")
 #g.train("Jabba", gender="m")
 #g.save(pd("gender-by-name.svm"), final=True)
-#print g.classify("Arwen")
-#print g.classify("Jabba")
+#print(g.classify("Arwen"))
+#print(g.classify("Jabba"))
16 changes: 8 additions & 8 deletions examples/01-web/09-wikia.py
@@ -8,7 +8,7 @@
 # Wikipedia is based on MediaWiki too.
 # Wikia queries request the article HTML source from the server. This can be slow.

-domain = "monkeyisland" # "Look behind you, a three-headed monkey!"
+domain = "monkeyisland" # "Look behind you, a three-headed monkey!"

 # Alternatively, you can call this script from the commandline
 # and specify another domain: python 09-wikia.py "Bieberpedia".
@@ -18,7 +18,7 @@
 w = Wikia(domain, language="en")

 # Like Wikipedia, we can search for articles by title with Wikia.search():
-print w.search("Three Headed Monkey")
+print(w.search("Three Headed Monkey"))

 # However, we may not know exactly what kind of articles exist,
 # "three-headed monkey" for example does not redirect to the above article.
@@ -29,9 +29,9 @@
 # Retrieving the full article for each article takes another query. This can be slow.
 i = 0
 for article in w.articles(count=2, cached=True):
-    print
-    print article.title
-    #print article.plaintext()
+    print("")
+    print(article.title)
+    #print(article.plaintext())
     i += 1
     if i >= 3:
         break
@@ -40,10 +40,10 @@
 # and only retrieve the full articles for the titles we need:
 i = 0
 for title in w.index(count=2):
-    print
-    print title
+    print("")
+    print(title)
     #article = w.search(title)
-    #print article.plaintext()
+    #print(article.plaintext())
     i += 1
     if i >= 3:
         break
34 changes: 17 additions & 17 deletions examples/01-web/10-dbpedia.py
@@ -9,9 +9,9 @@
 # DBPedia data is stored as RDF triples: (subject, predicate, object),
 # e.g., X is-a Actor, Y is-a Country, Z has-birthplace Country, ...
 # If you know about pattern.graph (or graphs in general),
-# this triple format should look familiar.
+# this triple format should look familiar.

-# DBPedia can be queried using SPARQL:
+# DBPedia can be queried using SPARQL:
 # http://dbpedia.org/sparql
 # http://www.w3.org/TR/rdf-sparql-query/
 # A SPARQL query yields rows that match all triples in the WHERE clause.
@@ -32,9 +32,9 @@
 }
 """
 for result in dbp.search(q, start=1, count=10):
-    print result.actor
-    print
+    print(result.actor)
+    print("")

 # You may notice that each Result.actor is of the form:
 # "http://dbpedia.org/resource/[NAME]"
 # This kind of string is a subclass of unicode: DBPediaResource.
@@ -51,8 +51,8 @@
 order by ?actor
 """
 for r in dbp.search(q, start=1, count=10):
-    print "%s (%s)" % (r.actor.name, r.place.name)
-    print
+    print("%s (%s)" % (r.actor.name, r.place.name))
+    print("")

 # You will notice that the results now include duplicates,
 # the same actor with a city name, and with a country name.
@@ -75,8 +75,8 @@
 order by ?date
 """
 for r in dbp.search(q, start=1, count=10):
-    print "%s (%s)" % (r.actor.name, r.date)
-    print
+    print("%s (%s)" % (r.actor.name, r.date))
+    print("")

 # We could also make this query shorter,
 # by combining the two ?actor triples into one:
@@ -97,8 +97,8 @@
 order by ?actor
 """
 for r in dbp.search(q, start=1, count=10):
-    print "%s (%s)" % (r.actor, r.place)
-    print
+    print("%s (%s)" % (r.actor, r.place))
+    print("")

 # This extracts a German label for each matched DBPedia resource.
 # - X is an actor,
@@ -109,13 +109,13 @@

 # For example, say one of the matched resources was:
 # "<http://dbpedia.org/page/Erwin_Schrödinger>"
-# If you open this URL in a browser,
+# If you open this URL in a browser,
 # you will see all the available semantic properties and their values.
 # One of the properties is "rdfs:label": a human-readable & multilingual label.

 # 5) Find triples involving cats.

-# <http://purl.org/dc/terms/subject>
+# <http://purl.org/dc/terms/subject>
 # means: "is in the category of".
 q = """
 prefix dbo: <http://dbpedia.org/ontology/>
@@ -129,8 +129,8 @@
 } order by ?cat
 """
 for r in dbp.search(q, start=1, count=10):
-    print "%s ---%s--> %s" % (r.cat.name, r.relation.ljust(10, "-"), r.concept)
-    print
+    print("%s ---%s--> %s" % (r.cat.name, r.relation.ljust(10, "-"), r.concept))
+    print("")

 # 6) People whose first name includes "Édouard"

@@ -144,5 +144,5 @@
 }
 """
 for result in dbp.search(q, start=1, count=10, cached=False):
-    print "%s (%s)" % (result.person.name, result.name)
-    print
+    print("%s (%s)" % (result.person.name, result.name))
+    print("")