Skip to content
Browse files

Modify Twitter source to filter for relevant tweets

  • Loading branch information...
1 parent aa5af50 commit dc8cac684c1644c1509f206de16f72826e7b6449 Jon Natkins committed Sep 6, 2012
Showing with 12 additions and 2 deletions.
  1. +12 −2 flume-sources/src/main/java/com/cloudera/flume/source/TwitterSource.java
View
14 flume-sources/src/main/java/com/cloudera/flume/source/TwitterSource.java
@@ -33,6 +33,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import twitter4j.FilterQuery;
import twitter4j.Status;
import twitter4j.StatusDeletionNotice;
import twitter4j.StatusListener;
@@ -105,6 +106,7 @@ public void start() {
public void onStatus(Status status) {
// The EventBuilder is used to build an event using the headers and
// the raw JSON of a tweet
+ logger.debug(status.getUser().getScreenName() + ": " + status.getText());
headers.put("timestamp", String.valueOf(System.currentTimeMillis()));
eventList.add(EventBuilder.withBody(
DataObjectFactory.getRawJSON(status).getBytes(), headers));
@@ -134,8 +136,16 @@ public void onException(Exception ex) {}
AccessToken token = new AccessToken(accessToken, accessTokenSecret);
twitterStream.setOAuthAccessToken(token);
- // Start sampling Twitter!
- twitterStream.sample();
+ // Set up a filter to pull out industry-relevant tweets
+ FilterQuery query = new FilterQuery()
+ .track(new String[] { "hadoop", "big data", "analytics",
+ "bigdata", "cloudera", "data science",
+ "data scientiest", "business intelligence",
+ "mapreduce", "data warehouse", "data warehousing",
+ "mahout", "hbase", "nosql", "newsql",
+ "businessintelligence", "cloudcomputing" })
+ .setIncludeEntities(true);
+ twitterStream.filter(query);
super.start();
}

0 comments on commit dc8cac6

Please sign in to comment.
Something went wrong with that request. Please try again.