Permalink
Browse files

testing tokenizer

  • Loading branch information...
1 parent e1309d4 commit c46a0d391e74882f097e6769119b93cedd240d1b Pete Skomoroch committed Mar 29, 2010
No changes.
@@ -58,15 +58,17 @@ std_location_tweets = FOREACH std_location_tweets GENERATE
$4 as fips,
$5 as geonameid,
$2 as tweet_created_at,
-$0 as tweet_text;
+LOWER($0) as tweet_text;
-DEFINE tweet_tokenizer `tweet_tokenizer.py`
- SHIP ('tweet_tokenizer.py', 'nltkandyaml.mod', 's3://where20demo/wikiphrases.pkl');
-tweet_ngrams = STREAM std_location_tweets THROUGH tweet_tokenizer
- AS (ngram:chararray, fipscode:chararray, geonameid:int, date:int, hour:int);
+store std_location_tweets INTO 'std_location_tweets';
+
+--DEFINE tweet_tokenizer `tweet_tokenizer.py`
+-- SHIP ('tweet_tokenizer.py', 'nltkandyaml.mod', 's3://where20demo/wikiphrases.pkl');
+--tweet_ngrams = STREAM std_location_tweets THROUGH tweet_tokenizer
+-- AS (ngram:chararray, fipscode:chararray, geonameid:int, date:chararray, hour:int, daily_trend:float);
-rmf tweet_ngrams
-store tweet_ngrams into 'tweet_ngrams';
+--rmf tweet_ngrams
+--store tweet_ngrams into 'tweet_ngrams';
@@ -11,6 +11,9 @@
import os
import zipimport
import cPickle as pickle
+import rfc822
+import time
+from datetime import date
# load NLTK from distributed cache
importer = zipimport.zipimporter('nltkandyaml.mod')
@@ -22,10 +25,16 @@
wikiphrases = pickle.load(pkl_file)
def gethour(timestamp):
    '''Return the UTC hour of day (0-23) for a tweet timestamp.

    timestamp is a string of the form "Mon Mar 22 02:23:53 +0000 2010".
    The UTC offset embedded in the string is honoured, so the result does
    not depend on the time zone or DST rules of the machine running this.

    Raises TypeError if the timestamp cannot be parsed.
    '''
    # email.utils ships the same RFC 2822 date parser as rfc822; it is
    # imported locally so this function stays self-contained.
    from email.utils import mktime_tz, parsedate_tz

    # parsedate_tz keeps the offset and mktime_tz applies it, giving a true
    # UTC epoch (time.mktime would treat the fields as local time).
    epoch = mktime_tz(parsedate_tz(timestamp))
    # A datetime.date has no hour attribute, so read it from a struct_time.
    return time.gmtime(epoch).tm_hour
def getdate(timestamp):
    '''Return the UTC calendar date of a tweet timestamp as "YYYY-MM-DD".

    timestamp is a string of the form "Mon Mar 22 02:23:53 +0000 2010".
    The UTC offset embedded in the string is honoured, so the result does
    not depend on the time zone or DST rules of the machine running this.

    Raises TypeError if the timestamp cannot be parsed.
    '''
    # email.utils ships the same RFC 2822 date parser as rfc822; it is
    # imported locally so this function stays self-contained.
    from email.utils import mktime_tz, parsedate_tz

    # parsedate_tz keeps the offset and mktime_tz applies it, giving a true
    # UTC epoch (time.mktime would treat the fields as local time and could
    # shift the date across midnight).
    epoch = mktime_tz(parsedate_tz(timestamp))
    return time.strftime('%Y-%m-%d', time.gmtime(epoch))
def tokenize(text):
tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
@@ -40,7 +49,7 @@ def emit_phrases(ngrams, fipscode, geonameid, date, hour):
'''Validate ngrams against wikipedia phrases and emit to stdout'''
for ngram in ngrams:
if wikiphrases.has_key(ngram):
- print '\t'.join([ngram, fipscode, geonameid, date, hour])
+ print '\t'.join([ngram, fipscode, geonameid, date, hour, str(wikiphrases[ngram])])
for line in sys.stdin:
try:
Oops, something went wrong.

0 comments on commit c46a0d3

Please sign in to comment.