In [31]:
import json
import unicodecsv as csv
import unicodedata
import codecs

In [32]:
class TweetReader:
    """
    Utility class providing conversation of our dumbly formatted
    files containing scraped tweets into python dictionaries
    sequententially, which can then be easily applied to clean and output 
    to a file.
    """
    def __init__(self, infile, outfile):
        self.infile_str = infile
        self.outfile_str = outfile
        self.infile = codecs.open(infile, 'r', encoding='utf-8')
        self.line_count = 0
        self.base = ""
        
        if not self.outfile_str is None:
            self.outfile = open(outfile, 'w')
        else:
            self.outfile = None
    
    def init_outfile(self, outfile_str):
        if not self.outfile is None and self.outfile_str is None:
            raise Exception('Use update method to update Reader outfile.')
        
        self.outfile_str = outfile_str
        self.outfile = open(self.outfile_str, 'w')
        return True
    
    def update_outfile(self, outfile_str):
        if self.outfile is None or self.outfile_str is None:
            raise Exception('Use init method to initialize Reader outfile.')
        
        self.outfile_str = outfile_str
        self.outfile = open(self.outfile_str,'w')
              
    def __iter__(self):
        return self
    """
    Attempt to read an object from the input file. Malformed input file will cause an exception.
    """
    def next(self):
        def to_json(st):
            return json.loads(st)
        
        while True:
            try:
                line = next(self.infile)
                self.line_count += 1
                if self.line_count % 500000 == 0:
                    print "Hit 500000, total: {0}".format(self.line_count)
            except StopIteration:
                print "hit end of input file"
                try:
                    obj = to_json(self.base)
                    self.base = ""
                except:
                    raise StopIteration()
                
            if line.strip() == '}{':
                self.base += '}'
                try:
                    obj = json.loads(self.base)
                except Exception as e:
                    print line
                    print self.base
                    raise e
                self.base = '{'
                return obj
            else:
                self.base += line
    """
    Attempt to read an input JSON object, then apply the argument clean 
    """
    def read_and_clean(self, clean_function):
        obj = self.next()
        cleaned = clean_function(obj)
        return cleaned
    
    def write(self, obj):
        if self.outfile is None:
            raise Exception("Method requires outfile.")
            
        writer = csv.DictWriter(self.outfile, fieldnames=obj.keys())
        writer.writerow(obj)
        return True
    
    def clean_and_write(self, obj, clean):
        self.write(clean(obj))
        return True
    
    def clean_and_write_all(self,clean,count=None):
        c = 0
        error_count = 0
        for obj in self:
            if obj is None:
                continue
            self.clean_and_write(obj,clean)
            if not count is None:
                if c > count:
                    break
                else:
                    c += 1
        print "Converted {0} tweets from source file {1} to \
                output file {2}".format(c, self.infile_str, self.outfile_str)
        print "{0} write attempts failed".format(error_count)

In [33]:
tr = TweetReader('firstround/sunday27.json', 'converted/sunday3-27.csv')

In [34]:
def clean(tw):
    return {'text':tw['text'], 'time':tw['timestamp_ms']}

In [35]:
tr.clean_and_write_all(clean)

Hit 500000, total: 500000
Hit 500000, total: 1000000
Hit 500000, total: 1500000
Hit 500000, total: 2000000
Hit 500000, total: 2500000
Hit 500000, total: 3000000
Hit 500000, total: 3500000
Hit 500000, total: 4000000
Hit 500000, total: 4500000
Hit 500000, total: 5000000
Hit 500000, total: 5500000
Hit 500000, total: 6000000
Hit 500000, total: 6500000
Hit 500000, total: 7000000
Hit 500000, total: 7500000
Hit 500000, total: 8000000
Hit 500000, total: 8500000
Hit 500000, total: 9000000
Hit 500000, total: 9500000
Hit 500000, total: 10000000
Hit 500000, total: 10500000
Hit 500000, total: 11000000
Hit 500000, total: 11500000
Hit 500000, total: 12000000
Hit 500000, total: 12500000
Hit 500000, total: 13000000
Hit 500000, total: 13500000
Hit 500000, total: 14000000
hit end of input file
hit end of input file
Converted 0 tweets from source file firstround/sunday27.json to                 output file converted/sunday3-27.csv
0 write attempts failed


In [36]:
tr = TweetReader('firstround/saturday3-26.json', 'converted/saturday3-26.csv')

In [37]:
tr.clean_and_write_all(clean)

Hit 500000, total: 500000
Hit 500000, total: 1000000
Hit 500000, total: 1500000
Hit 500000, total: 2000000
Hit 500000, total: 2500000
hit end of input file
hit end of input file
Converted 0 tweets from source file firstround/saturday3-26.json to                 output file converted/saturday3-26.csv
0 write attempts failed
