In [171]:
import json
import unicodecsv as csv
import unicodedata
import codecs

In [175]:
class TweetReader:
    """
    Iterator over a file of scraped tweets stored as JSON objects written
    back to back, where a line containing ``}{`` separates one object from
    the next.

    Each call to ``next`` yields one object as a Python dictionary, which
    can then be cleaned and, optionally, appended to a CSV output file.

    The output file is opened in binary mode because the ``csv`` name used
    by ``clean_and_write`` is expected to be ``unicodecsv``, which writes
    encoded bytes.
    """
    def __init__(self, infile, outfile):
        """
        Open `infile` for reading (UTF-8) and, when `outfile` is not None,
        open `outfile` for writing (truncating any existing content).
        """
        self.infile_str = infile
        self.outfile_str = outfile
        self.infile = codecs.open(infile, 'r', encoding='utf-8')
        # Text accumulated so far for the object currently being read.
        self.base = ""

        if self.outfile_str is not None:
            try:
                self.outfile = open(outfile, 'wb')
            except Exception:
                # Do not leak the already opened input handle.
                self.infile.close()
                raise
        else:
            self.outfile = None

    def init_outfile(self, outfile_str):
        """
        Attach an output file to a reader that was created without one.

        Returns True on success. Raises Exception when an output file is
        already attached; use `update_outfile` in that case.
        """
        if self.outfile is not None:
            raise Exception('Use update method to update Reader outfile.')

        self.outfile_str = outfile_str
        self.outfile = open(self.outfile_str, 'wb')
        return True

    def update_outfile(self, outfile_str):
        """
        Replace the attached output file with `outfile_str`.

        The previous handle is closed first so buffered rows are flushed.
        Raises Exception when no output file is attached yet; use
        `init_outfile` in that case.
        """
        if self.outfile is None or self.outfile_str is None:
            raise Exception('Use init method to initialize Reader outfile.')

        self.outfile.close()
        self.outfile_str = outfile_str
        self.outfile = open(self.outfile_str, 'wb')

    def close(self):
        """Close the input file and, if one is attached, the output file."""
        self.infile.close()
        if self.outfile is not None:
            self.outfile.close()

    def __iter__(self):
        return self

    def next(self):
        """
        Read and return the next JSON object from the input file.

        Raises StopIteration once the input is exhausted, and ValueError
        (from `json.loads`) when the accumulated text is not valid JSON.
        """
        while True:
            line = next(self.infile, None)

            if line is None:
                # End of input: whatever is buffered is the last object.
                pending = self.base
                self.base = ""
                if pending.strip() in ("", "{"):
                    # Nothing buffered, or only the brace carried over from
                    # a trailing separator: there is no further object.
                    raise StopIteration
                return json.loads(pending)

            if '}{' in line:
                # Separator line: close the current object and carry the
                # opening brace of the next one. State is reset before
                # parsing so a malformed object does not poison later reads.
                record = self.base + '}'
                self.base = '{'
                return json.loads(record)

            self.base += line

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def read_and_clean(self, clean_function):
        """
        Read the next JSON object and return `clean_function` applied to it.
        """
        obj = self.next()

        cleaned = clean_function(obj)
        return cleaned

    def clean_and_write(self, clean_function):
        """
        Read the next object, clean it and write it as one CSV row.

        `clean_function` must return a dictionary; its keys are used as the
        column names for that row. Raises Exception when no output file is
        attached.
        """
        if self.outfile is None:
            raise Exception("Method requires outfile.")

        obj = self.read_and_clean(clean_function)
        writer = csv.DictWriter(self.outfile, fieldnames=list(obj.keys()))
        writer.writerow(obj)

In [176]:
# Reader over the scraped dump, writing cleaned rows to example.csv.
tr = TweetReader(infile='firstround/sunday27.json', outfile='example.csv')

In [179]:
def clean(tw):
    """Return a dict holding only the tweet's text and its 'timestamp_ms' value (as 'time')."""
    cleaned = {}
    cleaned['text'] = tw['text']
    cleaned['time'] = tw['timestamp_ms']
    return cleaned

In [180]:
# Clean and write the first batch of tweets, one CSV row per call.
batch_size = 100
for _ in range(batch_size):
    tr.clean_and_write(clean)