In [1]:
import csv
import json

## Working with CSV data

In [2]:
# Peek at the file as dictionaries keyed by the CSV header row.
with open('store_data.csv') as fp:
    for i, row in enumerate(csv.DictReader(fp)):
        print(row)
        # Stop after index 11, i.e. once twelve rows have been shown.
        if i >= 11:
            break
        

{'date': '2000-01-02', 'Volume': '100.476289298', 'region': 'N', 'Orders': '5', 'store': '0'}
{'date': '2000-01-02', 'Volume': '48.7764146112', 'region': 'N', 'Orders': '2', 'store': '1'}
{'date': '2000-01-02', 'Volume': '59.7178544882', 'region': 'N', 'Orders': '3', 'store': '2'}
{'date': '2000-01-02', 'Volume': '37.3457794617', 'region': 'N', 'Orders': '2', 'store': '3'}
{'date': '2000-01-02', 'Volume': '41.3322711685', 'region': 'E', 'Orders': '2', 'store': '0'}
{'date': '2000-01-02', 'Volume': '62.2623378721', 'region': 'E', 'Orders': '3', 'store': '1'}
{'date': '2000-01-02', 'Volume': '33.1530011664', 'region': 'E', 'Orders': '2', 'store': '2'}
{'date': '2000-01-02', 'Volume': '86.278202688', 'region': 'E', 'Orders': '4', 'store': '3'}
{'date': '2000-01-02', 'Volume': '108.318439119', 'region': 'S', 'Orders': '6', 'store': '0'}
{'date': '2000-01-02', 'Volume': '65.9870455892', 'region': 'S', 'Orders': '3', 'store': '1'}
{'date': '2000-01-02', 'Volume': '52.6408312531', 'region': '

In [3]:
# Same preview with the plain reader: each row, header included, is a list
# of strings.
with open('store_data.csv') as fp:
    for i, row in enumerate(csv.reader(fp)):
        print(row)
        # Indices 0 through 11 are printed before the loop stops.
        if i >= 11:
            break

['date', 'region', 'store', 'Orders', 'Volume']
['2000-01-02', 'N', '0', '5', '100.476289298']
['2000-01-02', 'N', '1', '2', '48.7764146112']
['2000-01-02', 'N', '2', '3', '59.7178544882']
['2000-01-02', 'N', '3', '2', '37.3457794617']
['2000-01-02', 'E', '0', '2', '41.3322711685']
['2000-01-02', 'E', '1', '3', '62.2623378721']
['2000-01-02', 'E', '2', '2', '33.1530011664']
['2000-01-02', 'E', '3', '4', '86.278202688']
['2000-01-02', 'S', '0', '6', '108.318439119']
['2000-01-02', 'S', '1', '3', '65.9870455892']
['2000-01-02', 'S', '2', '3', '52.6408312531']


In [4]:
from collections import namedtuple
from datetime import datetime

In [8]:
import logging
log = logging.getLogger()
StoreOrders = namedtuple('StoreOrders', 'date region store orders volume')

def store_data_reader(fp):
    """Parse an open store-data CSV file into ``StoreOrders`` records.

    The first row is treated as a header and discarded.  Every following row
    is converted by position: column 0 is parsed as a ``%Y-%m-%d`` date,
    columns 1 and 2 are kept as strings (region, store), column 3 becomes an
    ``int`` (orders) and column 4 a ``float`` (volume).

    Args:
        fp: An iterable of CSV text lines, typically an open file object.

    Yields:
        StoreOrders: One record per data row, in file order.

    Raises:
        Exception: Whatever the conversion raised (for example ``ValueError``
            for a malformed number or date, ``IndexError`` for a short row).
            The offending row is logged at WARNING level before re-raising.
    """
    rdr = csv.reader(fp)
    # Discard the header.  The builtin next() with a default works on both
    # Python 2 and 3 and lets an empty file produce an empty result.
    next(rdr, None)
    for row in rdr:
        try:
            record = StoreOrders(
                date=datetime.strptime(row[0], '%Y-%m-%d'),
                region=row[1],
                store=row[2],
                orders=int(row[3]),
                volume=float(row[4]))
        except Exception:
            log.warning('Could not convert row: %s', row)
            raise
        # Yield outside the try block so only conversion failures are logged
        # as such, not exceptions thrown into the generator by its consumer.
        yield record
        
# Preview the typed records produced by the generator.
with open('store_data.csv') as fp:
    for i, row in enumerate(store_data_reader(fp)):
        print(row)
        # Twelve records (indices 0-11) are shown before stopping.
        if i >= 11:
            break
    

StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='0', orders=5, volume=100.476289298)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='1', orders=2, volume=48.7764146112)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='2', orders=3, volume=59.7178544882)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='3', orders=2, volume=37.3457794617)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='0', orders=2, volume=41.3322711685)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='1', orders=3, volume=62.2623378721)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='2', orders=2, volume=33.1530011664)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='3', orders=4, volume=86.278202688)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='S', store='0', orders=6, volume=108.318439119)
StoreOrders(date=dat

In [11]:
def store_data_converter(row):
    """Turn one CSV row (a sequence of strings) into a ``StoreOrders`` record.

    Columns are read by position: date (``%Y-%m-%d``), region, store,
    orders (parsed as ``int``) and volume (parsed as ``float``).
    """
    parsed_date = datetime.strptime(row[0], '%Y-%m-%d')
    region, store = row[1], row[2]
    order_count = int(row[3])
    volume = float(row[4])
    return StoreOrders(parsed_date, region, store, order_count, volume)
        
# Same pipeline as the generator-function version, built here from a per-row
# converter and a lazy generator expression.
with open('store_data.csv') as fp:
    row_iter = csv.reader(fp)
    # Skip the header with the builtin next(); the default keeps an empty
    # file from raising StopIteration here.
    next(row_iter, None)
    data = (store_data_converter(row) for row in row_iter)
    for i, row in enumerate(data):
        print(row)
        if i > 10:
            break


StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='0', orders=5, volume=100.476289298)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='1', orders=2, volume=48.7764146112)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='2', orders=3, volume=59.7178544882)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='N', store='3', orders=2, volume=37.3457794617)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='0', orders=2, volume=41.3322711685)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='1', orders=3, volume=62.2623378721)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='2', orders=2, volume=33.1530011664)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='E', store='3', orders=4, volume=86.278202688)
StoreOrders(date=datetime.datetime(2000, 1, 2, 0, 0), region='S', store='0', orders=6, volume=108.318439119)
StoreOrders(date=dat

In [12]:
from itertools import groupby

def agg_orders_by_date(sd_iter):
    """Yield per-date mean orders and mean volume.

    Records are grouped with ``itertools.groupby``, which only merges
    *consecutive* records sharing the same ``date``; input that is not
    ordered by date yields one result per run of equal dates.

    Args:
        sd_iter: Iterable of records exposing ``date``, ``orders`` and
            ``volume`` attributes.

    Yields:
        dict: ``{'date', 'mean_orders', 'mean_volume'}`` for each group.
    """
    for day, day_records in groupby(sd_iter, key=lambda rec: rec.date):
        # Materialize the group: it is traversed twice and its size is needed.
        records = [rec for rec in day_records]
        n_records = len(records)
        order_total = sum(rec.orders for rec in records)
        volume_total = sum(rec.volume for rec in records)
        yield {
            'date': day,
            'mean_orders': float(order_total) / n_records,
            'mean_volume': 1.0 * volume_total / n_records,
        }

# Show the aggregated statistics for the first twelve dates in the file.
with open('store_data.csv') as fp:
    per_date = agg_orders_by_date(store_data_reader(fp))
    for i, stats in enumerate(per_date):
        print(stats)
        if i >= 11:
            break


{'date': datetime.datetime(2000, 1, 2, 0, 0), 'mean_orders': 3.25, 'mean_volume': 64.02922362699374}
{'date': datetime.datetime(2000, 1, 9, 0, 0), 'mean_orders': 4.75, 'mean_volume': 89.8130480785125}
{'date': datetime.datetime(2000, 1, 16, 0, 0), 'mean_orders': 4.0625, 'mean_volume': 83.17484736618125}
{'date': datetime.datetime(2000, 1, 23, 0, 0), 'mean_orders': 4.0625, 'mean_volume': 82.6870601348375}
{'date': datetime.datetime(2000, 1, 30, 0, 0), 'mean_orders': 3.6875, 'mean_volume': 71.70034689815}
{'date': datetime.datetime(2000, 2, 6, 0, 0), 'mean_orders': 3.25, 'mean_volume': 66.284875965325}
{'date': datetime.datetime(2000, 2, 13, 0, 0), 'mean_orders': 3.125, 'mean_volume': 60.6093959807125}
{'date': datetime.datetime(2000, 2, 20, 0, 0), 'mean_orders': 4.625, 'mean_volume': 90.9661316158625}
{'date': datetime.datetime(2000, 2, 27, 0, 0), 'mean_orders': 4.875, 'mean_volume': 94.37770537670625}
{'date': datetime.datetime(2000, 3, 5, 0, 0), 'mean_orders': 3.875, 'mean_volume': 78

In [17]:
# Write the per-date means as JSON Lines: one JSON document per line.
# NOTE(review): 'wb' suits Python 2, where json.dump() emits byte strings;
# a Python 3 port would need text mode ('w') instead.
with open('store_data.csv') as fp_i, open('store_data.jsonlines', 'wb') as fp_o:
    rdr = store_data_reader(fp_i)
    for stats in agg_orders_by_date(rdr):
        # datetime objects are not JSON serializable, so store ISO-8601 text.
        stats['date'] = stats['date'].isoformat()
        json.dump(stats, fp_o)
        fp_o.write('\n')

In [14]:
# Show the first seven lines of the JSON Lines file, numbered from 1.
with open('store_data.jsonlines') as fp:
    for i, line in enumerate(fp):
        print('%s %s' % (i + 1, line.strip()))
        if i >= 6:
            break

1 {"date": "2000-01-02T00:00:00", "mean_orders": 3.25, "mean_volume": 64.02922362699374}
2 {"date": "2000-01-09T00:00:00", "mean_orders": 4.75, "mean_volume": 89.8130480785125}
3 {"date": "2000-01-16T00:00:00", "mean_orders": 4.0625, "mean_volume": 83.17484736618125}
4 {"date": "2000-01-23T00:00:00", "mean_orders": 4.0625, "mean_volume": 82.6870601348375}
5 {"date": "2000-01-30T00:00:00", "mean_orders": 3.6875, "mean_volume": 71.70034689815}
6 {"date": "2000-02-06T00:00:00", "mean_orders": 3.25, "mean_volume": 66.284875965325}
7 {"date": "2000-02-13T00:00:00", "mean_orders": 3.125, "mean_volume": 60.6093959807125}


In [18]:
from collections import defaultdict

def agg_orders_by_region(sd_iter):
    """Return the mean number of orders per region.

    Unlike a ``groupby``-based aggregation, this keeps running totals keyed
    by region, so the input does not need to be ordered by region.

    Args:
        sd_iter: Iterable of records exposing ``region`` and ``orders``.

    Returns:
        dict: Maps each region seen to its mean ``orders`` as a float.
    """
    order_sum = defaultdict(float)
    row_count = defaultdict(int)
    for record in sd_iter:
        region = record.region
        order_sum[region] += record.orders
        row_count[region] += 1
    means = {}
    for region in order_sum:
        means[region] = order_sum[region] / row_count[region]
    return means

# Aggregate the whole file by region and show the resulting mapping.
with open('store_data.csv') as fp:
    region_means = agg_orders_by_region(store_data_reader(fp))
    print(region_means)


{'S': 3.958219178082192, 'E': 4.1219178082191785, 'W': 4.067123287671233, 'N': 4.001369863013698}


In [9]:
# Count the lines in the file; the header row is included in the total.
with open('store_data.csv') as fp:
    lines = list(fp)
print(len(lines))

5841


## Encoding

In [20]:
!pip install chardet

Collecting chardet
  Downloading chardet-2.3.0.tar.gz (164kB)
[K    100% |████████████████████████████████| 174kB 386kB/s 
[?25hBuilding wheels for collected packages: chardet
  Running setup.py bdist_wheel for chardet ... [?25l- \ done
[?25h  Stored in directory: /Users/rick446/Library/Caches/pip/wheels/28/8c/bf/a69199bd4901d84e13362f95a9ea7bc9a691fed2d655a90bc4
Successfully built chardet
Installing collected packages: chardet
Successfully installed chardet-2.3.0


In [21]:
import chardet

In [23]:
# Read the raw bytes inside a context manager so the file handle is closed
# promptly; binary mode hands chardet the undecoded bytes it analyses.
with open('store_data.csv', 'rb') as fp:
    raw_bytes = fp.read()
chardet.detect(raw_bytes)

{'confidence': 1.0, 'encoding': 'ascii'}

In [27]:
# Non-ASCII sample (Chinese text) as a unicode literal, for trying encoding
# detection on something other than plain ASCII.
text = u'''信息和新闻资讯的领先提供商'''

In [31]:
# Encode the unicode text to UTF-8 bytes first, then run detection on those bytes.
chardet.detect(text.encode('utf8'))

{'confidence': 0.99, 'encoding': 'utf-8'}

In [32]:
import re

In [33]:
# Sample expression reused by the tokenizing cells.
s = '1 + 1'
# One or more decimal digits.
re_integer = re.compile(r'\d+')
# The hyphen comes first in the class so it is a literal '-'.  Written as
# '+-/' it forms a range that also matches ',' and '.'.
re_operator = re.compile(r'[-+/*]')

In [50]:
# Tokenizer pattern: each match fills exactly one of the named groups.
re_either = re.compile(r'''
(?P<int>\d+)      # integers
|(?P<op>[-+/*])   # operator; leading hyphen is literal, not a range
|(?P<ws>[ ]+)     # runs of spaces
''', re.VERBOSE)
# match() only tries the pattern at the start of s, so this reports just the
# first token; the groups that did not take part are None.
re_either.match(s).groupdict()

{'int': '1', 'op': None, 'ws': None}

In [51]:
# Walk every token in s, showing which named group matched and the matched text.
for token in re_either.finditer(s):
    groups, matched_text = token.groupdict(), token.group(0)
    print('%s %s' % (groups, matched_text))

{'int': '1', 'ws': None, 'op': None} 1
{'int': None, 'ws': ' ', 'op': None}  
{'int': None, 'ws': None, 'op': '+'} +
{'int': None, 'ws': ' ', 'op': None}  
{'int': '1', 'ws': None, 'op': None} 1


In [57]:
import tokenize
import StringIO
# Wrap the expression string in an in-memory file object so that its
# readline method can be handed to the tokenizer.
fp = StringIO.StringIO(s)

In [58]:
# Called with only a readline callable (no token handler), so each token is
# printed as it is read -- see the listing below.
tokenize.tokenize(fp.readline)

1,0-1,1:	NUMBER	'1'
1,2-1,3:	OP	'+'
1,4-1,5:	NUMBER	'1'
2,0-2,0:	ENDMARKER	''


In [59]:
import ast

In [69]:
# literal_eval parses a string holding a Python literal (here a list of
# ints) into the corresponding object without evaluating arbitrary code.
ast.literal_eval('[1,2,3,4,5]')

[1, 2, 3, 4, 5]