In [1]:
import csv
import mapreduce as mr
import dateutil
from dateutil import parser
import operator

#1. Weekday

In [2]:
#Calculating sum of complaints per zipcode per weekday

def mapper1(row):
    weekday = (row['Incident Zip'], dateutil.parser.parse(row['Created Date']).weekday())
    yield (weekday, 1)

def reducer1(k2v2):
    weekday, count = k2v2
    return (weekday, sum(count)) 

with open('noise_test.csv', 'r') as fi:
    reader = csv.DictReader(fi)
    output1 = list(mr.run(reader, mapper1, reducer1))

output1[6:11]

[(('10001', 0), 6),
 (('10001', 1), 12),
 (('10001', 2), 8),
 (('10001', 4), 15),
 (('10001', 5), 15)]

In [3]:
#Getting unique zipcodes

zipcodes = []

for i in output1:
    zipcodes.append(i[0][0])
    
zipcodes = list(set(zipcodes))

zipcodes[:10]

['',
 '10065',
 '10453',
 '10452',
 '10451',
 '10457',
 '10456',
 '10455',
 '10454',
 '10459']

In [4]:
#Creating dictionaries for each zipcode

weekday = {}

for i in zipcodes:
    weekday[i] = []
    
for i in output1:
    weekday[i[0][0]].append(((i[0][1]), i[1]))
    
weekday['11216']

[(0, 7), (1, 9), (2, 10), (4, 26), (5, 30), (6, 20)]

#2. Hour

In [5]:
#Calculating sum of complaints per zipcode per hour

def mapper1(row):
    hour = (row['Incident Zip'], dateutil.parser.parse(row['Created Date']).hour)
    yield (hour, 1)

def reducer1(k2v2):
    hour, count = k2v2
    return (hour, sum(count)) 

with open('noise_test.csv', 'r') as fi:
    reader = csv.DictReader(fi)
    output2 = list(mr.run(reader, mapper1, reducer1))

output2[15:20]

[(('10001', 0), 8),
 (('10001', 1), 7),
 (('10001', 2), 5),
 (('10001', 4), 2),
 (('10001', 6), 1)]

In [6]:
#Creating dictionaries for each zipcode

hour = {}

for i in zipcodes:
    hour[i] = []
    
for i in output2:
    hour[i[0][0]].append(((i[0][1]), i[1]))
    
hour['11216']

[(0, 10),
 (1, 23),
 (2, 4),
 (3, 8),
 (4, 3),
 (5, 1),
 (7, 4),
 (9, 3),
 (10, 1),
 (11, 4),
 (12, 1),
 (14, 1),
 (16, 3),
 (17, 3),
 (18, 2),
 (19, 6),
 (20, 3),
 (21, 6),
 (22, 3),
 (23, 13)]

#3. Descriptor

In [7]:
#Calculating sum of complaints per zipcode per descriptor

def mapper3(row):
    descriptor = (row['Incident Zip'], row['Descriptor'])
    yield (descriptor, 1)

def reducer3(k2v2):
    descriptor, count = k2v2
    return (descriptor, sum(count)) 

with open('noise_test.csv', 'r') as fi:
    reader = csv.DictReader(fi)
    output3 = list(mr.run(reader, mapper3, reducer3))

output3[7:12]

[(('10001', 'Banging/Pounding'), 9),
 (('10001', 'Car/Truck Horn'), 1),
 (('10001', 'Car/Truck Music'), 1),
 (('10001', 'Engine Idling'), 2),
 (('10001', 'Loud Music/Party'), 11)]

In [8]:
# Creating dictionaries for each zipcode

descriptor = {}

for i in zipcodes:
    descriptor[i] = []
    
for i in output3:
    descriptor[i[0][0]].append(((i[0][1]), i[1]))

#Extracting top 5 complaints per zipcode 

for i in zipcodes:
    descriptor[i].sort(key=operator.itemgetter(1), reverse=True)
    descriptor[i] = descriptor[i][:5]
    
descriptor['10005']

[('Noise: Construction Before/After Hours (NM1)', 6), ('Loud Music/Party', 1)]

#4. Total complaints (+ bottom 10/top 10)

In [9]:
#Calculating sum of complaints per zipcode per hour

def mapper4(row):
    zipcode = row['Incident Zip']
    yield (zipcode, 1)

def reducer4(k2v2):
    zipcode, count = k2v2
    return (zipcode, sum(count)) 

with open('noise_test.csv', 'r') as fi:
    reader = csv.DictReader(fi)
    output4 = list(mr.run(reader, mapper4, reducer4))

output4[:5]

[('', 25), ('10001', 65), ('10002', 107), ('10003', 84), ('10004', 5)]

In [10]:
#Ordering zipcodes by number of noise complaints

output4.sort(key=operator.itemgetter(1))

#Zipcodes with the least complaints

output4[:10]

[('10282', 1),
 ('11363', 1),
 ('11426', 1),
 ('10044', 2),
 ('10464', 2),
 ('11109', 2),
 ('11357', 2),
 ('11364', 2),
 ('11413', 2),
 ('11239', 3)]

In [11]:
#Zipcodes with the most complaints

output4[len(output4)-10:len(output4)]

[('11211', 103),
 ('10002', 107),
 ('10029', 107),
 ('10467', 110),
 ('10009', 120),
 ('10034', 128),
 ('11226', 140),
 ('10031', 141),
 ('11238', 143),
 ('10040', 148)]

#5. Normalized complaints (+ bottom 10/top 10)

In [12]:
def mapper5(row):
    yield (row['Zipcode'], int(row['Total']))

def reducer5(k2v2):
    population, count = k2v2
    return (population, sum(count))
    
with open('Census_2010.csv', 'r') as fi:
    reader = csv.DictReader(fi)
    output5 = list(mr.run(reader, mapper5, reducer5))

output5[:10]

[('10001', 21102),
 ('10002', 81410),
 ('10003', 56024),
 ('10004', 3089),
 ('10005', 7135),
 ('10006', 3011),
 ('10007', 6988),
 ('10009', 61347),
 ('10010', 31834),
 ('10011', 50984)]

In [13]:
#Creating dictionaries for total complaints and population

complaints = {}
population = {}

for i in output4:
    complaints[i[0]] = i[1] 
    
for i in output5:
    population[i[0]] = i[1] 
    
print complaints['11216']
print population['11216']

102
54316


In [14]:
#Normalizing number of complaints per population

normalized = {}

for i in zipcodes:
    if i in complaints.keys() and i in population.keys() and population[i]>0:
        normalized[i] = complaints[i]/float(population[i])
    else:
        pass
    
normalized['10002']

0.0013143348482987348

In [15]:
#Creating list of tuples

normalized_list = []

for i in normalized:
    normalized_list.append((i, normalized[i]))
    
normalized_list.sort(key=operator.itemgetter(1))

#Bottom 10 zipcodes (normalized)

normalized_list[:10]

[('11357', 5.108556832694764e-05),
 ('11413', 5.139802631578947e-05),
 ('11426', 5.6850483229107444e-05),
 ('11364', 5.787874403125452e-05),
 ('10312', 8.431134493457439e-05),
 ('11422', 9.860312243221036e-05),
 ('11412', 0.00011467232383464251),
 ('10475', 0.0001221568004690821),
 ('10309', 0.00012300501245425752),
 ('11363', 0.00014310246136233542)]

In [16]:
#Top 10 zipcodes (normalized)

normalized_list[len(normalized_list)-10:len(normalized_list)]

[('10065', 0.002200185931205454),
 ('10030', 0.00233341975628727),
 ('10031', 0.0024983167369502816),
 ('10014', 0.002878688319409243),
 ('11238', 0.0029028460070642686),
 ('10001', 0.0030802767510188607),
 ('10039', 0.0031393973987850123),
 ('10034', 0.0032898118638840343),
 ('10040', 0.003531798114783439),
 ('10006', 0.009631351710395218)]

#6. Construction time

In [17]:
#Calculating sum of complaints per zipcode per descriptor/hour

def mapper6(row):
    descriptor = (row['Incident Zip'], row['Descriptor'], dateutil.parser.parse(row['Created Date']).hour)
    yield (descriptor, 1)

def reducer6(k2v2):
    descriptor, count = k2v2
    return (descriptor, sum(count)) 

with open('noise_test.csv', 'r') as fi:
    reader = csv.DictReader(fi)
    output6 = list(mr.run(reader, mapper6, reducer6))

output6[7:12]

[(('', 'Loud Music/Party', 7), 1),
 (('', 'Loud Music/Party', 12), 1),
 (('', 'Loud Music/Party', 18), 1),
 (('', 'Loud Music/Party', 21), 1),
 (('', 'Loud Music/Party', 22), 2)]

In [18]:
#Filtering output for construction noise and for illegal hours
output6_1 = filter(lambda x: ('Construction' in x[0][1]) and (7 < x[0][2] <18), output6)
output6_1[:20]

[(('', 'Noise: Construction Before/After Hours (NM1)', 16), 1),
 (('10001', 'Noise: Construction Before/After Hours (NM1)', 8), 2),
 (('10001', 'Noise: Construction Before/After Hours (NM1)', 9), 1),
 (('10001', 'Noise: Construction Before/After Hours (NM1)', 10), 2),
 (('10001', 'Noise: Construction Before/After Hours (NM1)', 13), 1),
 (('10001', 'Noise: Construction Equipment (NC1)', 11), 4),
 (('10001', 'Noise: Construction Equipment (NC1)', 14), 1),
 (('10002', 'Noise: Construction Before/After Hours (NM1)', 9), 1),
 (('10003', 'Noise: Construction Before/After Hours (NM1)', 8), 2),
 (('10003', 'Noise: Construction Before/After Hours (NM1)', 13), 1),
 (('10003', 'Noise: Construction Before/After Hours (NM1)', 17), 1),
 (('10003', 'Noise: Construction Equipment (NC1)', 9), 1),
 (('10003', 'Noise: Construction Equipment (NC1)', 15), 1),
 (('10004', 'Noise: Construction Before/After Hours (NM1)', 8), 1),
 (('10005', 'Noise: Construction Before/After Hours (NM1)', 10), 2),
 (('10006', 

In [19]:
#Getting just zipcode and total of illegal complaints
output6_2 = map(lambda x: (x[0][0], x[1]), output6_1)
output6_2

[('', 1),
 ('10001', 2),
 ('10001', 1),
 ('10001', 2),
 ('10001', 1),
 ('10001', 4),
 ('10001', 1),
 ('10002', 1),
 ('10003', 2),
 ('10003', 1),
 ('10003', 1),
 ('10003', 1),
 ('10003', 1),
 ('10004', 1),
 ('10005', 2),
 ('10006', 2),
 ('10006', 1),
 ('10006', 2),
 ('10007', 1),
 ('10007', 1),
 ('10007', 1),
 ('10009', 1),
 ('10009', 1),
 ('10009', 1),
 ('10009', 1),
 ('10009', 1),
 ('10010', 1),
 ('10010', 5),
 ('10010', 1),
 ('10010', 2),
 ('10010', 1),
 ('10010', 1),
 ('10010', 2),
 ('10010', 1),
 ('10010', 2),
 ('10010', 1),
 ('10011', 1),
 ('10011', 1),
 ('10011', 1),
 ('10012', 2),
 ('10012', 1),
 ('10013', 1),
 ('10013', 1),
 ('10013', 1),
 ('10013', 1),
 ('10014', 3),
 ('10014', 2),
 ('10014', 1),
 ('10014', 1),
 ('10014', 1),
 ('10014', 1),
 ('10014', 1),
 ('10016', 2),
 ('10016', 1),
 ('10016', 3),
 ('10016', 1),
 ('10016', 1),
 ('10016', 1),
 ('10016', 1),
 ('10016', 1),
 ('10016', 1),
 ('10016', 1),
 ('10017', 1),
 ('10017', 1),
 ('10017', 3),
 ('10017', 1),
 ('10019', 1),


In [22]:
construction = {}
norm_construction = {}

for i in zipcodes:
    construction[i] = []
    
for i in output6_2:
    construction[i[0]].append(i[1])
    
for i in construction.keys():
    norm_construction[i] = sum(construction[i])/float(complaints[i])*1000
    
norm_construction['11216']

19.607843137254903

In [27]:
mean(norm_construction.values())

NameError: name 'mean' is not defined