## LA Parking Citation Exploratory Data Analysis

#### Fisher Ankney
#### October 27th, 2018

<br> 

Data available on Kaggle or at https://data.lacity.org/

The goal of this notebook is to trim the dataset from 7.2 million rows to a more manageable size, specifically one that can be stored on GitHub (files < 100 MB).

### Load Libraries

In [43]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Verify File Size

In [2]:
# Size of the raw citation CSV in GB (decimal: 1 GB = 10**9 bytes).
# `os` is imported with the other libraries in the imports cell at the top.
os.path.getsize('parking_citation.csv') / 10**9

1.021944555

### Read in the Dataset

In [3]:
# Load the raw citation CSV using the same relative path as the file-size
# check, instead of a user-specific home-directory path.
# NOTE(review): assumes the notebook is run from the directory holding the CSV.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-dtype warning on load
# (at the cost of more memory while parsing).
la_ticket_full = pd.read_csv('parking_citation.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Peek at the last five rows of the full dataset.
la_ticket_full.tail(n=5)

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,Color,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude
7201001,4334262026,2018/05/29 12:00:00 AM,2321.0,,,CA,200608.0,,HOND,PA,GY,9400 HAAS AVE,531,55.0,80.56E4+,RED ZONE,93.0,6465617.0,1804786.0
7201002,4334262030,2018/05/29 12:00:00 AM,2322.0,,,CA,200608.0,,HOND,PA,GY,9400 HAAS AVE,531,55.0,5204A-,DISPLAY OF TABS,25.0,6465617.0,1804786.0
7201003,4334262041,2018/05/29 12:00:00 AM,2338.0,,,CA,201811.0,,BMW,PA,GY,7500 BUDLONG AVE,531,55.0,80.56E4+,RED ZONE,93.0,6471860.0,1812272.0
7201004,4334262052,2018/05/29 12:00:00 AM,2341.0,,,CA,201809.0,,CHEV,PA,GY,944 76TH ST W,531,55.0,22514,FIRE HYDRANT,68.0,6473632.0,1811856.0
7201005,4334262063,2018/05/29 12:00:00 AM,2348.0,,,CA,201810.0,,HOND,PA,GY,618 76TH ST W,531,55.0,22500E,BLOCKING DRIVEWAY,68.0,6475427.0,1811860.0


### Index Dataset for 2017 Results Only

In [5]:
# Pull the 'Issue Date' column out as its own Series and preview its last rows.
la_ticket_issue = la_ticket_full.loc[:, 'Issue Date']
la_ticket_issue.tail(n=5)

7201001    2018/05/29 12:00:00 AM
7201002    2018/05/29 12:00:00 AM
7201003    2018/05/29 12:00:00 AM
7201004    2018/05/29 12:00:00 AM
7201005    2018/05/29 12:00:00 AM
Name: Issue Date, dtype: object

In [6]:
# Check what kind of object the column selection produced.
type(la_ticket_issue)

pandas.core.series.Series

In [14]:
# Count the citations issued in 2017. Issue dates are 'YYYY/MM/DD ...' strings,
# so match the year as a prefix rather than anywhere in the string. na=False
# counts a missing date as "not 2017", and the vectorised .sum() replaces the
# element-by-element Python sum().
la_ticket_issue.str.startswith('2017', na=False).sum()

2254329

In [15]:
# Boolean mask of the rows issued in 2017. The year is matched as a prefix of
# the 'YYYY/MM/DD ...' string, and na=False keeps the mask strictly boolean so
# it can be used for row selection even if some issue dates are missing.
la_ticket_2017_index = la_ticket_issue.str.startswith('2017', na=False)

In [33]:
# Keep only the 2017 rows. The explicit .copy() makes this an independent
# DataFrame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
la_ticket_2017 = la_ticket_full.loc[la_ticket_2017_index].copy()

In [17]:
# Number of rows kept for 2017.
la_ticket_2017.shape[0]

2254329

#### Confirm Dataset Retains Essential Variables and spans 2017

In [18]:
# Last five rows of the 2017 subset, to confirm all columns are still present.
la_ticket_2017.tail(n=5)

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,Color,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude
7094534,1116259045,2017/04/17 12:00:00 AM,2008.0,,,NV,201812.0,,CHRY,PA,GY,2800 EAST OBSERVATOR,,4.0,8056E4,RED ZONE,93.0,99999.0,99999.0
7133441,1116014793,2017/11/05 12:00:00 AM,150.0,,,CA,201709.0,,ACUR,PA,GY,BURNS E/O VIRGIL,,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0
7171053,1114199203,2017/10/15 12:00:00 AM,1637.0,,,CA,201801.0,,HOND,PA,BK,2167 E 102ND ST,01829,1.0,80714,PRIVATE PROPERTY,68.0,6490781.0,1802058.0
7171388,1121068082,2017/04/16 12:00:00 AM,735.0,,,CA,201706.0,,CHEV,PA,BL,6701 DE SOTO AVE,MQ,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0
7171390,1122798622,2017/03/05 12:00:00 AM,1635.0,,,CA,201805.0,,TOYO,PA,GY,101 WORLD WAY,5M38,2.0,225078A,HANDICAP/NO DP ID,363.0,6439738.0,1802687.0


In [19]:
# Earliest issue date in the subset (the date strings sort chronologically).
la_ticket_2017['Issue Date'].min()

'2017/01/01 12:00:00 AM'

In [20]:
# Latest issue date in the subset (the date strings sort chronologically).
la_ticket_2017['Issue Date'].max()

'2017/12/31 12:00:00 AM'

### Split Issue Date into Month / Day

In [40]:
# Break each 'YYYY/MM/DD hh:mm:ss AM' string on its slashes:
# column 0 = year, column 1 = month, column 2 = day plus the trailing time.
issue_date_2017 = la_ticket_2017['Issue Date']
date_split_1 = issue_date_2017.str.split(pat='/', n=3, expand=True)
date_split_1.head(n=5)

Unnamed: 0,0,1,2
2735704,2017,12,18 12:00:00 AM
2771883,2017,5,11 12:00:00 AM
2777524,2017,3,01 12:00:00 AM
2777558,2017,3,28 12:00:00 AM
2777651,2017,5,15 12:00:00 AM


In [41]:
# Separate the day from the trailing time in column 2 of the first split:
# column 0 = day, column 1 = time.
# (The original cell referred to `date_split` and `second_split`, names that
# are never defined in this notebook, so it failed on a fresh kernel.)
date_split_2 = date_split_1[2].str.split(' ', n=1, expand=True)
date_split_2.head()

Unnamed: 0,0,1
2735704,18,12:00:00 AM
2771883,11,12:00:00 AM
2777524,1,12:00:00 AM
2777558,28,12:00:00 AM
2777651,15,12:00:00 AM


In [46]:
# Attach the month and day strings as new columns on the 2017 frame.
# DataFrame.assign returns a new frame (values are aligned on the row index),
# so no SettingWithCopyWarning is raised and the notebook-wide
# warnings.filterwarnings('ignore') is no longer needed to hide it.
la_ticket_2017 = la_ticket_2017.assign(
    Month=date_split_1[1],
    Day=date_split_2[0],
)
la_ticket_2017.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,...,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude,Month,Day
2735704,1115377911,2017/12/18 12:00:00 AM,2205.0,,,CA,201712.0,,HOND,PA,...,1323 S FLOWER ST,00192,1.0,4000A1,NO EVIDENCE OF REG,50.0,6480729.0,1836883.0,12,18
2771883,1114752936,2017/05/11 12:00:00 AM,800.0,,,CA,201712.0,,FRHT,TR,...,INDIANA/NOAKES,CM99,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0,5,11
2777524,4302749861,2017/03/01 12:00:00 AM,104.0,,,OR,3.0,,TOYT,PA,...,1822 WINONA BLVD,00402,54.0,80.56E4+,RED ZONE,93.0,6470239.0,1860397.0,3,1
2777558,1120840291,2017/03/28 12:00:00 AM,1050.0,,,CA,201708.0,,HOND,PA,...,710 EL CENTRO AV,00001,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0,3,28
2777651,4308029526,2017/05/15 12:00:00 AM,134.0,,,CA,,,LEXS,PA,...,1701 VINE ST,00402,54.0,80.69B,NO PARKING,73.0,6462770.0,1859525.0,5,15


### Remove 'Useless' Variables

In [47]:
# Columns left out of the reduced dataset.
dropped_columns = [
    'Ticket number',
    'Issue Date',
    'Plate Expiry Date',
    'Body Style',
    'VIN',
    'Marked Time',
    'Route',
    'Agency',
]
la_ticket_2017_reduced = la_ticket_2017.drop(columns=dropped_columns)

In [49]:
# First five rows of the reduced frame.
la_ticket_2017_reduced.head(n=5)

Unnamed: 0,Issue time,Meter Id,RP State Plate,Make,Color,Location,Violation code,Violation Description,Fine amount,Latitude,Longitude,Month,Day
2735704,2205.0,,CA,HOND,BK,1323 S FLOWER ST,4000A1,NO EVIDENCE OF REG,50.0,6480729.0,1836883.0,12,18
2771883,800.0,,CA,FRHT,WH,INDIANA/NOAKES,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0,5,11
2777524,104.0,,OR,TOYT,BL,1822 WINONA BLVD,80.56E4+,RED ZONE,93.0,6470239.0,1860397.0,3,1
2777558,1050.0,,CA,HOND,BK,710 EL CENTRO AV,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0,3,28
2777651,134.0,,CA,LEXS,BL,1701 VINE ST,80.69B,NO PARKING,73.0,6462770.0,1859525.0,5,15


### Write Reduced 2017 Dataset

In [50]:
# Write the reduced 2017 frame (row index included) to the working directory.
reduced_csv_path = 'la_ticket_2017.csv'
la_ticket_2017_reduced.to_csv(reduced_csv_path)

In [51]:
# Size of the reduced file in GB. It has to come in under 100 MB (0.1 GB)
# for GitHub, so at this point it is still too large.
os.stat('la_ticket_2017.csv').st_size / 10**9

0.253930927

### Divide Dataset into 4 parts (one per quarter) for GitHub upload

In [59]:
# Zero-padded two-digit month labels, grouped by calendar quarter.
seq_1 = ['{:02d}'.format(month) for month in range(1, 4)]
seq_2 = ['{:02d}'.format(month) for month in range(4, 7)]
seq_3 = ['{:02d}'.format(month) for month in range(7, 10)]
seq_4 = ['{:02d}'.format(month) for month in range(10, 13)]

In [68]:
# One boolean mask per quarter, based on the two-digit 'Month' strings.
month_labels = la_ticket_2017_reduced['Month']
index_1 = month_labels.isin(seq_1)
index_2 = month_labels.isin(seq_2)
index_3 = month_labels.isin(seq_3)
index_4 = month_labels.isin(seq_4)

In [69]:
# Select the rows of each quarter with its boolean mask.
la_ticket_2017_1 = la_ticket_2017_reduced.loc[index_1]
la_ticket_2017_2 = la_ticket_2017_reduced.loc[index_2]
la_ticket_2017_3 = la_ticket_2017_reduced.loc[index_3]
la_ticket_2017_4 = la_ticket_2017_reduced.loc[index_4]

In [81]:
# (rows, columns) of the fourth-quarter subset.
la_ticket_2017_4.shape

(534139, 13)

In [71]:
# Write each quarterly subset to its own CSV file in the working directory.
quarterly_outputs = (
    (la_ticket_2017_1, 'la_ticket_2017_1.csv'),
    (la_ticket_2017_2, 'la_ticket_2017_2.csv'),
    (la_ticket_2017_3, 'la_ticket_2017_3.csv'),
    (la_ticket_2017_4, 'la_ticket_2017_4.csv'),
)
for quarter_frame, csv_name in quarterly_outputs:
    quarter_frame.to_csv(csv_name)

In [72]:
# Size of the first-quarter file in GB (must stay below 0.1).
os.stat('la_ticket_2017_1.csv').st_size / 10**9

0.063464771

In [73]:
# Size of the second-quarter file in GB (must stay below 0.1).
os.stat('la_ticket_2017_2.csv').st_size / 10**9

0.066494809

In [74]:
# Size of the third-quarter file in GB (must stay below 0.1).
os.stat('la_ticket_2017_3.csv').st_size / 10**9

0.063882298

In [75]:
# Size of the fourth-quarter file in GB (must stay below 0.1).
os.stat('la_ticket_2017_4.csv').st_size / 10**9

0.060089451

### Trim Complete