# Data overview and cleaning
---

- [Columns](#Columns)   
- [Remove garbage values](#Remove-garbage-values)
- [Fix mixed types error](#Fix-mixed-types-error)

In [1]:
import pandas as pd

In [2]:
traffic_tickets = pd.read_csv('./Traffic_Tickets_Issued__Four_Year_Window.csv')

  interactivity=interactivity, compiler=compiler, result=result)


*This error will be addressed in the next section*

In [4]:
"Total traffic tickets: {:,}".format(len(traffic_tickets))

'Total traffic tickets: 6,298,032'

## Columns

- View dframe columns

In [5]:
for col in traffic_tickets.columns:
    print(col + "\r")

Violation Charged Code
Violation Description
Violation Year
Violation Month
Violation Day of Week
Age at Violation
Gender
State of License
Police Agency
Court
Source


- Remove unused columns

In [6]:
traffic_tickets = traffic_tickets[[
    "Violation Description", 
    "Violation Year", 
    "Violation Month",
    "Violation Day of Week", 
    "Age at Violation", 
    "State of License", 
    "Gender",
]]

traffic_tickets.columns

Index(['Violation Description', 'Violation Year', 'Violation Month',
       'Violation Day of Week', 'Age at Violation', 'State of License',
       'Gender'],
      dtype='object')

- Create shorter column names

In [7]:
traffic_tickets = traffic_tickets.rename(columns={
    'Violation Description': 'Violation', 
    'Violation Year': 'Year', 
    'Violation Month': 'Month', 
    'Violation Day of Week': 'Day',
    'Age at Violation': 'Age', 
    'State of License': 'State'
})

traffic_tickets.columns

Index(['Violation', 'Year', 'Month', 'Day', 'Age', 'State', 'Gender'], dtype='object')

## Remove missing values

In [8]:
traffic_tickets = traffic_tickets.dropna()

6230731

In [9]:
traffic_tickets.head()

Unnamed: 0,Violation,Year,Month,Day,Age,State,Gender
0,UNINSPECTED MOTOR VEHICLE,2016,3.0,TUESDAY,21.0,NEW YORK,M
1,SPEED OVER 55 ZONE,2016,3.0,SATURDAY,18.0,NEW YORK,F
2,SPEED OVER 55 ZONE,2016,2.0,WEDNESDAY,55.0,NEW YORK,M
3,SPEED IN ZONE,2016,2.0,SUNDAY,41.0,NEW YORK,M
4,SPEED OVER 55 ZONE,2016,1.0,THURSDAY,25.0,NEW YORK,M


In [10]:
traffic_tickets.tail()

Unnamed: 0,Violation,Year,Month,Day,Age,State,Gender
6298022,OPERATING MV MOBILE PHONE,2017,8.0,MONDAY,29.0,NEW YORK,M
6298023,PLATE MISSING/INSECURE/DIRTY,2017,8.0,MONDAY,46.0,NEW YORK,M
6298024,DISOBEYED TRAFFIC DEVICE,2017,8.0,MONDAY,22.0,MARYLAND,M
6298025,IMPROPER SIGNAL,2017,8.0,MONDAY,36.0,NEW JERSEY,M
6298026,COM VEH ON PKWAY NYC,2017,8.0,MONDAY,42.0,NEW YORK,M


## Fix mixed types error 

- Generate sets of each column

In [12]:
traffic_ticket_sets = {}
for col in traffic_tickets.columns:
    traffic_ticket_sets[col] = set(traffic_tickets[col])

traffic_ticket_sets.keys()

dict_keys(['Violation', 'Year', 'Month', 'Day', 'Age', 'State', 'Gender'])

- Find type(s) of each column set

In [13]:
for key in traffic_ticket_sets.keys():
    print("{0}: {1}".format(
        key,
        set([type(i) for i in traffic_ticket_sets[key]])))

Violation: {<class 'str'>}
Year: {<class 'int'>, <class 'str'>}
Month: {<class 'numpy.float64'>}
Day: {<class 'str'>}
Age: {<class 'numpy.float64'>}
State: {<class 'str'>}
Gender: {<class 'str'>}


### Clean `Year` column

In [14]:
traffic_ticket_sets["Year"]

{2016, 2017, '2017', 2014, 2015}

In [15]:
traffic_tickets = traffic_tickets.replace(to_replace="2017", value=2017)

In [16]:
set(traffic_tickets.Year)

{2014, 2015, 2016, 2017}

### Clean `State` column

In [17]:
len(traffic_ticket_sets["State"])

77

In [18]:
traffic_ticket_sets["State"]

{'ALABAMA',
 'ALASKA',
 'ALBERTA CANADA',
 'AMERICAN SAMOA',
 'ARIZONA',
 'ARKANSAS',
 'ARMED FORCES',
 'ARMED FORCES AMERICA',
 'BRITISH COLUMBIA CANADA',
 'CALIFORNIA',
 'COLORADO',
 'CONNECTICUT',
 'DELAWARE',
 'DISTRICT OF COLUMBIA',
 'FLORIDA',
 'FOREIGN LICENSE',
 'GEORGIA',
 'GUAM',
 'HAWAII',
 'IDAHO',
 'ILLINOIS',
 'INDIANA',
 'INTERNATIONAL LICENSE',
 'IOWA',
 'KANSAS',
 'KENTUCKY',
 'LOUISIANA',
 'MAINE',
 'MANITOBA',
 'MARYLAND',
 'MASSACHUSETTS',
 'MEXICO',
 'MICHIGAN',
 'MINNESOTA',
 'MISSISSIPPI',
 'MISSOURI',
 'MONTANA',
 'NEBRASKA',
 'NEVADA',
 'NEW BRUNSWICK CANADA',
 'NEW HAMPSHIRE',
 'NEW JERSEY',
 'NEW MEXICO',
 'NEW YORK',
 'NEWFOUNDLAND CANADA',
 'NORTH CAROLINA',
 'NORTH DAKOTA',
 'NORTHWEST TERRITORIES CANADA',
 'NOVA SCOTIA CANADA',
 'NUNAVUT CANADA',
 'OHIO',
 'OKLAHOMA',
 'ONTARIO CANADA',
 'OREGON',
 'PANAMA',
 'PENNSYLVANIA',
 'PRINCE EDWARD ISLAND CANADA',
 'PUERTO RICO',
 'QUEBEC CANADA',
 'RHODE ISLAND',
 'SASKATCHEWAN CANADA',
 'SOUTH CAROLINA',
 'SOUT

- Get list of US states of interest
    - [List of states repo](https://gist.github.com/iamjason/8f8f4bc00c13de86bcad)

In [19]:
states = ["Alaska",
          "Alabama",
          "Arkansas",
          "American Samoa",
          "Arizona",
          "California",
          "Colorado",
          "Connecticut",
          "District of Columbia",
          "Delaware",
          "Florida",
          "Georgia",
          "Guam",
          "Hawaii",
          "Iowa",
          "Idaho",
          "Illinois",
          "Indiana",
          "Kansas",
          "Kentucky",
          "Louisiana",
          "Massachusetts",
          "Maryland",
          "Maine",
          "Michigan",
          "Minnesota",
          "Missouri",
          "Mississippi",
          "Montana",
          "North Carolina",
          "North Dakota",
          "Nebraska",
          "New Hampshire",
          "New Jersey",
          "New Mexico",
          "Nevada",
          "New York",
          "Ohio",
          "Oklahoma",
          "Oregon",
          "Pennsylvania",
          "Puerto Rico",
          "Rhode Island",
          "South Carolina",
          "South Dakota",
          "Tennessee",
          "Texas",
          "Utah",
          "Virginia",
          "Virgin Islands",
          "Vermont",
          "Washington",
          "Wisconsin",
          "West Virginia",
          "Wyoming"]

- Uppercase states for comparison with existing states

In [20]:
states = [ s.upper() for s in states ]

- Generate list of **weird states.** A state is weird is if not in the states list

In [21]:
weird_states = [col_state for col_state in traffic_ticket_sets["State"] if
                    str(col_state) not in [fifty_state for fifty_state in states]]

print("Showing {0} weird states: \n".format(len(weird_states)))
for i, ws in enumerate(weird_states):
    print(i, ws)

Showing 22 weird states: 

0 BRITISH COLUMBIA CANADA
1 NORTHWEST TERRITORIES CANADA
2 PANAMA
3 NUNAVUT CANADA
4 NOVA SCOTIA CANADA
5 UNKNOWN
6 FOREIGN LICENSE
7 INTERNATIONAL LICENSE
8 ONTARIO CANADA
9 NEW BRUNSWICK CANADA
10 PRINCE EDWARD ISLAND CANADA
11 US GOVERNMENT
12 MEXICO
13 ARMED FORCES
14 QUEBEC CANADA
15 YUKON TERRITORY CANADA
16 SASKATCHEWAN CANADA
17 ALBERTA CANADA
18 US FOREIGN DIPLOMATS
19 ARMED FORCES AMERICA
20 MANITOBA
21 NEWFOUNDLAND CANADA


- Create new dframe without weird states

In [22]:
traffic_tickets = traffic_tickets[~traffic_tickets.State.isin(weird_states)]

In [23]:
set(traffic_tickets.State)

{'ALABAMA',
 'ALASKA',
 'AMERICAN SAMOA',
 'ARIZONA',
 'ARKANSAS',
 'CALIFORNIA',
 'COLORADO',
 'CONNECTICUT',
 'DELAWARE',
 'DISTRICT OF COLUMBIA',
 'FLORIDA',
 'GEORGIA',
 'GUAM',
 'HAWAII',
 'IDAHO',
 'ILLINOIS',
 'INDIANA',
 'IOWA',
 'KANSAS',
 'KENTUCKY',
 'LOUISIANA',
 'MAINE',
 'MARYLAND',
 'MASSACHUSETTS',
 'MICHIGAN',
 'MINNESOTA',
 'MISSISSIPPI',
 'MISSOURI',
 'MONTANA',
 'NEBRASKA',
 'NEVADA',
 'NEW HAMPSHIRE',
 'NEW JERSEY',
 'NEW MEXICO',
 'NEW YORK',
 'NORTH CAROLINA',
 'NORTH DAKOTA',
 'OHIO',
 'OKLAHOMA',
 'OREGON',
 'PENNSYLVANIA',
 'PUERTO RICO',
 'RHODE ISLAND',
 'SOUTH CAROLINA',
 'SOUTH DAKOTA',
 'TENNESSEE',
 'TEXAS',
 'UTAH',
 'VERMONT',
 'VIRGIN ISLANDS',
 'VIRGINIA',
 'WASHINGTON',
 'WEST VIRGINIA',
 'WISCONSIN',
 'WYOMING'}

In [25]:
len(set(traffic_tickets.State)) #magic num is 55

55

In [26]:
traffic_tickets.to_pickle('clean_data.pkl')