In [1]:
import pandas as pd
import requests


In [2]:
# Read the source CSV into a dataframe.

csv_data = '../test-data.csv'

data = pd.read_csv(csv_data)

# Display the column names as a plain list so the unwanted ones can be identified.
data.columns.tolist()

['raw_row_number',
 'date',
 'time',
 'location',
 'lat',
 'lng',
 'district',
 'subject_age',
 'subject_race',
 'subject_sex',
 'type',
 'arrest_made',
 'citation_issued',
 'outcome',
 'contraband_found',
 'search_conducted',
 'search_vehicle',
 'search_basis',
 'reason_for_stop',
 'raw_search_vehicle_description',
 'raw_result_of_contact_description']

In [3]:
# Keep only the fields of interest by dropping every other column.
# `drop` returns a new frame, so `data` itself is left unmodified.

df = data.drop(
    columns=[
        'raw_row_number',
        'location',
        'district',
        'type',
        'citation_issued',
        'contraband_found',
        'search_conducted',
        'search_vehicle',
        'reason_for_stop',
        'raw_search_vehicle_description',
        'raw_result_of_contact_description',
        'search_basis',
    ]
)

# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,date,time,lat,lng,subject_age,subject_race,subject_sex,arrest_made,warning_issued,outcome
0,1/1/2015,1:00:00,37.736362,-122.390097,23,black,male,False,True,warning
1,1/1/2015,1:00:00,37.742207,-122.422097,30,hispanic,female,False,True,warning
2,1/1/2015,1:00:00,37.745603,-122.419898,35,white,male,False,False,citation
3,1/1/2015,1:00:00,37.782485,-122.424125,44,white,male,False,False,citation
4,1/1/2015,1:00:00,37.742822,-122.481078,60,white,male,False,True,warning


In [4]:
# Replace missing values with a per-column placeholder.
# Columns without an entry here (such as lat and lng) keep their NaNs.

values = dict(
    date="1/1/1900",
    time="00:00:00",
    subject_age=0,
    subject_race="other",
    subject_sex="unknown",
    arrest_made="unknown",
    warning_issued="unknown",
    outcome="unknown",
)

df1 = df.fillna(values)

# head(-1) returns every row except the last one.
df1.head(-1)


Unnamed: 0,date,time,lat,lng,subject_age,subject_race,subject_sex,arrest_made,warning_issued,outcome
0,1/1/2015,1:00:00,37.736362,-122.390097,23,black,male,False,True,warning
1,1/1/2015,1:00:00,37.742207,-122.422097,30,hispanic,female,False,True,warning
2,1/1/2015,1:00:00,37.745603,-122.419898,35,white,male,False,False,citation
3,1/1/2015,1:00:00,37.782485,-122.424125,44,white,male,False,False,citation
4,1/1/2015,1:00:00,37.742822,-122.481078,60,white,male,False,True,warning
...,...,...,...,...,...,...,...,...,...,...
93,1/1/2015,19:20:00,37.780474,-122.475113,21,black,female,False,False,citation
94,1/1/2015,19:22:00,37.714951,-122.442500,20,hispanic,female,False,False,citation
95,1/1/2015,19:27:00,37.763640,-122.416281,21,black,male,False,True,warning
96,1/1/2015,19:29:00,37.714688,-122.447335,20,asian/pacific islander,female,False,True,warning


In [5]:
# Remove rows whose coordinates are missing.
# The check is restricted to lat/lng so the code matches that intent:
# a bare dropna() would discard a row for a NaN in *any* column.
# (Author's note from the original run: this removed 91 rows.)

df2 = df1.dropna(subset=['lat', 'lng'])

# head(-1) returns every row except the last one.
df2.head(-1)

Unnamed: 0,date,time,lat,lng,subject_age,subject_race,subject_sex,arrest_made,warning_issued,outcome
0,1/1/2015,1:00:00,37.736362,-122.390097,23,black,male,False,True,warning
1,1/1/2015,1:00:00,37.742207,-122.422097,30,hispanic,female,False,True,warning
2,1/1/2015,1:00:00,37.745603,-122.419898,35,white,male,False,False,citation
3,1/1/2015,1:00:00,37.782485,-122.424125,44,white,male,False,False,citation
4,1/1/2015,1:00:00,37.742822,-122.481078,60,white,male,False,True,warning
...,...,...,...,...,...,...,...,...,...,...
93,1/1/2015,19:20:00,37.780474,-122.475113,21,black,female,False,False,citation
94,1/1/2015,19:22:00,37.714951,-122.442500,20,hispanic,female,False,False,citation
95,1/1/2015,19:27:00,37.763640,-122.416281,21,black,male,False,True,warning
96,1/1/2015,19:29:00,37.714688,-122.447335,20,asian/pacific islander,female,False,True,warning


In [6]:
# Inspect the dtype of each column in df2 (the frame left after dropna).
df2.dtypes

date               object
time               object
lat               float64
lng               float64
subject_age         int64
subject_race       object
subject_sex        object
arrest_made          bool
outcome            object
dtype: object

In [7]:
# Export dataframe to CSV (disabled; uncomment the to_csv call below to write the file)

# df2.to_csv('cleaned_version.csv',index=False)

In [8]:
# Export the dataframe to MongoDB, one document per row.

import pymongo

conn = 'mongodb://localhost:27017'

client = pymongo.MongoClient(conn)

db = client.test_db

collection = db.stops

# Insert through the `collection` handle. Attribute access on a pymongo
# Database resolves to the collection of that name, so `db.collection`
# would write to a collection literally called "collection", not "stops".
records = df2.to_dict("records")

collection.insert_many(records)

<pymongo.results.InsertManyResult at 0x2354d281548>

In [9]:
# Loop through the MongoDB collection to confirm the documents were inserted.

# police_stops = db.collection.find()
# for stop in police_stops:
#     print(stop)

