# MIDS W207 Fall 2017 Final Project¶
## Data Set Up - Data Cleaning and Feature Engineering
Laura Williams, Kim Vignola, Cyprian Gascoigne  
SF Crime Classification

This notebook reads raw data (saved in a zip file) from Kaggle, processes and organizes the data for training a variety of machine learning models, and outputs the data as zipped csv files that other notebooks can unzip and use to train different models.

The intention is that data cleaning and/or feature engineering will be added to this file as we progress through the project and look for additional way to process the data to improve our predictions.

For ease of processing this data, exploratory data analysis will be in a separate notebook.

Single zipped output file (called data.zip) includes:  

1) train_data.csv and train_labels.csv - includes 80% of the total training data, for training models that are not yet going to be submitted to Kaggle

2) dev_data.csv and dev_labels.csv - includes 20% of the total training data, for testing models before they are submitted to Kaggle

3) train_data_all.csv and train_labels_all.csv - includes all the training data. After testing models with the train and dev data split above, train the model from this full set of data for submission to Kaggle.

4) test_data_all.csv - create predictions on this data for submission to Kaggle.

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import zipfile

In [2]:
# Unzip raw data into a subdirectory 
unzip_files = zipfile.ZipFile("raw_data.zip", "r")
unzip_files.extractall("raw_data")
unzip_files.close()

In [3]:
# Read CSV files into pandas dataframes
train = pd.read_csv("raw_data/train.csv")
test = pd.read_csv("raw_data/test.csv")
weather = pd.read_csv("raw_data/SF_county.csv")

In [5]:
# import datetime and holiday modules (note this takes a a few minutes to run)
from datetime import datetime, timedelta, date
import holidays

# extract month, year and hour from both datasets
train["month"] = train["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").month)
train["year"] = train["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").year)
train["hour"] = train["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").hour)
#train["day"] = train["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").day)

test["month"] = test["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").month)
test["year"] = test["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").year)
test["hour"] = test["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").hour)
#test["day"] = test["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S").day)

# map holidays
US_Holidays = holidays.UnitedStates()
train["holidays"] = train["Dates"].map(lambda x: x in US_Holidays)
test["holidays"] = test["Dates"].map(lambda x: x in US_Holidays)

In [18]:
#from datetime import timedelta, date I Will mess around with this later
#print("2015-1-1" in US_Holidays)
#print(date(2015,1,1)in US_Holidays)
#print(date(train["year"], train["month"], 1) + timedelta(days = 1))

# should also likely include seasons.  This is just a placeholder...

#seasons = {'Summer':(datetime(year,6,21), datetime(year,9,22)),
#           'Autumn':(datetime(year,9,23), datetime(year,12,20)),
#           'Spring':(datetime(year,3,21), datetime(year,6,20)),
#           'Winter':(datetime(year,3,21), datetime(year,6,20))}

In [19]:
# create a dictionary for dayparts
time_periods = {6:"early_morning", 7:"early_morning", 8:"early_morning", 
               9:"late_morning", 10:"late_morning", 11:"late_morning",
              12:"early_afternoon", 13:"early_afternoon", 14:"early_afternoon",
              15:"late_afternoon", 16:"late_afternoon", 17:"late_afternoon",
              18:"early_evening",  19:"early_evening",  20:"early_evening",
              21:"late_evening", 22:"late_evening", 23:"late_evening",
              0:"late_night", 1:"late_night", 2:"late_night",
              3:"late_night", 4:"late_night", 5:"late_night"}

# map time periods
train["dayparts"] = train["hour"].map(time_periods)
test["dayparts"] = test["hour"].map(time_periods)

In [7]:
# clean up weather data
del weather['NAME']
weather["SNOW"] = weather["SNOW"].fillna(0)

In [8]:
# drop time from train and test date fields to be able to map Dates against weather data; remove hyphens too.
train["Dates"] = train["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))
train["Dates"] = train["Dates"].map(lambda x: datetime.strftime(x,"%Y%m%d"))
test["Dates"] = test["Dates"].map(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))
test["Dates"] = test["Dates"].map(lambda x: datetime.strftime(x,"%Y%m%d"))

# Convert Weather DATE to same format as train and test data
weather["DATE"] = weather["DATE"].map(lambda x: datetime.strptime(x,"%m/%d/%y"))
weather["DATE"] = weather["DATE"].map(lambda x: datetime.strftime(x,"%Y%m%d"))

In [9]:
# confirm that dates are now in the same format
print(train["Dates"][0], train["Dates"].dtypes)
print(test["Dates"][0], test["Dates"].dtypes)
print(weather["DATE"][0], weather["DATE"].dtypes)

20150513 object
20150510 object
20030101 object


In [10]:
# convert date objects to numeric
train["Dates"] = pd.to_numeric(train["Dates"])
test["Dates"] = pd.to_numeric(test["Dates"])
weather["DATE"] = pd.to_numeric(weather["DATE"])
print(type(train["Dates"][0]))

<class 'numpy.int64'>


In [11]:
# left merge weather data based on dates
weather_train = pd.merge(train, weather, how='left', left_on="Dates", right_on = "DATE")
del weather_train['DATE']
weather_test = pd.merge(test, weather, how='left', left_on="Dates", right_on = "DATE")
del weather_test['DATE']

Next, remove outliers.

In [11]:
# Data indicates outliers with latitude = 90 (aka the North Pole). Test data has these same outliers.
# weather_train.Y.describe()

count    878049.000000
mean         37.771020
std           0.456893
min          37.707879
25%          37.752427
50%          37.775421
75%          37.784369
max          90.000000
Name: Y, dtype: float64

In [30]:
# Instead of omitting columns with erroneous lat/long data putting this as a placehold to look into replacing based on district

In [12]:
# remove cells where latitude > 38
# weather_train = weather_train[weather_train.Y < 38]
# weather_test = weather_test[weather_test.Y < 38]

In [12]:
# print new shape
# print(weather_train.shape)
#print(weather_test.shape)

# print("Cases removed from train data =", np.sum(878049 - weather_train.shape[0]))
# print("Cases removed from test data =", np.sum(884262 - weather_test.shape[0]))

(878049, 18)
(884262, 16)
Cases removed from train data = 0
Cases removed from test data = 0


In [13]:
print(train.columns)
print(weather_train.columns)
print(test.columns)
print(weather_test.columns)

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y', 'month', 'year', 'hour', 'holidays',
       'dayparts'],
      dtype='object')
Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y', 'month', 'year', 'hour', 'holidays',
       'dayparts', 'PRCP', 'SNOW', 'TMAX', 'TMIN'],
      dtype='object')
Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'month',
       'year', 'hour', 'holidays', 'dayparts'],
      dtype='object')
Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'month',
       'year', 'hour', 'holidays', 'dayparts', 'PRCP', 'SNOW', 'TMAX', 'TMIN'],
      dtype='object')


In [15]:
#weather_train["holidays"] = pd.to_numeric(weather_train["holidays"])
#weather_test["holidays"] = pd.to_numeric(weather_test["holidays"])

weather_train["holidays"] = weather_train["holidays"].astype(int)
weather_test["holidays"] = weather_test["holidays"].astype(int)
print(type(weather_train["holidays"][0]))
print(type(weather_test["holidays"][0]))


<class 'numpy.int64'>
<class 'numpy.int64'>


In [14]:
# Encode string features into numeric features
LE = preprocessing.LabelEncoder()

train_data_all = pd.get_dummies(weather_train, columns = ["PdDistrict", "DayOfWeek", "month", "year", "dayparts", "hour"])
del train_data_all["Descript"]
del train_data_all["Resolution"]
#del train_data_all["day"]
train_labels_all = np.array(train_data_all['Category'])
del train_data_all["Category"]

train_data_all["Dates"] = LE.fit_transform(train_data_all["Dates"])
train_data_all["Address"] = LE.fit_transform(train_data_all["Address"])
train_data_all.reindex()

test_data_all = pd.get_dummies(weather_test, columns = ["PdDistrict", "DayOfWeek", "month", "year", "dayparts", "hour"])

test_data_all["Dates"] = LE.fit_transform(test_data_all["Dates"])
test_data_all["Address"] = LE.fit_transform(test_data_all["Address"])
del test_data_all["Id"]
#del test_data_all["day"]
                                 
print(test_data_all.columns == train_data_all.columns)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]


In [15]:
print(train_data_all.head(3))

   Dates  Address           X          Y  holidays  PRCP  SNOW  TMAX  TMIN  \
0   2248    19790 -122.425892  37.774599     False   0.0   0.0    61    50   
1   2248    19790 -122.425892  37.774599     False   0.0   0.0    61    50   
2   2248    22697 -122.424363  37.800414     False   0.0   0.0    61    50   

   PdDistrict_BAYVIEW   ...     hour_14  hour_15  hour_16  hour_17  hour_18  \
0                   0   ...           0        0        0        0        0   
1                   0   ...           0        0        0        0        0   
2                   0   ...           0        0        0        0        0   

   hour_19  hour_20  hour_21  hour_22  hour_23  
0        0        0        0        0        1  
1        0        0        0        0        1  
2        0        0        0        0        1  

[3 rows x 82 columns]


In [29]:
# First pass at normalization -- it doesn't give errors but it just endlessly processes.

for i in train_data_all.columns:
    col_mean = (train_data_all.loc[:]).mean()
    col_sd = (train_data_all.loc[:]).std()
    train_data_all[i] = train_data_all[i].apply(lambda x: (x - col_mean ) / (col_sd ))
    

KeyboardInterrupt: 

In [32]:
print(col_sd)

Dates                       659.900837
Address                    6046.321721
X                             0.025285
Y                             0.024165
holidays                      0.175814
PRCP                          0.208314
SNOW                          0.000000
TMAX                          7.446029
TMIN                          4.475019
PdDistrict_BAYVIEW            0.302450
PdDistrict_CENTRAL            0.296408
PdDistrict_INGLESIDE          0.285892
PdDistrict_MISSION            0.343394
PdDistrict_NORTHERN           0.324863
PdDistrict_PARK               0.230238
PdDistrict_RICHMOND           0.220983
PdDistrict_SOUTHERN           0.383367
PdDistrict_TARAVAL            0.262919
PdDistrict_TENDERLOIN         0.290659
DayOfWeek_Friday              0.359319
DayOfWeek_Monday              0.345391
DayOfWeek_Saturday            0.351522
DayOfWeek_Sunday              0.339488
DayOfWeek_Thursday            0.349463
DayOfWeek_Tuesday             0.349378
DayOfWeek_Wednesday      

In [17]:
# Shuffle data and set aside 20% as development data
train_data_all = train_data_all.values
test_data_all = test_data_all.values
n = train_data_all.shape[0]

shuffle = np.random.permutation(np.arange(train_data_all.shape[0]))

train_data_all = train_data_all[shuffle]
train_labels_all = train_labels_all[shuffle]

n_train = int(0.8*n)

train_data = train_data_all[:n_train,:]
train_labels = train_labels_all[:n_train]
dev_data = train_data_all[n_train:,:]
dev_labels = train_labels_all[n_train:]

In [18]:
# print shapes and some data to compare before and after csv conversion
print("train_data shape is", train_data.shape)
print("train_labels shape is", train_labels.shape)
print("dev_data shape is", dev_data.shape)
print("dev_labels shape is", dev_labels.shape)
print("train_data_all shape is", train_data_all.shape)
print("train_data_all shape is", train_data_all.shape)
print("train_labels_all shape is", train_labels_all.shape)
print("test_data_all shape is", test_data_all.shape)

train_data shape is (702385, 59)
train_labels shape is (702385,)
dev_data shape is (175597, 59)
dev_labels shape is (175597,)
train_data_all shape is (877982, 59)
train_data_all shape is (877982, 59)
train_labels_all shape is (877982,)
test_data_all shape is (884186, 59)


In [19]:
# Save arrays as CSV files in a subdirectory

# NOTE: mkdir will make a "csv" directory in your local repo if there is not already one there.
# It will return an error if the directory already exists in your local repo
# but that will not impact how this code runs

! mkdir csv
np.savetxt("csv/train_data.csv", train_data, delimiter=",")
np.savetxt("csv/train_labels.csv", train_labels, fmt="%s", delimiter=",")
np.savetxt("csv/dev_data.csv", dev_data, delimiter=",")
np.savetxt("csv/dev_labels.csv", dev_labels, fmt="%s", delimiter=",")
#np.savetxt("csv/train_data_all.csv", train_data_all, delimiter=",")
#np.savetxt("csv/train_labels_all.csv", train_labels_all, fmt="%s", delimiter=",")
np.savetxt("csv/test_data_all.csv", test_data_all, delimiter=",")

In [20]:
# Zip up the CSV files

# **IMPORTANT**  This code will rewrite the existing data.zip file in your local repo
# You will need to push it to the group repo for everyone to have the updated zip file

zip_files = zipfile.ZipFile("data.zip", "w")
zip_files.write("csv/train_data.csv", compress_type=zipfile.ZIP_DEFLATED)
zip_files.write("csv/train_labels.csv", compress_type=zipfile.ZIP_DEFLATED)
zip_files.write("csv/dev_data.csv", compress_type=zipfile.ZIP_DEFLATED)
zip_files.write("csv/dev_labels.csv", compress_type=zipfile.ZIP_DEFLATED)
#zip_files.write("csv/train_data_all.csv", compress_type=zipfile.ZIP_DEFLATED)
#zip_files.write("csv/train_labels_all.csv", compress_type=zipfile.ZIP_DEFLATED)
zip_files.write("csv/test_data_all.csv", compress_type=zipfile.ZIP_DEFLATED)
zip_files.close()

In [22]:
# Get baseline KNN accuracy with new features

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 1, n_jobs = -1)
neigh.fit(train_data, train_labels)
knn_pred = neigh.predict(dev_data)
print("KNN Accuracy = ", np.mean(knn_pred == dev_labels))


KNN Accuracy =  0.223785144393
