In [1]:
"""
A class for parsing the input data from the input 
data file to the parsed data file (files specified in the file_handler)
"""

class InputDataParser(object):
    """Convert raw CSV rows into numeric feature rows and write them to disk.

    The raw rows and the output filenames are obtained from the
    ``file_handler`` given to the constructor. Each field is encoded by a
    dedicated parser object (date, day of week, police district, address
    type, category).

    Normalized date/hour and normalized X/Y coordinates are available from
    the date and location parsers but are currently not part of the
    emitted feature rows.
    """

    def __init__(self, file_handler):
        """Store the file handler and create one parser per input field."""
        self.file_handler = file_handler
        self.date_parser = DateParser()
        self.day_of_week_parser = DayOfWeekParser()
        self.pd_district_parser = PdDistrictParser()
        self.address_type_parser = AddressTypeParser()
        self.location_parser = LocationParser()
        self.category_parser = CategoryParser()

    def parse_input_data(self):
        """Parse the input (training) rows into the parsed data file.

        The first input row is treated as a header and skipped. Every
        written row holds the feature values followed by the numeric
        category value. Any previous content of the output file is replaced.
        """
        self.__write_parsed_rows(
            self.file_handler.get_input_data(),
            self.file_handler.get_parsed_data_filename(),
            self.__parse)

    def parse_test_data(self):
        """Parse the test rows into the parsed test data file.

        The first test row is treated as a header and skipped. Written rows
        hold feature values only (no category value). Any previous content
        of the output file is replaced.
        """
        self.__write_parsed_rows(
            self.file_handler.get_test_data(),
            self.file_handler.get_parsed_test_data_filename(),
            self.__parse_test_row)

    @staticmethod
    def __open_csv_for_writing(filename):
        """Open ``filename`` for csv writing, truncating existing content.

        The csv module wants a binary handle on Python 2 and a text handle
        opened with ``newline=''`` on Python 3.
        """
        import sys
        if sys.version_info[0] >= 3:
            return open(filename, "w", newline="")
        return open(filename, "wb")

    def __write_parsed_rows(self, rows, output_filename, parse_row):
        """Write ``parse_row(row)`` for every row after the header row.

        rows            -- iterable of raw rows; the first one is skipped
        output_filename -- csv file to (re)create
        parse_row       -- callable turning a raw row into a feature list
        """
        # Opening for writing already truncates the file, so no separate
        # "erase" step is needed.
        with self.__open_csv_for_writing(output_filename) as output_file:
            writer = csv.writer(output_file)
            remaining_rows = iter(rows)
            next(remaining_rows, None)  # skip the header row, if any
            for row in remaining_rows:
                writer.writerow(parse_row(row))

    def __build_features(self, date, day_of_week, pd_district, address):
        """Return the feature list shared by training and test rows.

        Order: awake flag, year / month / day / season vectors, day-of-week
        vector, district vector, address-type vectors, intersection flag.
        """
        features = [self.date_parser.get_awake(date)]
        features += self.date_parser.get_year_vector(date)
        features += self.date_parser.get_month_vector(date)
        features += self.date_parser.get_day_vector(date)
        features += self.date_parser.get_seasons_vector(date)
        features += self.day_of_week_parser.get_binary_week_vector(day_of_week)
        features += self.pd_district_parser.get_binary_pd_districts_vector(pd_district)
        features += self.address_type_parser.get_distinct_binary_address_types_vector(address)
        features.append(self.address_type_parser.get_intersection(address))
        return features

    def __parse(self, row):
        """Parse one training row.

        Columns read: 0 = date, 1 = category, 3 = day of week,
        4 = district, 6 = address. The category value is appended last.
        """
        parsed_row = self.__build_features(row[0], row[3], row[4], row[6])
        parsed_row.append(self.category_parser.get_category_value(row[1]))
        return parsed_row

    def __parse_test_row(self, row):
        """Parse one test row.

        Columns read: 1 = date, 2 = day of week, 3 = district, 4 = address.
        """
        return self.__build_features(row[1], row[2], row[3], row[4])
    
    

In [2]:
import csv
import datetime
import numpy

In [3]:
"""
Class for handling the filenames of each of the files 
needed for the implementation of the neural network, 
and also for reading the input and parsed data files.
"""

class FileHandler(object):
    """Hold the csv filenames used by the pipeline and read rows from them.

    Four files are tracked: the raw input data, the parsed input data, the
    raw test data and the parsed test data. The ``get_*_data`` methods are
    generators, so a file is only opened once iteration starts.
    """

    def __init__(self, input_filename, parsed_data_filename, test_data_filename, parsed_test_data_filename):
        """Remember the four filenames; no file is touched here."""
        self.input_filename = input_filename
        self.parsed_data_filename = parsed_data_filename
        self.test_data_filename = test_data_filename
        self.parsed_test_data_filename = parsed_test_data_filename

    @staticmethod
    def _open_csv_for_reading(filename):
        """Open ``filename`` the way the csv module expects.

        Python 2 wants a binary handle; Python 3 wants a text handle opened
        with ``newline=''`` (a binary handle makes ``csv.reader`` fail).
        """
        import sys
        if sys.version_info[0] >= 3:
            return open(filename, "r", newline="")
        return open(filename, "rb")

    def _iter_csv_rows(self, filename):
        """Yield every row of the csv file ``filename`` as a list of strings."""
        with self._open_csv_for_reading(filename) as csvfile:
            for row in csv.reader(csvfile):
                yield row

    def get_input_data(self):
        """A generator of the input data"""
        for row in self._iter_csv_rows(self.input_filename):
            yield row

    def get_test_data(self):
        """A generator of the test data"""
        for row in self._iter_csv_rows(self.test_data_filename):
            yield row

    def get_parsed_data(self):
        """A generator of the parsed data"""
        for row in self._iter_csv_rows(self.parsed_data_filename):
            yield row

    def get_parsed_test_data(self):
        """A generator of the parsed test data"""
        for row in self._iter_csv_rows(self.parsed_test_data_filename):
            yield row

    def get_parsed_data_filename(self):
        """Return the filename of the parsed input data file."""
        return self.parsed_data_filename

    def get_parsed_test_data_filename(self):
        """Return the filename of the parsed test data file."""
        return self.parsed_test_data_filename

# Input Data Parser

# Date Parser

In [4]:
"""
A class for parsing the date from the
input data into the normalized date and hour features.
"""

class DateParser(object):
    """Turn a date string into numeric features.

    Input strings are split on '-', ' ' and ':' and are expected to look
    like ``'2015-12-21 01:53:00'`` (year-month-day, a space, then the time).
    """

    # Bounds used for normalization and for the size of the year vector.
    MIN_DAY = 1
    MIN_MONTH = 1
    MIN_YEAR = 2003
    MAX_DAY = 13
    MAX_MONTH = 5
    MAX_YEAR = 2015

    MIN_HOUR = 0
    MAX_HOUR = 23

    JANUARY = 1
    FEBRUARY = 2
    MARCH = 3
    APRIL = 4
    MAY = 5
    JUNE = 6
    JULY = 7
    AUGUST = 8
    SEPTEMBER = 9
    OCTOBER = 10
    NOVEMBER = 11
    DECEMBER = 12

    SUMMER_MONTHS = [JUNE, JULY, AUGUST]
    FALL_MONTHS = [SEPTEMBER, OCTOBER, NOVEMBER]
    WINTER_MONTHS = [DECEMBER, JANUARY, FEBRUARY]
    SPRING_MONTHS = [MARCH, APRIL, MAY]

    def __init__(self):
        """Build the first and last reference dates from the MIN_*/MAX_* constants."""
        self.min_date = datetime.date(self.MIN_YEAR, self.MIN_MONTH, self.MIN_DAY)
        self.max_date = datetime.date(self.MAX_YEAR, self.MAX_MONTH, self.MAX_DAY)

    @staticmethod
    def _one_hot(size, position):
        """Return a list of ``size`` zeros with a single 1 at ``position``.

        Raises IndexError when ``position`` is outside ``[0, size)``. The
        explicit check matters for negative positions, which plain list
        indexing would silently wrap around to the end of the vector.
        """
        if not 0 <= position < size:
            raise IndexError(
                "one-hot position %d is outside [0, %d)" % (position, size))
        vector = [0] * size
        vector[position] = 1
        return vector

    def get_normalized_date(self, input_date):
        """Return days since the minimum date divided by the min-to-max span.

        Dates after the maximum date give values above 1.
        """
        date = self.__extract_date(input_date)
        return self.__normalize_date(date)

    def get_normalized_hour(self, input_date):
        """Return the hour scaled so that MIN_HOUR maps to 0 and MAX_HOUR to 1."""
        hour = self.__extract_hour(input_date)
        return self.__normalize_hour(hour)

    def __normalize_date(self, date):
        total_days = self.elapsed_days(self.min_date, self.max_date)
        # float() keeps this a true division on Python 2 as well.
        return float(self.elapsed_days(self.min_date, date)) / total_days

    def __normalize_hour(self, hour):
        return float(hour - self.MIN_HOUR) / (self.MAX_HOUR - self.MIN_HOUR)

    def elapsed_days(self, date1, date2):
        """Return the absolute number of days between the two dates."""
        return abs(date2 - date1).days

    def __extract_date(self, input_date):
        day = self.__extract_day(input_date)
        month = self.__extract_month(input_date)
        year = self.__extract_year(input_date)
        return datetime.date(year, month, day)

    def __extract_year(self, input_date):
        return int(input_date.split('-')[0])

    def __extract_month(self, input_date):
        return int(input_date.split('-')[1])

    def __extract_day(self, input_date):
        # Third '-' field is "<day> <time>"; keep the part before the space.
        day_and_time = input_date.split('-')[2]
        return int(day_and_time.split(' ')[0])

    def __extract_hour(self, input_date):
        # Third '-' field is "<day> <time>"; the hour leads the time part.
        day_and_time = input_date.split('-')[2]
        time = day_and_time.split(' ')[1]
        return int(time.split(':')[0])

    def get_awake(self, input_date):
        """Return 1 when the hour is 0 or between 8 and 23 inclusive, else 0."""
        hour = self.__extract_hour(input_date)
        if hour == 0 or 8 <= hour <= 23:
            return 1
        return 0

    def get_seasons_vector(self, input_date):
        """Return a 4-slot one-hot vector ordered [summer, fall, winter, spring].

        All slots stay 0 when the month belongs to none of the season lists.
        """
        month = self.__extract_month(input_date)
        months_by_slot = (
            self.SUMMER_MONTHS,
            self.FALL_MONTHS,
            self.WINTER_MONTHS,
            self.SPRING_MONTHS,
        )
        seasons_vector = [0] * 4
        for slot, months in enumerate(months_by_slot):
            if month in months:
                seasons_vector[slot] = 1
                break  # only the first matching season is flagged
        return seasons_vector

    def get_year_vector(self, input_date):
        """Return a one-hot vector with one slot per year from MIN_YEAR to MAX_YEAR.

        Raises IndexError when the year lies outside that range.
        """
        year = self.__extract_year(input_date)
        return self._one_hot(self.MAX_YEAR - self.MIN_YEAR + 1,
                             year - self.MIN_YEAR)

    def get_day_vector(self, input_date):
        """Return a 31-slot one-hot vector for the day of the month.

        Raises IndexError when the day is not in 1..31.
        """
        day = self.__extract_day(input_date)
        return self._one_hot(31, day - 1)

    def get_month_vector(self, input_date):
        """Return a 12-slot one-hot vector for the month.

        Raises IndexError when the month is not in 1..12.
        """
        month = self.__extract_month(input_date)
        return self._one_hot(12, month - 1)

# Test of the DateParser

In [5]:
# Smoke test of DateParser: print every feature derived from one sample date.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3.
date_parser = DateParser()
input_date = '2015-12-21 01:53:00'
print(date_parser.get_normalized_date(input_date))
print(date_parser.get_normalized_hour(input_date))
print(date_parser.get_awake(input_date))
print(date_parser.get_seasons_vector(input_date))
print(date_parser.get_year_vector(input_date))
print(date_parser.get_month_vector(input_date))
print(date_parser.get_day_vector(input_date))


1.04916943522
0.0434782608696
0
[0, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Day of Week Parser

In [6]:
class DayOfWeekParser(object):
    """One-hot encoder for English weekday names."""

    # Weekday name -> slot in the 7-element vector (Sunday comes first).
    daysOfWeek = {
        'Sunday': 0,
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 6,
    }

    def __init__(self):
        """Nothing to initialise; the name table lives on the class."""

    def get_binary_week_vector(self, input_day):
        """Return a 7-slot list with a 1 in the slot of ``input_day``.

        Raises KeyError when ``input_day`` is not a key of ``daysOfWeek``.
        """
        slot = self.daysOfWeek[input_day]
        encoded = [0 for _ in range(7)]
        encoded[slot] = 1
        return encoded
        

# Test of the DayOfWeekParser

In [7]:
# Smoke test of DayOfWeekParser with one sample weekday.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3.
day_of_week_parser = DayOfWeekParser()
input_day_of_week = 'Monday'
print(day_of_week_parser.get_binary_week_vector(input_day_of_week))

[0, 1, 0, 0, 0, 0, 0]


# District Parser

In [8]:
class PdDistrictParser(object):
    """One-hot encoder for police district names."""

    # District name -> slot in the 10-element vector.
    pd_districts = {
        'NORTHERN': 0,
        'PARK': 1,
        'RICHMOND': 2,
        'TARAVAL': 3,
        'CENTRAL': 4,
        'TENDERLOIN': 5,
        'MISSION': 6,
        'BAYVIEW': 7,
        'SOUTHERN': 8,
        'INGLESIDE': 9,
    }

    def __init__(self):
        """Nothing to initialise; the name table lives on the class."""

    def get_binary_pd_districts_vector(self, input_pd_district):
        """Return a 10-slot list with a 1 in the slot of ``input_pd_district``.

        Raises KeyError when the name is not a key of ``pd_districts``.
        """
        slot = self.pd_districts[input_pd_district]
        encoded = [0 for _ in range(10)]
        encoded[slot] = 1
        return encoded


# Test of the PdDistrictParser

In [9]:
# Smoke test of PdDistrictParser with one sample district.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3.
pd_district_parser = PdDistrictParser()
input_pd_district = 'INGLESIDE'
print(pd_district_parser.get_binary_pd_districts_vector(input_pd_district))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


# Location Parser

In [10]:
class LocationParser(object):
    """Scale X / Y coordinates relative to fixed reference bounds."""

    # Reference bounds used for scaling.
    # NOTE(review): MIN_X is numerically greater than MAX_X; because both
    # the offset and the span go through abs(), the X result is the distance
    # from MIN_X as a fraction of the span -- confirm this orientation is
    # the intended one.
    MIN_X = -122.3647507
    MAX_X = -122.5136421
    MIN_Y = 37.70787902
    MAX_Y = 37.82062084

    # Coordinate values treated as "no valid location".
    INVALID_X = -120.5
    INVALID_Y = 90

    # Returned in place of a scaled value for an invalid coordinate.
    DEFAULT_VALUE = 0.5

    def __init__(self):
        """Nothing to initialise; all settings are class constants."""

    def get_normalized_X(self, input_X):
        """Return |X - MIN_X| / |MAX_X - MIN_X|, or DEFAULT_VALUE for INVALID_X.

        ``input_X`` is converted with float() first.
        """
        longitude = float(input_X)
        if longitude != self.INVALID_X:
            return abs(longitude - self.MIN_X) / abs(self.MAX_X - self.MIN_X)
        return self.DEFAULT_VALUE

    def get_normalized_Y(self, input_Y):
        """Return |Y - MIN_Y| / |MAX_Y - MIN_Y|, or DEFAULT_VALUE for INVALID_Y.

        ``input_Y`` is converted with float() first.
        """
        latitude = float(input_Y)
        if latitude != self.INVALID_Y:
            return abs(latitude - self.MIN_Y) / abs(self.MAX_Y - self.MIN_Y)
        return self.DEFAULT_VALUE
        
        

# Test of the LocationParser

In [11]:
# Smoke test of LocationParser with one sample coordinate pair.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3.
location_parser = LocationParser()
input_X = '-122.43999999999999'
input_Y = '37.7999999999999'
print(location_parser.get_normalized_X(input_X))
print(location_parser.get_normalized_Y(input_Y))


0.505397222405
0.817096796911


# Category Parser

In [12]:
class CategoryParser(object):
    """Map a category name to its integer value."""

    # Category names; the position of a name in this tuple is its value.
    _CATEGORY_NAMES = (
        'ARSON',
        'ASSAULT',
        'BAD CHECKS',
        'BRIBERY',
        'BURGLARY',
        'DISORDERLY CONDUCT',
        'DRIVING UNDER THE INFLUENCE',
        'DRUG/NARCOTIC',
        'DRUNKENNESS',
        'EMBEZZLEMENT',
        'EXTORTION',
        'FAMILY OFFENSES',
        'FORGERY/COUNTERFEITING',
        'FRAUD',
        'GAMBLING',
        'KIDNAPPING',
        'LARCENY/THEFT',
        'LIQUOR LAWS',
        'LOITERING',
        'MISSING PERSON',
        'NON-CRIMINAL',
        'OTHER OFFENSES',
        'PORNOGRAPHY/OBSCENE MAT',
        'PROSTITUTION',
        'RECOVERED VEHICLE',
        'ROBBERY',
        'RUNAWAY',
        'SECONDARY CODES',
        'SEX OFFENSES FORCIBLE',
        'SEX OFFENSES NON FORCIBLE',
        'STOLEN PROPERTY',
        'SUICIDE',
        'SUSPICIOUS OCC',
        'TREA',
        'TRESPASS',
        'VANDALISM',
        'VEHICLE THEFT',
        'WARRANTS',
        'WEAPON LAWS',
    )

    # Category name -> integer value (0 .. 38).
    categories = dict(zip(_CATEGORY_NAMES, range(len(_CATEGORY_NAMES))))

    def __init__(self):
        """Nothing to initialise; the lookup table lives on the class."""

    def get_category_value(self, input_category):
        """Return the integer value stored for ``input_category``.

        Raises KeyError when the name is not a key of ``categories``.
        """
        value = self.categories[input_category]
        return value
        

# Test of the CategoryParser

In [13]:
# Smoke test of CategoryParser with one sample category.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3.
category_parser = CategoryParser()
input_category = 'RUNAWAY'
print(category_parser.get_category_value(input_category))

26


# Address Type Parser

In [14]:
class AddressTypeParser(object):
    """Encode the street type(s) of an address as binary vectors.

    A street's type is taken from how its text ends (``address_types_suffixes``)
    or, failing that, how it starts (``address_types_prefixes``). Matching
    uses plain ``endswith`` / ``startswith`` on the raw text. Addresses
    containing ' / ' are treated as intersections of two streets.
    """

    # Slot of each address type inside an 8-element vector.
    ALLEY = 0
    AVENUE = 1
    HIGHWAY = 2
    LANE = 3
    PLAZA = 4
    STAIRWAY = 5
    STREET = 6
    WALK = 7

    address_types_prefixes = {
        'I-'   : HIGHWAY,
    }

    address_types_suffixes = {
        'AL'   : ALLEY,
        'AV'   : AVENUE,
        'BL'   : AVENUE,
        'WAY'  : AVENUE,
        'WY'   : AVENUE,
        'EX'   : HIGHWAY,
        'HWY'  : HIGHWAY,
        'HY'   : HIGHWAY,
        'LN'   : LANE,
        'PZ'   : PLAZA,
        'STWY' : STAIRWAY,
        'CR'   : STREET,
        'CT'   : STREET,
        'DR'   : STREET,
        'PARK' : STREET,
        'PL'   : STREET,
        'RD'   : STREET,
        'RW'   : STREET,
        'ST'   : STREET,
        'TER'  : STREET,
        'TR'   : STREET,
        'WK'   : WALK
    }

    def __init__(self):
        pass

    @staticmethod
    def _longest_match(candidates, matches):
        """Return the longest candidate accepted by ``matches``, or None.

        Some suffixes end with another one ('HWY' and 'STWY' both end with
        'WY'), so taking whichever key is iterated first would make the
        result depend on dict ordering. Two different candidates of equal
        length cannot both match the same end of a string, so the longest
        match is unique.
        """
        found = [candidate for candidate in candidates if matches(candidate)]
        if not found:
            return None
        return max(found, key=len)

    def flag_address_types_vector_with_street_type(self, address_types_vector, street):
        """Set to 1, in place, the slot of ``street``'s address type.

        The longest matching suffix wins; prefixes are consulted only when
        no suffix matches. The vector is left untouched when nothing
        matches. Returns None.
        """
        suffix = self._longest_match(self.address_types_suffixes, street.endswith)
        if suffix is not None:
            address_types_vector[self.address_types_suffixes[suffix]] = 1
            return

        prefix = self._longest_match(self.address_types_prefixes, street.startswith)
        if prefix is not None:
            address_types_vector[self.address_types_prefixes[prefix]] = 1

    def __encode_address(self, input_address, repeat_single_street):
        """Return the two concatenated 8-slot vectors for ``input_address``.

        For an intersection the first vector describes the first street and
        the second vector the second street. For a single street the second
        vector repeats the first when ``repeat_single_street`` is true and
        stays all zeros otherwise.
        """
        address_types_vector = [0] * 8
        intersection_types_vector = [0] * 8

        if ' / ' in input_address:
            streets = input_address.split(' / ')
            self.flag_address_types_vector_with_street_type(address_types_vector, streets[0])
            self.flag_address_types_vector_with_street_type(intersection_types_vector, streets[1])
        else:
            self.flag_address_types_vector_with_street_type(address_types_vector, input_address)
            if repeat_single_street:
                self.flag_address_types_vector_with_street_type(intersection_types_vector, input_address)

        return address_types_vector + intersection_types_vector

    def get_distinct_binary_address_types_vector(self, input_address):
        """Return 16 slots; the second half is all zeros for a single street."""
        return self.__encode_address(input_address, False)

    def get_binary_address_types_vector(self, input_address):
        """Return 16 slots; the second half repeats the first for a single street."""
        return self.__encode_address(input_address, True)

    def get_intersection(self, input_address):
        """Return 1 when the address contains ' / ' (an intersection), else 0."""
        if ' / ' in input_address:
            return 1
        return 0
    

# Test of the AddressTypeParser

In [15]:
# Smoke test of AddressTypeParser with one sample intersection address.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3.
address_type_parser = AddressTypeParser()
input_address = '19TH AV / WINSTON DR'
print(address_type_parser.get_binary_address_types_vector(input_address))
print(address_type_parser.get_intersection(input_address))

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
1


# Sample data printer (for debugging)
A sample of the data is printed from the data generator

In [16]:
def print_sample(data):
    """Print the first rows of ``data``, one per line, for inspection.

    Printing stops after the 101st row; nothing further is pulled from
    ``data`` after that, so a generator is only partially consumed.
    """
    for printed, row in enumerate(data, 1):
        # Single-argument print(...) gives the same output on Python 2 and
        # also runs on Python 3.
        print(row)
        if printed > 100:
            break

# Creation of the file handler
The file handler is created with all the necessary files for the algorithm

In [17]:
# Relative paths of the raw csv files (train / test) and of the parsed csv
# files that the data parser writes.
input_filename = '../data/train.csv'
parsed_data_filename = '../data/parsed_train.csv'
test_data_filename = '../data/test.csv'
parsed_test_data_filename = '../data/parsed_test.csv'

# Creating the handler only stores the names; no file is opened here.
file_handler = FileHandler(input_filename, parsed_data_filename, test_data_filename, parsed_test_data_filename)

# Data parsing
The data from the input file is parsed to the parsed data file, with the necessary format for the neural network features.

In [18]:
# The parser reads and writes through the file handler created above.
data_parser = InputDataParser(file_handler)

In [19]:
# Rewrites the parsed data file from the input data file.
data_parser.parse_input_data()

In [20]:
# Rewrites the parsed test data file from the test data file.
data_parser.parse_test_data()

# Parsed data sample (for debugging)
A sample of rows from the parsed data file is printed 

In [55]:
# get_parsed_data() is a generator, so only the rows that print_sample
# actually prints are read from the parsed data file.
parsed_data = file_handler.get_parsed_data()
print_sample(parsed_data)

['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0.4106414147224084', '0.5917908339133205', '37']
['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0.4106414147224084', '0.5917908339133205', '21']
['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0.4003745108851507', '0.8207717596327773', '21']
['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0.41805387528083976', '0.8248368952108088', '16']
['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0.49691871227621504', '0.5646720272725474', '16']
['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0.2585888856711039', '0.04924245604692298', '16']
['1.0', '1.0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0