In [35]:
import pandas as pd
import numpy as np
import datetime

In [46]:
def str2date(n):
    d = n.split('/')
    return datetime.date(year=int(d[2]), month=int(d[0]), day=int(d[1]))

def str2time(n):
    res = None
    if (len(n) == 4):
      res = datetime.time(hour=int(n[0:2]), minute=int(n[2:4]))
    elif (len(n) == 1):
      res = datetime.time(hour=int(n))  
    elif (len(n) == 2 and int(n) < 24):
      res = datetime.time(hour=int(n))  
    elif (len(n) == 2 and int(n) > 23):
      res = datetime.time(hour=0, minute=int(n))  
    else:
      res = datetime.time(hour=int(n[0:1]), minute=int(n[1:3]))
    return res

gl = pd.read_csv('Crime_Data_from_2010.csv', converters = {
    'Date Reported': str2date,
    'Date Occurred': str2date,
    'Time Occurred': str2time
})
gl.dtypes

DR Number                   int64
Date Reported              object
Date Occurred              object
Time Occurred              object
Area ID                     int64
Area Name                  object
Reporting District          int64
Crime Code                  int64
Crime Code Description     object
MO Codes                   object
Victim Age                float64
Victim Sex                 object
Victim Descent             object
Premise Code              float64
Premise Description        object
Weapon Used Code          float64
Weapon Description         object
Status Code                object
Status Description         object
Crime Code 1              float64
Crime Code 2              float64
Crime Code 3              float64
Crime Code 4              float64
Address                    object
Cross Street               object
Location                   object
dtype: object

In [37]:
gl.memory_usage(index = True, deep = True)

Index                           128
DR Number                  13536448
Date Reported              67682240
Date Occurred              67682240
Time Occurred              81218688
Area ID                    13536448
Area Name                 110615267
Reporting District         13536448
Crime Code                 13536448
Crime Code Description    145197495
MO Codes                  110804484
Victim Age                 13536448
Victim Sex                 94083742
Victim Descent             94082806
Premise Code               13536448
Premise Description       126865579
Weapon Used Code           13536448
Weapon Description         87528284
Status Code                99831250
Status Description        115211361
Crime Code 1               13536448
Crime Code 2               13536448
Crime Code 3               13536448
Crime Code 4               13536448
Address                   156622073
Cross Street               66607119
Location                  129821014
dtype: int64

In [49]:
gl_obj = gl.select_dtypes(include=['object']).copy()

converted_obj = pd.DataFrame()

# пишем цикл, которой перебирает каждый столбец object, 
# проверяет его на соответствие заданному порогу 
# (количество уникальных значений должно быть меньше 50% 
# от общего количества значений), и если столбец 
# удовлетворяет порогу, преобразовывает его в тип category
for col in gl_obj.columns:
    num_unique_values = len(gl_obj[col].unique())
    num_total_values = len(gl_obj[col])
    if (num_unique_values / num_total_values < 0.5) and (num_unique_values < 1000):
        converted_obj.loc[:,col] = gl_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = gl_obj[col]
        
gl[converted_obj.columns] = converted_obj        

In [50]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # предположим, что если это не датафрейм, то серия
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # преобразуем байты в мегабайты
    return "{:03.2f} MB".format(usage_mb)

# снова применяем функцию mem_usage, смотрим,
# сколько памяти занимают все столбцы типа object
# до и после преобразования в тип category
print(mem_usage(gl_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([gl_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

1481.87 MB
665.12 MB


Unnamed: 0,before,after
object,15.0,7
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1


In [51]:
converted_obj.head()

Unnamed: 0,Date Reported,Date Occurred,Time Occurred,Area Name,Crime Code Description,MO Codes,Victim Sex,Victim Descent,Premise Description,Weapon Description,Status Code,Status Description,Address,Cross Street,Location
0,2013-03-14,2013-03-11,18:00:00,77th Street,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,F,W,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)","STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,6300 BRYNHURST AV,,"(33.9829, -118.3338)"
1,2010-01-25,2010-01-22,23:00:00,Olympic,VEHICLE - STOLEN,,,,STREET,,IC,Invest Cont,VAN NESS,15TH,"(34.0454, -118.3157)"
2,2013-03-19,2013-03-18,20:30:00,Southeast,VEHICLE - STOLEN,,,,STREET,,IC,Invest Cont,200 E 104TH ST,,"(33.942, -118.2717)"
3,2010-11-11,2010-11-10,18:00:00,Southeast,VEHICLE - STOLEN,,,,STREET,,IC,Invest Cont,88TH,WALL,"(33.9572, -118.2717)"
4,2014-01-11,2014-01-04,23:00:00,Topanga,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329,M,W,SINGLE FAMILY DWELLING,,IC,Invest Cont,7200 CIRRUS WY,,"(34.2009, -118.6369)"


In [52]:
occured = []
for x in range(len(gl.index)):
  occured.append(datetime.datetime.combine(gl['Date Occurred'][x], gl['Time Occurred'][x]))
gl['Occured'] = occured

gl.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,Occured
0,1208575,2013-03-14,2013-03-11,18:00:00,12,77th Street,1241,626,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,...,AO,Adult Other,626.0,,,,6300 BRYNHURST AV,,"(33.9829, -118.3338)",2013-03-11 18:00:00
1,102005556,2010-01-25,2010-01-22,23:00:00,20,Olympic,2071,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510.0,,,,VAN NESS,15TH,"(34.0454, -118.3157)",2010-01-22 23:00:00
2,418,2013-03-19,2013-03-18,20:30:00,18,Southeast,1823,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510.0,,,,200 E 104TH ST,,"(33.942, -118.2717)",2013-03-18 20:30:00
3,101822289,2010-11-11,2010-11-10,18:00:00,18,Southeast,1803,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510.0,,,,88TH,WALL,"(33.9572, -118.2717)",2010-11-10 18:00:00
4,42104479,2014-01-11,2014-01-04,23:00:00,21,Topanga,2133,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329,...,IC,Invest Cont,745.0,,,,7200 CIRRUS WY,,"(34.2009, -118.6369)",2014-01-04 23:00:00


In [53]:
gl.dtypes

DR Number                          int64
Date Reported                     object
Date Occurred                     object
Time Occurred                     object
Area ID                            int64
Area Name                       category
Reporting District                 int64
Crime Code                         int64
Crime Code Description          category
MO Codes                          object
Victim Age                       float64
Victim Sex                      category
Victim Descent                  category
Premise Code                     float64
Premise Description             category
Weapon Used Code                 float64
Weapon Description              category
Status Code                     category
Status Description              category
Crime Code 1                     float64
Crime Code 2                     float64
Crime Code 3                     float64
Crime Code 4                     float64
Address                           object
Cross Street    