In [31]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the three '.csv.gz' inputs (paths are relative to the working
# directory); each frame is bound to the name in the matching position.
# NOTE(review): the saved output of this cell ends with a truncated warning;
# if it is pandas' mixed-dtype warning, an explicit dtype= may be wanted
# here -- confirm against a fresh run.
acc, ilo, asw = (
    pd.read_csv(csv_path)
    for csv_path in ('accused.csv.gz',
                     'il-officers.csv.gz',
                     'all-sworn.csv.gz')
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Reference tables read from ../hand/: every row pairs a raw value
# ('Original' column) with the label it maps to ('Standard' column).
gender_df = pd.read_csv('../hand/gender_dictionary.csv')
gender_dict = dict(zip(gender_df['Original'], gender_df['Standard']))

race_df = pd.read_csv('../hand/race_dictionary.csv')
race_dict = dict(zip(race_df['Original'], race_df['Standard']))

In [13]:
def list_diff(l1, l2):
    '''returns list of the unique items of l1 that are not in l2

       Items come back in the order they first appear in l1, so the
       result is identical on every run (a list built directly from a
       set carries no guaranteed order).
       Items of both inputs must be hashable.
    '''
    excluded = set(l2)
    # dict.fromkeys drops repeats while keeping first-seen order
    return list(dict.fromkeys(item for item in l1 if item not in excluded))

def intersect(l1, l2):
    '''returns list of the unique items present in both l1 and l2

       Items come back in the order they first appear in l1, so the
       result is identical on every run and can be indexed by position
       (a list built directly from a set carries no guaranteed order).
       Items of both inputs must be hashable.
    '''
    shared = set(l2)
    # dict.fromkeys drops repeats while keeping first-seen order
    return list(dict.fromkeys(item for item in l1 if item in shared))

def clean_int(x, na_value=np.nan):
    '''returns an integer from an object if possible,
       else returns an na_value

       x: string or number to convert
       na_value: value handed back when x cannot be converted

       Strings made only of digits, commas and periods are converted
       after removing the commas ('1,000' -> 1000) and are truncated
       toward zero ('12.7' -> 12). Any other string, including one that
       passes the character check but is not a number ('', '.', '1.2.3'),
       gives na_value. Non-string input is converted when it is a finite
       number; NaN, infinities and objects that cannot be tested for
       finiteness (e.g. None) give na_value.
    '''
    if isinstance(x, str):
        # Check to see if the string may be a float
        if not re.search('^[0-9,.]*$', x):
            return na_value
        try:
            # Commas are thousands separators that float() cannot parse
            return int(float(x.replace(',', '')))
        except (ValueError, OverflowError):
            # Passed the character check but is still not a usable number
            return na_value

    try:
        finite = bool(np.isfinite(x))
    except (TypeError, ValueError):
        # np.isfinite rejects non-numeric objects such as None
        return na_value

    return int(float(x)) if finite else na_value
    
def standardize_gender(x):
    '''returns the standardized gender string for a raw value

       A string is upper-cased first; if it already equals one of the
       standard labels it is returned unchanged, otherwise it is looked
       up in the gender reference dictionary (an unmapped string raises
       KeyError). Anything that is not a string gets the entry stored
       under 'NAN'.
    '''
    # Non-string input is handled as a missing value
    if not isinstance(x, str):
        return gender_dict['NAN']

    label = x.upper()
    already_standard = label in gender_dict.values()
    # Standard labels pass through; everything else goes via the dictionary
    return label if already_standard else gender_dict[label]

def standardize_race(x):
    '''returns the standardized race string for a raw value

       A string is upper-cased first; if it already equals one of the
       standard labels it is returned unchanged, otherwise it is looked
       up in the race reference dictionary (an unmapped string raises
       KeyError). Anything that is not a string gets the entry stored
       under 'NAN'.
    '''
    # Non-string input is handled as a missing value
    if not isinstance(x, str):
        return race_dict['NAN']

    label = x.upper()
    already_standard = label in race_dict.values()
    # Standard labels pass through; everything else goes via the dictionary
    return label if already_standard else race_dict[label]
    
def clean_date_df(df):
    '''returns pandas dataframe of cleaned date and time columns
       splits datetime columns into date and time,
       ensures any errors are returned as null.

       df: dataframe whose column names all end with .Date or .Datetime
       returns: new dataframe holding a <prefix>.Date column for every
           input column, plus a <prefix>.Time column for every input
           column whose name contains 'time'

       EX: df columns = ['Org.Hire.Datetime', 'Start.Date']
           result columns = ['Org.Hire.Date', 'Org.Hire.Time', 'Start.Date']
    '''
    dt_df = pd.DataFrame()  # Output frame, filled one column at a time

    for col in df.columns.values:
        # Column name without its final dot-separated piece
        # 'Org.Hire.Datetime' -> ['Org', 'Hire']
        col_prefix = col.split('.')[:-1]

        # Parse the column once and reuse the result for both the date
        # and the time part. A strict parse is tried first so the user
        # is told when values had to be coerced to NaT.
        try:
            parsed = pd.to_datetime(df[col], errors='raise')
        except Exception:
            print('Some errors in {}. Returned as NaT.'.format(col))
            parsed = pd.to_datetime(df[col], errors='coerce')

        dt_df['.'.join(col_prefix + ["Date"])] = parsed.dt.date

        # Datetime columns additionally get their time-of-day part
        if 'time' in col:
            dt_df['.'.join(col_prefix + ["Time"])] = parsed.dt.time

    return dt_df

In [16]:
# Name-related column labels to look for in the loaded tables
name_cols = [
    'Full.Name',
    'First.Name',
    'Last.Name',
    'Middle.Initial',
    'Middle.Name',
    'Suffix.Name',
]

In [17]:
# Show the columns acc has next to the candidate name columns,
# then keep only the name columns that acc actually contains.
print(acc.columns, name_cols, sep='\n')
acc_nc = intersect(acc.columns, name_cols)
acc_nc

Index(['CRID', 'Full.Name', 'Birth.Year', 'Gender', 'Race', 'Appointed.Date',
       'Current.Unit', 'Current.Rank', 'Current.Star', 'Complaint.Category',
       'Recommended.Finding', 'Recommended.Discipline', 'Final.Finding',
       'Final.Discipline'],
      dtype='object')
['Full.Name', 'First.Name', 'Last.Name', 'Middle.Initial', 'Middle.Name', 'Suffix.Name']


['Full.Name']

In [37]:
# Split the first matched name column of acc into last / first name,
# cutting each value at its final comma.
raw_names = acc[acc_nc[0]]
split_names = (
    raw_names
    # A missing name becomes a bare separator, which splits into ['', '']
    .fillna(',')
    # So does any value that contains no letters at all
    .map(lambda name: name if re.search('[a-zA-Z]', name) else ",")
    # Text before the last comma is the last name, text after it the first
    .map(lambda name: name.rsplit(',', 1))
)
accn = pd.DataFrame(split_names.values.tolist(),
                    columns=['Last.Name', 'First.Name'])

In [38]:
# Preview the first rows of the split names rather than rendering the
# whole frame.
accn.head(10)

Unnamed: 0,Last.Name,First.Name
0,BARRON,WILLIAM
1,C0NNOLLY,KIMBERLY
2,KEENE,JOHN
3,SLAVIN,SCOTT
4,MARTINEZ,ANTONIO
5,JONES,MICHAEL
6,BROWN,CORNELIUS
7,MOLESKY,KENNETH
8,TERRONES,SOFIA
9,BRILL,JAMES
