In [1]:
# importing required packages

import pickle
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder

In [2]:
def preprocessing(df, name):

    """
    preprocessing train and test data. The preprocessing 
    is specific to this dataset and the function described 
    below can't be directly applied to another dataset.

    input: 
        df: dataframe of train/test data
        name: can be 'train' or 'test'

    returns:
        df: processed dataframe
    """

    ## setting index
    df.set_index('animal_id_outcome', inplace=True)

    ## filling NaN values in outcome_datetime
    in_datetime = df['intake_datetime']
    days_spent = df['time_in_shelter_days']

    if name=='train':
        dt_format = '%Y-%m-%d %H:%M:%S'
    elif name=='test':
        dt_format = '%d-%m-%Y %H:%M'

    in_datetime = in_datetime.apply(lambda x: datetime.strptime(x, dt_format))
    days_spent = days_spent.apply(lambda x: timedelta(x))

    processed_out_datetime = in_datetime + days_spent
    processed_out_datetime = processed_out_datetime.apply(lambda x: x.strftime(dt_format))

    df['outcome_datetime'] = processed_out_datetime

    ## getting dates and min values from datetime data
    int_datetime = df['intake_datetime'].values
    out_datetime = df['outcome_datetime'].values
    int_date = [int(x[8:10]) for x in int_datetime]
    out_date = [int(x[8:10]) for x in out_datetime]

    if name=='train':
        int_min = [int(x[-5:-3]) for x in int_datetime]
        out_min = [int(x[-5:-3]) for x in out_datetime]
    elif name=='test':
        int_min = [int(x[-2:]) for x in int_datetime]
        out_min = [int(x[-2:]) for x in out_datetime]

    df['intake_date'] = int_date
    df['outcome_date'] = out_date
    df['intake_min'] = int_min
    df['outcome_min'] = out_min

    ## getting dates from DOB values
    dob = df['date_of_birth'].values
    dob_date = [int(x[8:10]) for x in int_datetime]

    df['dob_date'] = dob_date

    ## drop redundant columns
    drop_cols = ['intake_datetime',
        'outcome_datetime',
        'date_of_birth',
        'intake_monthyear',
        'outcome_monthyear',
        'time_in_shelter',
        'count',
        'age_upon_intake',
        'age_upon_outcome', 
        'outcome_number']

    df.drop(drop_cols, axis=1, inplace=True)
    df.dropna(inplace=True)

    return df

In [3]:
def encoding(train_df, test_df):

    """
    encoding the preprocessed train and test data. The encoding 
    is specific to this dataset and the function described 
    below can't be directly applied to another dataset.

    input: 
        train_df: train dataframe after it has passed through 
            the preprocessing function
        test_df: test dataframe after it has passed through 
            the preprocessing function

    returns:
        final_train: train dataframe after encoding
        final_test: test dataframe after encoding
        encoding: a dictionary of target value encoding
    """
    
    ## encoding target values
    le = LabelEncoder()
    train_df['outcome_type'] = le.fit_transform(train_df['outcome_type'])

    train_df.rename(columns={'outcome_type': 'Label'}, inplace=True)
    train_y = train_df['Label']

    train_df.drop('Label', axis=1, inplace=True)

    ## get encoding dictionary
    keys = le.classes_
    vals = le.transform(le.classes_)
    encoding = dict(zip(keys, vals))

    ## combine train test dataframes to avoid labelling discrepancies
    train_df['name'] = 'train'
    test_df['name'] = 'test'
    combined_df = pd.concat([train_df, test_df])

    to_encode_cols = ['animal_type',
        'breed',
        'color',
        'intake_condition',
        'intake_type',
        'sex_upon_intake',
        'age_upon_intake_age_group',
        'intake_weekday',
        'sex_upon_outcome',
        'age_upon_outcome_age_group',
        'outcome_weekday']

    ## encode columns using label encoder
    for x in to_encode_cols:
        combined_df[x] = le.fit_transform(combined_df[x])

    final_train = combined_df[combined_df.name == 'train']
    final_test = combined_df[combined_df.name == 'test']

    final_train.drop('name', axis=1, inplace=True)
    final_test.drop('name', axis=1, inplace=True)

    final_train['Label'] = train_y

    return final_train, final_test, encoding

In [4]:
# read train and test data
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

# preprocess train and test data
train_df = preprocessing(train, name='train')
test_df = preprocessing(test, name='test')

# encode train and test dataframes
train_data, test_data, encoding = encoding(train_df, test_df)

# save feature engineered dataframes to csv files
train_data.to_csv('../processed/train_data.csv')
test_data.to_csv('../processed/test_data.csv')

# pickle the target encoding dictionary
with open('../logs/encoding.pkl', 'wb') as f:
    pickle.dump(encoding, f)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
