In [1]:
import numpy as np
import pandas as pd

In this notebook, let's create some helper functions that automate reading the data and preparing it for the model training stage:

1. Read the transaction (train/test) data set and merge it with the properties data set
2. Remove the duplicate parcelids from the train data set
3. Create new features


You will find them in the cells below.

In [3]:
def read_data(data_string):
    """
    Read one of the raw train/properties CSV files into a dataframe.
    
    Keyword Arguments:
    data_string -- base name of the file without the ".csv" extension, e.g.
                   "train_2016", "train_2017", "properties_2016", "properties_2017"
    
    Returns:
    df -- dataframe read from "../data/raw/<data_string>.csv"
    
    Raises:
    Whatever pd.read_csv raises when the file is missing or cannot be parsed.
    """
    # The file name has to be passed to format(); calling format() with no
    # arguments raises IndexError for the "{0}" placeholder before any read.
    df = pd.read_csv("../data/raw/{0}.csv".format(data_string))
    
    return df

In [None]:
def get_data(data_string):
    """
    Read the train/test data set, merge it with the properties data set and,
    for train only, remove the records with duplicate parcelids.
    
    Keyword Arguments:
    data_string -- "train" or "test"; "train" reads the 2016 files, any other
                   value reads the 2017 files
    
    Returns:
    merged -- merged dataframe without the "logerror" column
    y -- the "logerror" column of the merged dataframe
    
    """
    
    year = 2016 if data_string == "train" else 2017
    
    train = read_data("train_{0}".format(year))
    properties = read_data("properties_{0}".format(year))
    # Left join: every transaction row is kept, with property columns attached
    # wherever the parcelid is found in the properties data set.
    merged = pd.merge(train, properties, on="parcelid", how="left")
    
    if data_string == "train":
        merged = remove_duplicate_parcels(merged)
    
    y = merged["logerror"]
    # axis=1 is required: drop() looks up row labels by default, so dropping
    # "logerror" without it raises KeyError instead of removing the column.
    merged = merged.drop("logerror", axis=1)
    
    return merged, y

In [2]:
def remove_duplicate_parcels(df):
    """
    Keep exactly one record per parcelid in the merged train data set.
    
    Parcels that occur once are kept as they are. For parcels that occur more
    than once, the duplicated rows are shuffled with a fixed seed
    (random_state=42) and the first shuffled row of each parcel is kept.
    
    Keyword Arguments:
    df -- merged data frame with a "parcelid" column
    
    Returns:
    unique_df -- dataframe with one row per parcelid: the single-occurrence
                 rows first, followed by the rows picked from the duplicates
    
    """
    
    # Number of rows per parcelid, indexed by parcelid.
    occurrences = df.groupby(["parcelid"]).size()
    single_ids = occurrences[occurrences == 1].index
    repeated_ids = occurrences[occurrences > 1].index
    
    single_rows = df[df["parcelid"].isin(single_ids)]
    repeated_rows = df[df["parcelid"].isin(repeated_ids)]
    
    # Shuffle reproducibly, then take the first row seen for each parcel.
    shuffled = repeated_rows.sample(frac=1, random_state=42)
    picked_rows = shuffled.groupby(["parcelid"]).head(1)
    
    return pd.concat([single_rows, picked_rows], axis=0)
