# Julia Assistant Tools for Machine Learning

**Author: [Oliveira, D. M.](http://br.linkedin.com/in/dmztheone)**

In [158]:
# Render the "Introduction" heading; both the section and subsection numbers are empty.
render_section("", "", "Introduction")

Write a proper introduction...

In [155]:
# Render the heading for section 01.
render_section("01", "", "Import Packages")

In [188]:
using DataFrames
using Iterators
using MLBase
using GLM
using XGBoost

In [151]:
# Render the heading for section 02.
render_section("02", "", "HTML Assistant Tools")

### CSS Style

In [153]:
# Stylesheet for the HTML fragments produced by `render_doc` (the `method*` and
# `tt` classes) and `render_section` (the `section*` classes). It is emitted
# once as text/html by the `display` call below.
style_css = """
<style>
span.tt {
    font-family: 'Lucida Sans Typewriter', 'Lucida Console', 
                  monaco, 'Bitstream Vera Sans Mono', monospace;
    color: #03396c;
}

div.method {
    margin-bottom: 10px;
}

div.method_title {
    padding: 4px 0px 4px 10px;
    background-color:#85bdde; color:#005b96;
    border-left: 8px solid #cb2c31;
}

div.method_body {
    padding: 5px 20px 20px 20px;
    border-left: 8px solid #e69598;
    background-color: #ccebfb;
}

div.method_cell {}

div.method_cell_title {
    padding: 20px 0px 0px 0px;
}

div.method_cell_body {}


h1.section {
    border-bottom:5px solid #696565;
}

span.section_number {
    background-color:#696565;
    color:#ffb400;
    padding:3px 5px 3px 5px;
}

span.section_title {
    padding:5px 5px 0px 5px;
    color:#ff9d23;
}

</style>
"""
display("text/html", style_css)

### HTML Tools

In [148]:
"""
    render_doc(method_signature, description, arguments, outputs)

Display an HTML documentation card for one method as `text/html`.

# Arguments
- `method_signature`: text shown in the card title after "Method: ".
- `description`: text of the "Description" cell; a trailing period is appended.
  It is inserted verbatim, so it may contain HTML markup.
- `arguments`: collection of `(name, type, description)` triples listed under
  "Arguments"; an empty collection renders as "None.".
- `outputs`: collection of `(name, type, description)` triples listed under
  "Outputs"; an empty collection renders as "None.".

# Returns
Whatever `display` returns.
"""
function render_doc(method_signature, description, arguments, outputs)
    arguments_html       = field_list_to_html(arguments)
    outputs_html         = field_list_to_html(outputs)
    # Every <div> opened below is closed exactly once. The template used to end
    # with a stray `<div/>`, which HTML parsers treat as an unclosed opening tag.
    html_method_template = """
    <div class="method">
        <div class="method_title"><span class="tt">Method: </span><b>$method_signature</b></div>
            <div class="method_body">
                <div class="method_cell">
                    <div class="method_cell_title"><span class="tt"><b>Description</b></span></div>
                    <div class="method_cell_body">
                        <span class="tt">$description.</span>
                    </div>
                </div>
                <div class="method_cell">
                    <div class="method_cell_title"><span class="tt"><b>Arguments</b></span></div>
                    <div class="method_cell_body">
                        <ul>
                            $arguments_html
                        </ul>
                    </div>
                </div>
                <div class="method_cell">
                    <div class="method_cell_title"><span class="tt"><b>Outputs</b></span></div>
                    <div class="method_cell_body">
                        <ul>
                            $outputs_html
                        </ul>
                    </div>
                </div>
            </div>
        </div>
    """
    return display("text/html", html_method_template)
end

"""
    field_list_to_html(list)

Turn a collection of `(name, type, description)` triples into a run of `<li>`
elements, one per triple and each terminated by a newline. An empty collection
yields a single "None." span instead.
"""
function field_list_to_html(list)
    isempty(list) && return "<span class=\"tt\">None.</span>"
    items = map(list) do entry
        name, kind, text = entry
        string("<li><span class=\"tt\">$name, <b>$kind</b>: ",
               "$text.</span></li>\n")
    end
    return join(items)
end

"""
    render_section(section_number, subsection_number, title)

Display a section heading as `text/html`: an `h1` of class `section` holding
the number (section and subsection joined by a dot) and the title.
"""
function render_section(section_number, subsection_number, title)
    heading_html = """
        <h1 class="section">
            <span class="section_number">$section_number.$subsection_number</span>
            <span class="section_title">$title</span>
        </h1>
    """
    return display("text/html", heading_html)
end

render_section (generic function with 1 method)

In [156]:
# Render the heading for section 03.
render_section("03", "", "Functions: Missing Values")

In [146]:
# Documentation card for `get_default_values`.
render_doc("get_default_values (df, features)", 
           """Calculates the mode for all features informed. 
              Usually it is used to replace missing data""", 
           [("df", "DataFrame", "Data used to calculate modes by column"),
            ("features", "Array{Symbol,1}", "Column names that will be considered")],
           [("default_values", "Dict", "Dictionary with the default values")])

# Documentation card for `apply_default_values!`.
render_doc("apply_default_values! (df, default_values)",
           """Apply a dictionary of default values to missing values (NAs) of the
              given dataframe. Only keys identified as column will be used""",
          [("df", "DataFrame", "Data that will be used to replace NAs to default values"),
           ("default_values", "Dict", "Dictionary with features/default values")],
          [("df", "DataFrame", "Original dataframe with NAs replaced by default values")])

In [None]:
"""
    get_default_values(df, features)

Return a `Dict` mapping each entry of `features` to the mode of that column of
`df`, computed after `dropna` removes the missing entries.
"""
function get_default_values(df, features)
    modes = Dict()
    for column in features
        modes[column] = mode(dropna(df[column]))
    end
    return modes
end

"""
    apply_default_values!(df, default_values)

Replace the missing entries (NAs) of `df` in place with the per-column values
in `default_values`, and return `df`.

Keys of `default_values` that are not column names of `df` are skipped, as the
rendered documentation for this function promises, instead of failing on the
column lookup.
"""
function apply_default_values!(df, default_values)
    for feature in keys(default_values)
        # Ignore defaults for columns this dataframe does not have.
        feature in names(df) || continue
        df[isna(df[feature]), feature] = default_values[feature]
    end
    return df
end

In [157]:
# Render the heading for section 04.
render_section("04", "", "Functions: Label Encoding")

In [164]:
# Documentation card for `get_label_encoding`.
render_doc("get_label_encoding (df, features)",
           "Get label encoding for features of a given dataframe",
           [("df", "DataFrame", "Data that will be used to create label encoding"), 
            ("features", "Array{Symbol,1}", "Features that will be created label encoding")],
            [("_", "Dict", "A dictionary with the encoding for each feature informed")])

# Documentation card for `apply_encoding!`.
render_doc("apply_encoding! (df, encoding)",
           "Apply encoding to a given dataframe",
           [("df", "DataFrame", "Dataframe that will be encoded"),
            ("encoding", "Dict", "A dictionary with the feature/encoding data")],
           [("df", "DataFrame", "Dataframe with encoded columns")])

In [161]:
"""
    get_label_encoding(df, features)

Build a `Dict` that maps each entry of `features` to the `labelmap` of that
column of `df`, computed after `dropna` removes the missing entries.
"""
function get_label_encoding(df, features)
    encodings = [f => labelmap(dropna(df[f])) for f in features]
    return Dict(encodings)
end

"""
    apply_encoding!(df, encoding)

Overwrite every column of `df` named by a key of `encoding` with the result of
`labelencode` applied with that key's map, then return `df`.
"""
function apply_encoding!(df, encoding)
    for (feature, label_map) in encoding
        df[feature] = labelencode(label_map, df[feature])
    end
    return df
end

apply_encoding! (generic function with 1 method)

In [165]:
# Render the heading for section 05.
render_section("05", "", "Functions: One-Hot-Encoding")

In [172]:
# Documentation card for `get_all_values`.
render_doc("get_all_values (df, features)",
           "Get all unique values from a given dataframe",
           [("df", "DataFrame", "Dataframe used to extract unique values"),
            ("features", "Array{Symbol,1}", "Features that will be extract unique values")],
           [("_", "Dict", "A dictionary with unique values for each feature informed")])

# Documentation card for `apply_one_hot_encoding!`. The anchor uses `href`
# (a `src` attribute does not make a link) and all three arguments are listed.
render_doc("apply_one_hot_encoding! (df, features, all_values)",
           """Transform for each informed feature in N binary features
              where is equal 1 when found the value in the new binary vector.
              * The original columns will be deleted. See more about this
              <a href="http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html">
              subject</a>""",
          [("df", "DataFrame", "Dataframe used to expand with one hot encoding"),
           ("features", "Array{Symbol,1}", "Features to expand with one hot encoding"),
           ("all_values", "Dict", "Unique values of each feature, as returned by get_all_values")],
          [("df", "DataFrame", "Dataframe with columns expanded by one hot encoding")])

In [166]:
"""
    get_all_values(df, features)

Build a `Dict` that maps each entry of `features` to the `Set` of values found
in that column of `df` after `dropna` removes the missing entries.
"""
function get_all_values(df, features)
    unique_sets = [f => Set(dropna(df[f])) for f in features]
    return Dict(unique_sets)
end

"""
    apply_one_hot_encoding!(df, features, all_values)

Expand each column in `features` into one indicator column per value listed in
`all_values`, add those columns to `df`, drop the original `features` columns,
and return `df`.
"""
function apply_one_hot_encoding!(df, features, all_values)
    column_names = get_categorical_features_names(features, all_values)
    indicators   = get_categorical_feature_matrix(df, features, all_values)

    apply_categorical_feature_matrix!(df, column_names, indicators)
    remove_features!(df, features)

    return df
end

"""
    get_categorical_features_names(features, all_values)

Return the names of the indicator columns: for every feature, in order, one
symbol `feature_value` per value in `all_values[feature]`.
"""
function get_categorical_features_names(features, all_values)
    expanded_names = []
    for feature in features, value in all_values[feature]
        push!(expanded_names, symbol(feature, "_", value))
    end
    return expanded_names
end

"""
    get_categorical_feature_matrix(df, features, all_values)

Build the indicator matrix for `features`: one row per row of `df`, and for
each feature one column per value in `all_values[feature]`. A progress line is
printed every 1000 rows.
"""
function get_categorical_feature_matrix(df, features, all_values)
    nrows = size(df, 1)
    indicator_chunks = []
    for i in 1:nrows
        for (position, feature) in enumerate(features)
            # Report progress once per row, on the first feature only.
            if i % 1000 == 0 && position == 1
                println("\tProcessed Categorical Feature $i/$nrows.")
            end
            chunk = to_cat_vector(all_values[feature], df[i, feature])
            push!(indicator_chunks, chunk)
        end
    end
    # The chunks are laid out row by row, so fill a ncols x nrows array
    # column-wise and transpose it to obtain nrows x ncols.
    flat  = vcat(indicator_chunks...)
    ncols = sum([length(all_values[feature]) for feature in features])
    return reshape(flat, ncols, nrows)'
end

"""
    apply_categorical_feature_matrix!(df, feature_names, feature_matrix)

Store column `i` of `feature_matrix` in `df` under `feature_names[i]` for every
name, then return `df`.
"""
function apply_categorical_feature_matrix!(df, feature_names, feature_matrix)
    for (column, name) in enumerate(feature_names)
        df[name] = feature_matrix[:, column]
    end
    return df
end

"""
    remove_features!(df, feature_names)

Call `delete!` on `df` for every entry of `feature_names` and return `df`.
"""
function remove_features!(df, feature_names)
    for name in feature_names
        delete!(df, name)
    end
    return df
end

"""
    to_cat_vector(all_values, actual_value)

Return one integer per element of `all_values`, in iteration order: 1 where the
element equals `actual_value`, 0 elsewhere.
"""
function to_cat_vector(all_values, actual_value)
    return [convert(Int, candidate == actual_value) for candidate in all_values]
end

get_all_values (generic function with 1 method)

In [209]:
# Render the heading for section 06. The number was "07", which duplicated the
# number of the "Export Files" section that follows.
render_section("06", "", "Functions: Extract Date Features")

In [211]:
# Documentation card for `extract_date_features`.
render_doc("extract_date_features (string_date)",
           "Create feature vector from a raw string date",
           [("string_date", "AbstractString", "A string date in format YYYY-MM-DD")],
          [("_", "Array{Int,1}", "Feature vector of date")])

In [None]:
"""
    extract_date_features(string_date)

Turn a date string accepted by `Date(string_date)` into a vector of 31 integer
0/1 flags, in this order:

1. day of week, Sunday through Saturday (7 flags);
2. weekday (Monday-Friday), weekend (Saturday-Sunday), middle of the week
   (Tuesday-Thursday) (3 flags);
3. month, January through December (12 flags);
4. quarter of the year: months 1-3, 4-6, 7-9, 10-12 (4 flags);
5. four-month period of the year: months 1-4, 5-8, 9-12 (3 flags);
6. half of the year: months 1-6, 7-12 (2 flags).
"""
function extract_date_features(string_date)
    date = Date(string_date)
    dow  = Dates.dayofweek(date)   # Dates numbers the days Monday = 1 ... Sunday = 7
    mon  = Dates.month(date)

    # The output lists Sunday first, unlike the numeric order of the constants.
    weekday_order = [Dates.Sunday, Dates.Monday, Dates.Tuesday, Dates.Wednesday,
                     Dates.Thursday, Dates.Friday, Dates.Saturday]
    day_flags = [Int(dow == day) for day in weekday_order]

    week_part_flags = [Int(Dates.Monday <= dow <= Dates.Friday),     # weekday
                       Int(Dates.Saturday <= dow <= Dates.Sunday),   # weekend
                       Int(Dates.Tuesday <= dow <= Dates.Thursday)]  # middle of the week

    month_flags = [Int(mon == m) for m in Dates.January:Dates.December]

    year_part_flags = [Int(mon < 4),        # first quarter
                       Int(4 <= mon < 7),   # second quarter
                       Int(7 <= mon < 10),  # third quarter
                       Int(mon >= 10),      # fourth quarter
                       Int(mon <= 4),       # first four-month period
                       Int(4 < mon <= 8),   # second four-month period
                       Int(mon > 8),        # third four-month period
                       Int(mon <= 6),       # first half
                       Int(mon > 6)]        # second half

    return vcat(day_flags, week_part_flags, month_flags, year_part_flags)
end

In [208]:
# Render the heading for section 07.
render_section("07", "", "Functions: Export Files")

In [177]:
# Documentation card for `export_csv`; the outputs list is empty.
render_doc("export_csv (df, file_path)",
           "Export dataframe to CSV format",
           [("df", "DataFrame", "Dataframe to be exported"),
            ("file_path", "AbstractString", "File path to save CSV file")], [])

# Documentation card for `export_tsv`; the outputs list is empty.
render_doc("export_tsv (df, file_path)",
           "Export dataframe to TSV format",
           [("df", "DataFrame", "Dataframe to be exported"),
            ("file_path", "AbstractString", "File path to save TSV file")], [])

In [173]:
"""
    export_csv(df, file_path)

Write `df` to `file_path` with `writetable`, using its default separator.
"""
function export_csv(df, file_path)
    return writetable(file_path, df)
end
"""
    export_tsv(df, file_path)

Write `df` to `file_path` with `writetable`, using a tab as the field separator.
"""
# `separator` takes the delimiter character itself; the string "tsv" that was
# passed here is not a valid value for it.
export_tsv(df, file_path) = writetable(file_path, df, separator='\t')

export_tsv (generic function with 1 method)

In [207]:
# Render the heading for section 08.
render_section("08", "", "Functions: Split Training and Validation")

In [183]:
# Documentation card for `split_train_val`.
render_doc("split_train_val (df; train_size=.85, random_state=1)",
           "Separate in train and validation dataframes",
           [("df", "DataFrame", "Original dataframe that will be split"),
            ("train_size", "Float", """Size in percentage of the training dataframe.
              Default value = 0.85"""),
            ("random_state", "Int", "Random seed used. Default value = 1")],
           [("train", "DataFrame", "Train dataframe with train_size percentage"),
            ("validation", "DataFrame", "Validation dataframe with 1-train_size percentage")])

# Documentation card for `gen_train_val`.
render_doc("gen_train_val (train, features, label, train_size=.85, random_state=1)",
           "Generate train and validation for x and y",
           [("train", "DataFrame", "Train dataframe that will be split"),
            ("features", "Array{Symbol,1}", "Columns to consider as X (features)"),
            ("label", "Symbol", "Column to consider as Y (output)"),
            ("train_size", "Float", """Size in percentage of the training dataframe.
              Default value = 0.85"""),
            ("random_state", "Int", "Random seed used. Default value = 1")],
           [("train_x", "Array{Float,2}", "Train matrix with features"),
            ("train_y", "Array{Float,1}", "Train array with outputs"),
            ("val_x", "Array{Float,2}", "Validation matrix with features"),
            ("val_y", "Array{Float,1}", "Validation array with outputs")])

In [181]:
"""
    split_train_val(df; train_size=.85, random_state=1)

Seed the global random number generator with `random_state`, shuffle the row
indices of `df`, and return `(train, validation)`: the first
`round(Int, nrows * train_size)` shuffled rows and the remaining rows.
"""
function split_train_val(df; train_size=.85, random_state=1)
    srand(random_state)

    total_rows = size(df, 1)
    cut        = round(Int, total_rows * train_size)
    order      = shuffle(collect(1:total_rows))

    return df[order[1:cut], :], df[order[cut+1:end], :]
end

"""
    gen_train_val(train, features, label, train_size=.85, random_state=1)

Split `train` with `split_train_val` and return
`(train_x, train_y, val_x, val_y)`: the `features` columns of each part as
`Float64` matrices and the `label` column of each part as a `Float64` vector
with 1 subtracted from every entry.

`train_size` and `random_state` are forwarded to `split_train_val`. Previously
they were ignored and the split always used `.85` and `1`.
"""
function gen_train_val(train, features, label, train_size=.85, random_state=1)
    train_df, val_df = split_train_val(train, train_size=train_size,
                                       random_state=random_state)
    train_x = Array{Float64,2}(train_df[:, features])
    # NOTE(review): the shift by one presumably turns 1-based encoded labels
    # into 0-based classes for the learners used later; confirm with callers.
    train_y = Array{Float64,1}(train_df[label]) .- 1
    val_x   = Array{Float64,2}(val_df[:, features])
    val_y   = Array{Float64,1}(val_df[label]) .- 1
    return train_x, train_y, val_x, val_y
end

gen_train_val (generic function with 3 methods)

In [205]:
# Render the heading for section 09.
render_section("09", "", "Functions: XGB Auxiliary Tools")

In [186]:
# Documentation card for `gen_dtrain`.
render_doc("gen_dtrain(train, features, label, train_size=.85, random_state=1)",
           "Generate DMatrices for train and validation to use on XGBoost",
           [("train", "DataFrame", "Train dataframe that will be split"),
            ("features", "Array{Symbol,1}", "Columns to consider as X (features)"),
            ("label", "Symbol", "Column to consider as Y (output)"),
            ("train_size", "Float", """Size in percentage of the training dataframe.
              Default value = 0.85"""),
            ("random_state", "Int", "Random seed used. Default value = 1")],
           [("dtrain", "DMatrix", "Train matrix to XGB"),
            ("dval", "DMatrix", "Validation matrix to XGB")])

In [None]:
"""
    gen_dtrain(train, features, label, train_size=.85, random_state=1)

Split `train` with `gen_train_val` and wrap each part in a `DMatrix` whose
`label` is the corresponding output vector. Returns `(dtrain, dval)`.
"""
function gen_dtrain(train, features, label, train_size=.85, random_state=1)
    splits = gen_train_val(train, features, label, train_size, random_state)
    train_x, train_y, val_x, val_y = splits
    return DMatrix(train_x, label=train_y), DMatrix(val_x, label=val_y)
end

In [206]:
# Render the heading for section 10.
render_section("10", "", "Functions: GLM Auxiliary Tools")

In [204]:
# Documentation card for `gen_formulas`.
render_doc("gen_formulas (features, label)",
           "Generate all valid formulas between features x label",
          [("features", "Array{Symbol,1}", "Features that will be used in combination"),
           ("label", "Symbol", "Output label (i.e., Y)")],
          [("formulas", "Array{Formula,1}", "Array of formulas to be used in GLM package")])

# Documentation card for `gen_glm`. The anchor uses `href` (a `src` attribute
# does not make a link) and the `link` argument now has a type label.
render_doc("gen_glm (train, formulas, family=Binomial(), link=LogitLink())",
           """Generate Generalized Linear Models (GLMs). Check the 
              <a href="https://github.com/JuliaStats/GLM.jl">original package</a>
              for more information""",
          [("train", "DataFrame", "Train data"),
           ("formulas", "Array{Formula,1}", "Array with the formulas"),
           ("family", "Distribution", "Family distribution (e.g., Binomial, Gamma, etc)"),
           ("link", "Link", "Link from distribution. Default is LogitLink for logistic regression")],
          [("_", "GLM Models", "Array with GLM models")])

In [199]:
"""
    gen_formulas(features, label)

Return one formula `label ~ f1+f2+...` for every subset of `features` produced
by `subsets`, skipping the first subset that `subsets` yields.

The number of formulas doubles with every additional feature.
"""
function gen_formulas(features, label)
    formulas = []
    combinations = collect(subsets(features))
    for combination in combinations[2:end]
        right_hand_side = join(combination, "+")
        # NOTE(review): the formula is built as text and run through `eval`, so
        # `features` and `label` must come from trusted code, never user input.
        push!(formulas, eval(parse(string(label, "~", right_hand_side))))
    end
    return formulas
end

"""
    gen_glm(train, formulas, family=Binomial(), link=LogitLink())

Fit one model per entry of `formulas` by calling
`glm(formula, train, family, link)` and return the fitted models as an array
in the same order.
"""
function gen_glm(train, formulas, family=Binomial(), link=LogitLink())
    models = [glm(candidate, train, family, link) for candidate in formulas]
    return models
end

gen_glm (generic function with 3 methods)