In [1]:
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import math

In [None]:
from datetime import datetime as dt
from datetime import timedelta as td
from dateutil import rrule
from datetime import date

In [None]:
import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# For local ML
from sklearn.cross_validation import train_test_split

from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm

from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

In [None]:
# For distributed computing (with Spark)
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
#from pyspark.sql.functions import udf
from pyspark.sql.types import *

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.rdd import RDD
from pyspark.files import SparkFiles
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.profiler import Profiler, BasicProfiler

from pyspark.sql import HiveContext
from pyspark.sql import Window
from pyspark.sql.functions import rank, min

In [None]:
def load_data(path, file_name, nrows=None, verbose=True):
    """
    Read a CSV file into a pandas DataFrame, optionally printing a summary.

    :param path: (str) directory that contains the file
    :param file_name: (str) name of the CSV file inside ``path``
    :param nrows: (int or None) forwarded to ``pd.read_csv``; None reads every row
    :param verbose: (bool) when True, print the file location before reading and
                    the shape, column names and head of the frame afterwards
    :return:
            (pandas.DataFrame) the loaded data
    """
    full_path = os.path.join(path, file_name)
    if verbose:
        print('\n#################################')
        print('Loading data from {0}...'.format(full_path))
    frame = pd.read_csv(full_path, nrows=nrows)

    # Quiet mode: nothing left to report.
    if not verbose:
        return frame

    n_rows, n_cols = frame.shape
    print('Dataset num rows: {0}, num cols: {1}'.format(n_rows, n_cols))
    print('Columns: {}'.format(list(frame.columns.values)))
    print('Head: ')
    print(frame.head())
    return frame

In [None]:
# Directory that holds the raw CSV files.
DATA = '/mnt/tests'

# Full path of each raw file. The load_data calls below rebuild the same
# locations from DATA and the file name, so these are not read in this cell.
train_path, test_path, client_data_path, product_data_path, town_state_path = (
    os.path.join(DATA, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'cliente_tabla.csv',
                     'producto_tabla.csv', 'town_state.csv')
)

print('Loading data..')
# Only the first 10**7 rows of the training file are read.
df_train = load_data(DATA, 'train.csv', nrows=10**7)
df_client = load_data(DATA, 'cliente_tabla.csv')
df_prod = load_data(DATA, 'producto_tabla.csv')
df_town = load_data(DATA, 'town_state.csv')
# The 'id' column is dropped so the remaining test columns can be used as the
# list of independent variables.
df_test = load_data(DATA, 'test.csv').drop(['id'], axis=1)

target = 'Demanda_uni_equil'
indep_vars = list(df_test.columns.values)
print(indep_vars)
y = df_train[target]
X = df_train[indep_vars]

In [None]:
def train_gbm(X_train,  y_train, grid_search=False, verbose=True,
              min_samples_split=None, min_samples_leaf=50, max_depth=8,
              max_features='sqrt', sub_sample=0.8, n_estimators=100,
              learning_rate=0.1, random_state=10, param_grid=None,
              ):
    """
    Fit a GradientBoostingClassifier, optionally wrapped in a grid search.

    :param X_train: feature matrix; ``X_train.shape[0]`` is read to size the
                    default ``min_samples_split``, then it is passed to ``fit``
    :param y_train: target values passed to ``fit``
    :param grid_search: (bool) when True, wrap the classifier in GridSearchCV
                        (cv=5, scoring='roc_auc', n_jobs=4) and fit that instead
    :param verbose: (bool) when True and ``grid_search`` is on, print the grid
                    scores, best parameters and best score
    :param min_samples_split: (int or None) minimum samples needed to split a
                              node; when falsy it defaults to 1% of the rows,
                              as an integer and never below 2
    :param min_samples_leaf: (int) prevent overfitting, intuition based value..
    :param max_depth: (int) 8 # 5 -8, based on number of features and dataset size
    :param max_features: (str) 'sqrt' # scikit-learn uses sqrt(n_features) per split
    :param sub_sample: (float) fraction of observations to be selected for each tree (0.8 commonly used value)
    :param n_estimators: (int) number of sequential trees to be modeled
    :param learning_rate: (float)
    :param random_state: (int)
    :param param_grid: (dict) grid for GridSearchCV; when falsy, n_estimators
                       in 20..80 step 10 is searched
    :return:
            (model) the fitted GradientBoostingClassifier, or the fitted
            GridSearchCV wrapper when ``grid_search`` is True
    """
    n_samples = X_train.shape[0]
    if not min_samples_split:
        # prevent overfitting, general rule of thumb: 0.5 - 1% of the rows.
        # It must be an integer count: scikit-learn treats a float as a
        # fraction in (0, 1] and rejects values such as 1000.0.
        # Written as a conditional rather than max(): the notebook's
        # `from pyspark.sql.functions import *` rebinds max/min.
        one_percent = int(n_samples * .01)
        min_samples_split = one_percent if one_percent >= 2 else 2

    gbm = GradientBoostingClassifier(
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        max_depth=max_depth, max_features=max_features, learning_rate=learning_rate,
        n_estimators=n_estimators, subsample=sub_sample, random_state=random_state)
    model = gbm
    if grid_search:
        if not param_grid:
            param_grid = {'n_estimators': list(range(20, 81, 10))}
        model = GridSearchCV(
            estimator=gbm,
            param_grid=param_grid,
            scoring='roc_auc',
            n_jobs=4,
            iid=False,
            cv=5)
    model.fit(X_train, y_train)
    if grid_search and verbose:
        # grid_scores_ is the attribute exposed by the legacy
        # sklearn.grid_search.GridSearchCV that this notebook imports.
        print('Model Grid scores: {0}, best params: {1}, best score: {2}'
              .format(model.grid_scores_, model.best_params_, model.best_score_))
    return model

In [None]:
def gbm_predict(X_train, y_train, X_test, y_test, indep_vars, grid_search=True,
                cv=True, verbose=True, cv_folds=5, scoring='roc_auc'):
    """
    Train a GBM with ``train_gbm`` and report how it scores on ``X_test``.

    :param X_train: training features, passed to ``train_gbm``
    :param y_train: training target, passed to ``train_gbm``
    :param X_test: features the fitted model predicts on and is scored against
    :param y_test: true labels for ``X_test``
    :param indep_vars: (list) feature names, used as the index of the
                       feature-relevance plot
    :param grid_search: (bool) forwarded to ``train_gbm``
    :param cv: (bool) when True, run ``cross_val_score`` on (X_test, y_test)
    :param verbose: (bool) when True, print the report and plot feature relevance
    :param cv_folds: (int) number of folds for ``cross_val_score``
    :param scoring: (str) scoring name for ``cross_val_score``
    :return:
            (model) the fitted object returned by ``train_gbm``
    """
    # Train
    model = train_gbm(X_train,  y_train, grid_search=grid_search, verbose=verbose)
    # Predicted labels for X_test
    test_pred = model.predict(X_test)
    # Probability of the second class (column 1 of predict_proba)
    test_pred_prob = model.predict_proba(X_test)[:, 1]

    # Cross-validation
    cv_score = None
    if cv:
        cv_score = cross_validation.cross_val_score(model, X_test,
                                                    y_test, cv=cv_folds,
                                                    scoring=scoring)
    if verbose:
        print("\nModel Report")
        # Metrics compare the true labels (y_test) with the predictions.
        print("Accuracy : %.4g" % metrics.accuracy_score(y_test, test_pred))
        print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, test_pred_prob))
        if cv:
            print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g"
                  % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
        # feature relevance: a GridSearchCV wrapper has no feature_importances_,
        # so read it from the refitted best estimator when there is one.
        fitted = getattr(model, 'best_estimator_', model)
        predictive_relevance = pd.Series(fitted.feature_importances_, indep_vars).sort_values(ascending=False)
        predictive_relevance.plot(kind='bar', title='Feature relevance')
        plt.ylabel('Feature relevance score')
    return model

# Loading the big guns - enter Spark

In [None]:
# Build the Spark configuration object for this session.
MY_NAME = 'diogo'  # change this
conf = (
    SparkConf()
    .setMaster("yarn-client")
    .setAppName("bimbo - {}".format(MY_NAME))
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "2g")
    .set("spark.executor.instances", "2")
    .set("spark.dynamicAllocation.enabled", "true")
)

In [None]:
# Create the SparkContext from `conf`, then wrap it in a SQLContext; the
# loading cells below read the CSV files through `sqlContext.read`.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Alternative (disabled): use a HiveContext in place of the plain SQLContext.
#sqlContext = HiveContext(sc)

In [None]:
# Locations of the five CSV files handed to the Spark CSV reader below.
# NOTE(review): despite the `s3_` prefix these are plain relative paths (no
# scheme, no leading slash), unlike DATA = '/mnt/tests' used for the pandas
# loads - confirm they resolve where intended on the cluster's filesystem.
s3_train = 'mnt/tests/train.csv'
s3_test = 'mnt/tests/test.csv'
s3_client = 'mnt/tests/cliente_tabla.csv'
s3_prod = 'mnt/tests/producto_tabla.csv'
s3_town = 'mnt/tests/town_state.csv'

In [None]:
print('Loading datasets from hdfs')
print('Loading train set...')
# Read the train CSV through the spark-csv data source, taking column names
# from the header row and letting Spark infer the column types.
df_train = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(s3_train)
)
# Optional (disabled): persist the frame as parquet.
#df_train.select("*").write.save("{}.parquet".format("train_bimbo"), format="parquet")
print('finished loading')
# Show the schema of the frame that was just loaded. The original printed
# df_test here, which at this point is still the pandas frame from the local
# load and has no printSchema().
df_train.printSchema()

In [None]:
print('Loading test set...')
# Read the test CSV through the spark-csv data source: header row supplies
# the column names, column types are inferred.
df_test = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(s3_test)
)
# Optional (disabled): persist the frame as parquet.
#df_test.select("*").write.save("{}.parquet".format("test_bimbo"), format="parquet")
print('finished loading')
df_test.printSchema()

In [None]:
print('Loading client set...')
# Read the client table through the spark-csv data source: header row
# supplies the column names, column types are inferred.
df_client = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(s3_client)
)
# Optional (disabled): persist the frame as parquet.
#df_client.select("*").write.save("{}.parquet".format("client_bimbo"), format="parquet")
print('finished loading')
df_client.printSchema()

In [None]:
# Read the product table through the spark-csv data source: header row
# supplies the column names, column types are inferred.
df_prod = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(s3_prod)
)
# Optional (disabled): persist the frame as parquet.
#df_prod.select("*").write.save("{}.parquet".format("prod_bimbo"), format="parquet")
print('finished loading')
df_prod.printSchema()

In [None]:
# Read the town/state table through the spark-csv data source: header row
# supplies the column names, column types are inferred.
df_town = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(s3_town)
)
# Optional (disabled): persist the frame as parquet.
#df_town.select("*").write.save("{}.parquet".format("town_bimbo"), format="parquet")
print('finished loading')
df_town.printSchema()

In [None]:
# Print the schema of the Spark train DataFrame followed by its first 6 rows.
print('Showing head')
df_train.printSchema()
df_train.show(6)

In [3]:
# Progress message only; the xgboost import itself is in the next cell.
print('Loading xgboost package..')

Loading xgboost package..


In [9]:
# Make sure you set up your environment correctly: https://github.com/dmlc/xgboost/blob/master/doc/build.md#python-package-installation
import xgboost as xgb

In [3]:
# Resolve the working directory explicitly: os.path.curdir is just the
# constant '.', which made the message below uninformative.
curr_dir = os.getcwd()
print('Current directory is: {0}'.format(curr_dir))
# The local copies of the CSV files are expected under <working dir>/data.
data_path = os.path.join(curr_dir, 'data')
train_path = os.path.join(data_path, 'train.csv')
test_path = os.path.join(data_path, 'test.csv')
client_data_path = os.path.join(data_path, 'cliente_tabla.csv')
product_data_path = os.path.join(data_path, 'producto_tabla.csv')
town_state_path = os.path.join(data_path, 'town_state.csv')
print('Reading data from {0}...'.format(curr_dir))

In [None]:
# Load four of the CSV files with pandas from the paths built in the previous
# cell (town_state_path is not read here).
# NOTE: this rebinds df_train / df_test / df_client / df_prod, which earlier
# cells assigned to Spark DataFrames.
# The message reports curr_dir, while the files are read from the 'data'
# sub-directory beneath it.
print('Reading data from {0}...'.format(curr_dir))
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_client = pd.read_csv(client_data_path)
df_prod = pd.read_csv(product_data_path)
print('Finished loading data.')
# (rows, columns) of the full training frame.
print(df_train.shape)