# MLOps HW2 Gradient Boosting Model

In [0]:
import sys
import dataiku
import numpy as np
import pandas as pd
import sklearn as sk
import dataiku.core.pandasutils as pdu
from dataiku.doctor.preprocessing import PCA
from collections import defaultdict, Counter

In [0]:
# Widen pandas' display limits so wide / long frames are rendered in full.
for _option_name, _option_value in (
    ('display.width', 3000),
    ('display.max_rows', 200),
    ('display.max_columns', 200),
):
    pd.set_option(_option_name, _option_value)

#### Importing base data

In [0]:
# We apply the preparation that you defined. You should not modify this.
# The single preparation step is a RoundProcessor that rounds the
# 'Chance of Admit' column (mode ROUND, 0 places); the output schema below
# declares that column as bigint.
preparation_steps = [{'type': 'RoundProcessor', 'params': {'mode': 'ROUND', 'places': 0, 'precision': 0, 'appliesTo': 'SINGLE_COLUMN', 'columns': ['Chance of Admit']}, 'metaType': 'PROCESSOR', 'preview': True, 'disabled': False, 'alwaysShowComment': False}]
preparation_output_schema = {'columns': [{'name': 'Serial No.', 'type': 'bigint'}, {'name': 'GRE Score', 'type': 'bigint'}, {'name': 'TOEFL Score', 'type': 'bigint'}, {'name': 'University Rating', 'type': 'bigint'}, {'name': 'CGPA', 'type': 'double'}, {'name': 'Research', 'type': 'bigint'}, {'name': 'Chance of Admit', 'type': 'bigint'}, {'name': 'SOP', 'type': 'double'}, {'name': 'LOR', 'type': 'double'}], 'userModified': False}

# Attach the preparation steps to the dataset handle, then load it as a
# pandas DataFrame (timed with the %time magic).
# NOTE(review): limit = 100000 presumably caps the number of rows read -- confirm
# against the dataiku Dataset.get_dataframe documentation.
ml_dataset_handle = dataiku.Dataset('US_graduate_schools_admission_parameters_dataset_joined')
ml_dataset_handle.set_preparation_steps(preparation_steps, preparation_output_schema)
%time ml_dataset = ml_dataset_handle.get_dataframe(limit = 100000)

print ('Base data has %i rows and %i columns' % (ml_dataset.shape[0], ml_dataset.shape[1]))
# First five records
ml_dataset.head(5)

#### Initial data management

In [0]:
# Keep only the model inputs plus the raw target column ('Chance of Admit').
# The explicit .copy() detaches the selection from the loaded frame: later
# cells assign columns on `ml_dataset`, and doing that on a bare selection is
# the pattern that triggers pandas' SettingWithCopyWarning.
ml_dataset = ml_dataset[['SOP', 'University Rating', 'GRE Score', 'CGPA', 'Research', 'Chance of Admit', 'LOR', 'TOEFL Score']].copy()

In [0]:
# astype('unicode') does not work as expected

def coerce_to_unicode(x):
    """Return ``x`` as a text (unicode) string.

    Byte strings are decoded as UTF-8 on both interpreters (``str`` on
    Python 2, ``bytes`` on Python 3). Any other value is passed to the
    interpreter's text constructor (``unicode(x)`` on Python 2, ``str(x)``
    on Python 3).

    Args:
        x: any value (cell content of a categorical / text column, or a
            computed fill value).

    Returns:
        The text representation of ``x``.

    Raises:
        UnicodeDecodeError: if ``x`` is a byte string that is not valid UTF-8.
    """
    if sys.version_info < (3, 0):
        if isinstance(x, str):
            return unicode(x,'utf-8')
        return unicode(x)
    if isinstance(x, bytes):
        # str(b'abc') would produce the repr "b'abc'" instead of the text, so
        # decode explicitly to mirror the Python 2 branch.
        return x.decode('utf-8')
    return str(x)


# Role of each input column in the preprocessing below.
categorical_features = []
numerical_features = ['SOP', 'University Rating', 'GRE Score', 'CGPA', 'Research', 'LOR', 'TOEFL Score']
text_features = []
from dataiku.doctor.utils import datetime_to_epoch

# Categorical columns first, then text columns: both are coerced to unicode text.
for feature in categorical_features + text_features:
    ml_dataset[feature] = ml_dataset[feature].apply(coerce_to_unicode)

# Numerical columns: datetime64[ns] columns (directly, or through the dtype's
# `base`) go through dataiku's datetime_to_epoch; all others are cast to double.
_datetime_dtype = np.dtype('M8[ns]')
for feature in numerical_features:
    column_dtype = ml_dataset[feature].dtype
    is_datetime = column_dtype == _datetime_dtype or (
        hasattr(column_dtype, 'base') and column_dtype.base == _datetime_dtype
    )
    if is_datetime:
        ml_dataset[feature] = datetime_to_epoch(ml_dataset[feature])
    else:
        ml_dataset[feature] = ml_dataset[feature].astype('double')

In [0]:
# Encode the raw target: each value is stringified and looked up in target_map;
# anything that is not exactly '0' or '1' becomes NaN and is dropped below.
# NOTE(review): this relies on str(value) being '0' / '1'. If the column were
# loaded as float, str() would give '0.0' / '1.0' and every row would be
# dropped -- confirm the column is integer-typed.
target_map = {'0': 0, '1': 1}
ml_dataset['__target__'] = ml_dataset['Chance of Admit'].map(str).map(target_map)
del ml_dataset['Chance of Admit']


# Remove rows for which the target is unknown.
# .copy() makes the filtered frame independent, so the column assignment right
# below does not write into a slice (SettingWithCopyWarning).
ml_dataset = ml_dataset[~ml_dataset['__target__'].isnull()].copy()

# With the NaN rows gone the target can be stored as a plain integer column.
ml_dataset['__target__'] = ml_dataset['__target__'].astype(np.int64)

#### Cross-validation strategy

In [0]:
# Split the prepared data into a training part and a held-out test part using
# dataiku's helper (called with prop=0.8), then report both shapes.
# NOTE(review): no seed is passed here -- verify whether the split is
# reproducible across runs.
train, test = pdu.split_train_valid(ml_dataset, prop=0.8)
print ('Train data has %i rows and %i columns' % train.shape)
print ('Test data has %i rows and %i columns' % test.shape)

#### Features preprocessing

In [0]:
drop_rows_when_missing = []
impute_when_missing = [{'feature': 'SOP', 'impute_with': 'MEAN'}, {'feature': 'University Rating', 'impute_with': 'MEAN'}, {'feature': 'GRE Score', 'impute_with': 'MEAN'}, {'feature': 'CGPA', 'impute_with': 'MEAN'}, {'feature': 'Research', 'impute_with': 'MEAN'}, {'feature': 'LOR', 'impute_with': 'MEAN'}, {'feature': 'TOEFL Score', 'impute_with': 'MEAN'}]

# Features for which we drop rows with missing values
for feature in drop_rows_when_missing:
    train = train[train[feature].notnull()]
    test = test[test[feature].notnull()]
    print ('Dropped missing records in %s' % feature)

# Features for which we impute missing values.
# The fill value is always derived from the training set and then applied to
# both the training and the test set.
for rule in impute_when_missing:
    column = rule['feature']
    method = rule['impute_with']
    if method == 'MEAN':
        v = train[column].mean()
    elif method == 'MEDIAN':
        v = train[column].median()
    elif method == 'CREATE_CATEGORY':
        v = 'NULL_CATEGORY'
    elif method == 'MODE':
        # value_counts() sorts by frequency, so the first index is the mode.
        v = train[column].value_counts().index[0]
    elif method == 'CONSTANT':
        v = rule['value']
    else:
        # Without this branch `v` would be unbound (first rule) or would
        # silently carry over the previous feature's fill value.
        raise ValueError('Unknown imputation method %r for feature %s' % (method, column))
    train[column] = train[column].fillna(v)
    test[column] = test[column].fillna(v)
    print ('Imputed missing values in feature %s with value %s' % (column, coerce_to_unicode(v)))

In [0]:
# Rescaling method per feature. 'MINMAX' uses (x - min) / (max - min); any other
# value (here 'AVGSTD') uses (x - mean) / std. Statistics always come from the
# training set and are applied to both sets.
rescale_features = {'SOP': 'AVGSTD', 'University Rating': 'AVGSTD', 'GRE Score': 'AVGSTD', 'CGPA': 'AVGSTD', 'Research': 'AVGSTD', 'LOR': 'AVGSTD', 'TOEFL Score': 'AVGSTD'}
for feature_name, rescale_method in rescale_features.items():
    train_column = train[feature_name]
    if rescale_method == 'MINMAX':
        shift = train_column.min()
        scale = train_column.max() - shift
    else:
        shift = train_column.mean()
        scale = train_column.std()

    # A zero scale means the feature is constant on the training set: drop it
    # from both sets instead of dividing by zero.
    if scale == 0.:
        del train[feature_name]
        del test[feature_name]
        print ('Feature %s was dropped because it has no variance' % feature_name)
        continue

    print ('Rescaled %s' % feature_name)
    train[feature_name] = (train_column - shift).astype(np.float64) / scale
    test[feature_name] = (test[feature_name] - shift).astype(np.float64) / scale

#### Modeling

In [0]:
# Feature matrices: every column except the encoded target.
X_train, X_test = (frame.drop('__target__', axis=1) for frame in (train, test))

# Target vectors as plain numpy arrays.
y_train, y_test = (np.array(frame['__target__']) for frame in (train, test))

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier with a fixed seed for reproducible fits.
# `loss` is deliberately left at its default: the default classification loss
# is the logistic deviance, which older scikit-learn releases spell 'deviance'
# and newer ones spell 'log_loss' ('deviance' is deprecated since 1.1 and
# rejected from 1.3 on). Omitting the argument selects the same loss on every
# version instead of failing on recent ones.
clf = GradientBoostingClassifier(
                    random_state = 1337,
                    verbose = 0,
                    n_estimators = 20,
                    learning_rate = 0.36666667,
                    max_depth = 2
                   )

In [0]:
# NOTE(review): class_weight is not among GradientBoostingClassifier's
# documented constructor parameters, so the estimator itself presumably never
# reads this attribute -- confirm. Class balancing only takes effect if fit()
# is given matching sample weights.
clf.class_weight = "balanced"

In [0]:
%time clf.fit(X_train, y_train)

In [0]:
%time _predictions = clf.predict(X_test)
%time _probas = clf.predict_proba(X_test)
predictions = pd.Series(data=_predictions, index=X_test.index, name='predicted_value')
cols = [
    u'probability_of_value_%s' % label
    for (_, label) in sorted([(int(target_map[label]), label) for label in target_map])
]
probabilities = pd.DataFrame(data=_probas, index=X_test.index, columns=cols)

# Build scored dataset
results_test = X_test.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test['__target__'], how='left')
results_test = results_test.rename(columns= {'__target__': 'Chance of Admit'})

#### Results

In [0]:
from dataiku.doctor.utils.metrics import mroc_auc_score

# Wrap the true labels in a Series and pass them, with the predicted class
# probabilities, to dataiku's AUC helper.
y_test_ser = pd.Series(y_test)
auc_value = mroc_auc_score(y_test_ser, _probas)

print ('AUC value:', auc_value)

In [0]:
# Reverse lookup (encoded class -> original label), used to display the
# predictions with their original label values.
inv_map = dict((code, label) for (label, code) in target_map.items())
predictions.map(inv_map)