## DIEGO CUARTAS
Master in Business Analytics & Big Data - Risk & Fraud Analytics

### Importing Libraries and Datasets

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import random
import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from requests.auth import HTTPBasicAuth
from IPython.display import clear_output
from deap import creator, base, tools, algorithms
from sklearn.impute import SimpleImputer

### Data Loading

In [2]:
df = pd.read_csv("dev.csv") 
dfo = pd.read_csv("oot0.csv")

### Data Preparation

#### OOT Censoring

In [3]:
def is_min_max_censoring(value,dev_min, dev_max):
    try:
        if value < dev_min:  
            return dev_min
        if value > dev_max: 
            return dev_max
    except:
        value
    return value

for col in [i for i in list(df.columns) if i not in ['ob_target','id']]:  
    dev_min = df[col].min()
    dev_max = df[col].max()
    dfo[col] = np.vectorize(is_min_max_censoring)(dfo[col],dev_min,dev_max)

####  Dummy: Categorical Nominal

In [4]:
for varname in ['icn_var_22','icn_var_23','icn_var_24']:
    df_temp = pd.get_dummies(df[varname])
    dfo_temp = pd.get_dummies(dfo[varname])
    new_dummy_names = []
    for colname in df_temp.columns:
        new_dummy_names.append(str(varname) + '_'+str(colname))
    df_temp.columns = new_dummy_names
    dfo_temp.columns = new_dummy_names
    df = pd.concat([df,df_temp],axis=1)
    dfo = pd.concat([dfo,dfo_temp],axis=1) 

in_model = [i for i in list(df.columns) if i not in ['ob_target','id']]
output_var = 'ob_target'

#### OOT NaN Values - Mean Imputer

In [10]:
X = df[in_model]
y = df[output_var]
Xo = dfo[in_model]

# Imputing NaN values OOT
imp = SimpleImputer(strategy='mean')
imp.fit(Xo)
Xo = imp.transform(Xo)

### Modeling

#### Baseline RFC

In [11]:
RFC = RandomForestClassifier(random_state=25)
fitted_model = RFC.fit(X, y)
pred_dev = fitted_model.predict_proba(X)[:,1]
pred_oot  = fitted_model.predict_proba(Xo)[:,1]

#### Parameters Optimization for RFC

In [12]:
def Grid_Search(X, y, Xo):
    iteration = 0
    url = 'http://mfalonso.pythonanywhere.com/api/v1.0/uploadpredictions'
    Hyperparameters = pd.DataFrame(columns = ['KS_OOT','GINI_OOT','GRADE','param_grid'])
    C1 = []
    C2 = []
    C3 = []
    C4 = []
    RFC = RandomForestClassifier()
    for max_depth in [70, 80, 90, 100]:
        for n_estimators in [5000, 6000, 7000]:
            for max_features in ["log2"]:
                for min_samples_split in [2,4]:
                    for min_samples_leaf in [2]:
                        for bootstrap in [False]:
                
                            clear_output(wait=True)
                            iteration = iteration + 1
                            print("Running Iteration #" + str(iteration))
                            param_grid = {
                                'max_depth'         : max_depth,
                                'n_estimators'      : n_estimators,
                                'max_features'      : max_features,
                                'min_samples_split' : min_samples_split,
                                'min_samples_leaf'  : min_samples_leaf,
                                'bootstrap'         : bootstrap,
                            }
                            RFC = RandomForestClassifier(random_state=25, n_jobs=-1).set_params(**param_grid)
                            RFC.fit(X, y)
                            pred_oot = RFC.predict_proba(Xo)[:,1]

                            # OOT Prediction
                            dfo['pred'] = pred_oot
                            dfo_tosend = dfo[list(['id','pred'])]

                            # Submission
                            i=1
                            filename = "student_sub"+str(i)+".csv"
                            dfo_tosend.to_csv(filename, sep=',')
                            files = {'file': (filename, open(filename, 'rb'))}
                            rsub = requests.post(url, files=files, auth=HTTPBasicAuth('Diego.Cuartas', 'Cuartas3458!'))
                            resp_str = str(rsub.text)

                            # Results
                            KS_GINI_GRADE_OOT = resp_str.replace(' ','').replace('KS2=','').replace ('GINI=','').replace ('RESULTSUBMISSION:','').replace ('GRADE=','').split(';')
                            KS_OOT = float(KS_GINI_GRADE_OOT[0])
                            GINI_OOT = float(KS_GINI_GRADE_OOT[1])
                            GRADE = float(KS_GINI_GRADE_OOT[2])
                            C1.append(KS_OOT)
                            C2.append(GINI_OOT)
                            C3.append(GRADE)
                            C4.append(param_grid)
                            time.sleep(20)

    Hyperparameters['KS_OOT'] = C1
    Hyperparameters['GINI_OOT'] = C2
    Hyperparameters['GRADE'] = C3
    Hyperparameters['param_grid'] = C4
   
    best_parameters = Hyperparameters.sort_values('KS_OOT', ascending = False).head(1)['param_grid']
    return best_parameters, Hyperparameters, RFC

In [13]:
best_parameters, Hyperparameters, model = Grid_Search(X, y, Xo)
pd.set_option('display.max_colwidth', -1)
Hyperparameters.sort_values('GRADE', ascending=False).head(5)

Running Iteration #24


Unnamed: 0,KS_OOT,GINI_OOT,GRADE,param_grid
9,0.38207,0.512858,9.054,"{'max_depth': 80, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}"
2,0.38207,0.512858,9.054,"{'max_depth': 70, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False}"
3,0.38207,0.512858,9.054,"{'max_depth': 70, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}"
15,0.38207,0.512858,9.054,"{'max_depth': 90, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}"
14,0.38207,0.512858,9.054,"{'max_depth': 90, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False}"


In [14]:
best_parameters

9    {'max_depth': 80, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}
Name: param_grid, dtype: object

In [15]:
best_parameters = {'max_depth': 80, 'n_estimators': 6000, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}

In [16]:
Optimized_Model = RandomForestClassifier(random_state=25,n_jobs=-1).set_params(**best_parameters)
Optimized_Model.fit(X,y)
pred_dev = Optimized_Model.predict_proba(X)[:,1]
pred_oot = Optimized_Model.predict_proba(Xo)[:,1]

#### Professor Recomendation: RFC Parameters:

In [17]:
Professor_Model=RandomForestClassifier(n_estimators=10000, oob_score=True, random_state=25 ,max_features='log2', bootstrap = 'True', min_samples_split = 2, min_samples_leaf = 1, criterion = 'entropy', n_jobs=5)
Professor_Model.fit(X,y)
pred_dev = Professor_Model.predict_proba(X)[:,1]
pred_oot = Professor_Model.predict_proba(Xo)[:,1]

### Genetic Algortithm - Feature Selection

In [9]:

#SETING UP THE GENETIC ALGORITHM and CALCULATING STARTING POOL (STARTING CANDIDATE POPULATION)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(list(in_model)))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def evalOneMax(individual):
    return sum(individual),

toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

NPOPSIZE = 80 
population = toolbox.population(n=NPOPSIZE)


#ASSESSING GINI ON THE STARTING POOL

dic_gini={}
for i in range(np.shape(population)[0]): 

    # TRASLATING DNA INTO LIST OF VARIABLES (1-81)
    var_model = []    
    for j in range(np.shape(population)[0]): 
        if (population[i])[j]==1:
            var_model.append(list(list(in_model))[j])

    # ASSESSING GINI INDEX FOR EACH INVIVIDUAL IN THE INITIAL POOL 
            
    X_train=df[var_model]
    Y_train=df[output_var]
    Xo = dfo[var_model]
    # Imputing NaN values OOT
    imp = SimpleImputer(strategy='mean')
    imp.fit(Xo)
    Xo = imp.transform(Xo)

    # BEST RANDOM FOREST CLASSIFIER

    RFC = RandomForestClassifier(n_estimators=10000, oob_score=True, random_state= 25 ,max_features='log2', bootstrap = 'True', min_samples_split = 2, min_samples_leaf = 1, criterion = 'entropy', n_jobs=5)
    model = RFC.fit(X_train, Y_train)  
    Y_predict = model.predict_proba(Xo)[:,1]

    # OOT Prediction
    dfo['pred'] = Y_predict
    dfo_tosend = dfo[list(['id','pred'])]

    # Submission
    iteration=1
    filename = "student_sub"+str(iteration)+".csv"
    dfo_tosend.to_csv(filename, sep=',')
    url = 'http://mfalonso.pythonanywhere.com/api/v1.0/uploadpredictions'
    files = {'file': (filename, open(filename, 'rb'))}
    rsub = requests.post(url, files=files, auth=HTTPBasicAuth('Diego.Cuartas', 'Cuartas3458!'))
    resp_str = str(rsub.text)
    time.sleep(20)

    # Results
    KS_GINI_GRADE_OOT = resp_str.replace(' ','').replace('KS2=','').replace ('GINI=','').replace ('RESULTSUBMISSION:','').replace ('GRADE=','').split(';')
    KS_OOT = float(KS_GINI_GRADE_OOT[0])
    GINI_OOT = float(KS_GINI_GRADE_OOT[1])
    GRADE = float(KS_GINI_GRADE_OOT[2])

    if GRADE > 9.8:
        print("Grade: " + str(GRADE) + ", KS: " + str(KS_OOT) + ", GINI: " + str(GINI_OOT))
        print(var_model)

                        
    gini_power = GINI_OOT         
    
    gini=str(gini_power)+";"+str(population[i]).replace('[','').replace(', ','').replace(']','')
    dic_gini[gini]=population[i]   
    
list_gini=sorted(dic_gini.keys(),reverse=True)


#GENETIC ALGORITHM MAIN LOOP - START
# - ITERATING MANY TIMES UNTIL NO IMPROVMENT HAPPENS IN ORDER TO FIND THE OPTIMAL SET OF CHARACTERISTICS (VARIABLES)

sum_current_gini=0.0
sum_current_gini_1=0.0
sum_current_gini_2=0.0
first=0    
OK = 1
a=0
while OK:  #REPEAT UNTIL IT DO NOT IMPROVE, AT LEAST A LITLE, THE GINI IN 2 GENERATIONS
    a=a+1
    print ('loop ', a)
    OK=0

    # GENERATING OFFSPRING - START
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1) #CROSS-X PROBABILITY = 50%, MUTATION PROBABILITY=10%
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population =toolbox.select(offspring, k=len(population))
    # GENERATING OFFSPRING - END

    sum_current_gini_2=sum_current_gini_1
    sum_current_gini_1=sum_current_gini
    sum_current_gini=0.0

    #ASSESSING GINI ON THE OFFSPRING - START

    for j in range(np.shape(population)[0]): 
        if population[j] not in dic_gini.values(): 
            var_model = [] 
            for i in range(np.shape(population)[0]): 
                if (population[j])[i]==1:
                    var_model.append(list(list(in_model))[i])
                    
            
            X_train=df[var_model]
            Y_train=df[output_var]
            Xo = dfo[var_model]
            # Imputing NaN values OOT
            imp = SimpleImputer(strategy='mean')
            imp.fit(Xo)
            Xo = imp.transform(Xo)
            
       
            # BEST RANDOM FOREST CLASSIFIER
            RFC = RandomForestClassifier(n_estimators=10000, oob_score=True, random_state=25 ,max_features='log2', bootstrap = 'True', min_samples_split = 2, min_samples_leaf = 1, criterion = 'entropy', n_jobs=5)
            model = RFC.fit(X_train, Y_train)  
            Y_predict = model.predict_proba(Xo)[:,1]

            # OOT Prediction
            dfo['pred'] = Y_predict
            dfo_tosend = dfo[list(['id','pred'])]

            # Submission
            iteration=1
            filename = "student_sub"+str(iteration)+".csv"
            dfo_tosend.to_csv(filename, sep=',')
            url = 'http://mfalonso.pythonanywhere.com/api/v1.0/uploadpredictions'
            files = {'file': (filename, open(filename, 'rb'))}
            rsub = requests.post(url, files=files, auth=HTTPBasicAuth('Diego.Cuartas', 'Cuartas3458!'))
            resp_str = str(rsub.text)
            time.sleep(20)

            # Results
            KS_GINI_GRADE_OOT = resp_str.replace(' ','').replace('KS2=','').replace ('GINI=','').replace ('RESULTSUBMISSION:','').replace ('GRADE=','').split(';')
            KS_OOT = float(KS_GINI_GRADE_OOT[0])
            GINI_OOT = float(KS_GINI_GRADE_OOT[1])
            GRADE = float(KS_GINI_GRADE_OOT[2])

            if GRADE > 9.8:
                print("Grade: " + str(GRADE) + ", KS: " + str(KS_OOT) + ", GINI: " + str(GINI_OOT))
                print(var_model)
          
            gini_power = GINI_OOT  

            gini=str(gini_power)+";"+str(population[j]).replace('[','').replace(', ','').replace(']','')
            dic_gini[gini]=population[j]  

    #ASSESSING GINI ON THE OFFSPRING - END

    #SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING - START
         
    list_gini=sorted(dic_gini.keys(),reverse=True)
    population=[]
    for i in list_gini[:NPOPSIZE]:
        population.append(dic_gini[i])
        gini=float(i.split(';')[0])
        sum_current_gini+=gini
        
    #SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING - END     
      
    #HAS IT IMPROVED AT LEAST A LITLE THE GINI IN THE LAST 2 GENERATIONS
    print ('sum_current_gini=', sum_current_gini, 'sum_current_gini_1=', sum_current_gini_1, 'sum_current_gini_2=', sum_current_gini_2)
    if(sum_current_gini>sum_current_gini_1+0.0001 or sum_current_gini>sum_current_gini_2+0.0001):
        OK=1

#GENETIC ALGORITHM MAIN LOOP - END

gini_max=list_gini[0]        
gini=float(gini_max.split(';')[0])
features=gini_max.split(';')[1]

# PRINTING OUT THE LIST OF FEATURES
f=0
for i in range(len(features)):
    if features[i]=='1':
        f+=1
        print ('feature ', f, ':', list(list(in_model))[i])
print ('gini: ', gini)


loop  1
sum_current_gini= 39.155230177731006 sum_current_gini_1= 0.0 sum_current_gini_2= 0.0
loop  2
sum_current_gini= 39.882690690303 sum_current_gini_1= 39.155230177731006 sum_current_gini_2= 0.0
loop  3
sum_current_gini= 40.261761892440994 sum_current_gini_1= 39.882690690303 sum_current_gini_2= 39.155230177731006
loop  4
sum_current_gini= 40.695864795424995 sum_current_gini_1= 40.261761892440994 sum_current_gini_2= 39.882690690303
loop  5
sum_current_gini= 41.001111378108995 sum_current_gini_1= 40.695864795424995 sum_current_gini_2= 40.261761892440994
loop  6
sum_current_gini= 41.312834453183015 sum_current_gini_1= 41.001111378108995 sum_current_gini_2= 40.695864795424995
loop  7
sum_current_gini= 41.68121124040199 sum_current_gini_1= 41.312834453183015 sum_current_gini_2= 41.001111378108995
loop  8
sum_current_gini= 41.85896935276799 sum_current_gini_1= 41.68121124040199 sum_current_gini_2= 41.312834453183015
loop  9
sum_current_gini= 42.110332581631 sum_current_gini_1= 41.85896935

Grade: 9.899, KS: 0.4177441566, GINI: 0.560239
['ib_var_2', 'ib_var_3', 'ib_var_4', 'ib_var_7', 'ib_var_8', 'ib_var_9', 'ib_var_10', 'ib_var_11', 'ib_var_13', 'ib_var_15', 'ib_var_17', 'ib_var_18', 'ib_var_19', 'ib_var_20', 'ib_var_21', 'icn_var_23', 'icn_var_24', 'ico_var_26', 'ico_var_28', 'ico_var_30', 'ico_var_32', 'ico_var_33', 'ico_var_35', 'ico_var_36', 'ico_var_37', 'ico_var_39', 'ico_var_41', 'ico_var_42', 'ico_var_45', 'ico_var_46', 'ico_var_48', 'ico_var_49', 'ico_var_50', 'ico_var_51', 'ico_var_53', 'ico_var_54', 'ico_var_55', 'ico_var_56', 'ico_var_57', 'ico_var_58', 'ico_var_61', 'ico_var_63', 'ico_var_64', 'if_var_65', 'if_var_66', 'if_var_67', 'if_var_69', 'if_var_71', 'if_var_73', 'if_var_74', 'if_var_75', 'if_var_76', 'if_var_78', 'if_var_79', 'if_var_80']
sum_current_gini= 44.34089615250501 sum_current_gini_1= 44.28444069024398 sum_current_gini_2= 44.244032507174985
loop  31
Grade: 9.887, KS: 0.4172289023, GINI: 0.558648878455
['ib_var_1', 'ib_var_2', 'ib_var_3', 'ib

Grade: 9.948, KS: 0.419794776292, GINI: 0.560239
['ib_var_2', 'ib_var_3', 'ib_var_4', 'ib_var_5', 'ib_var_6', 'ib_var_7', 'ib_var_10', 'ib_var_11', 'ib_var_12', 'ib_var_13', 'ib_var_14', 'ib_var_15', 'ib_var_16', 'ib_var_17', 'ib_var_18', 'ib_var_19', 'ib_var_20', 'ib_var_21', 'icn_var_23', 'icn_var_24', 'ico_var_26', 'ico_var_28', 'ico_var_30', 'ico_var_32', 'ico_var_33', 'ico_var_34', 'ico_var_35', 'ico_var_36', 'ico_var_37', 'ico_var_38', 'ico_var_39', 'ico_var_40', 'ico_var_43', 'ico_var_44', 'ico_var_45', 'ico_var_46', 'ico_var_48', 'ico_var_49', 'ico_var_51', 'ico_var_53', 'ico_var_54', 'ico_var_55', 'ico_var_56', 'ico_var_57', 'ico_var_58', 'ico_var_64', 'if_var_65', 'if_var_66', 'if_var_67', 'if_var_69', 'if_var_72', 'if_var_73', 'if_var_74', 'if_var_77', 'if_var_78', 'if_var_79']
Grade: 9.907, KS: 0.418088429653, GINI: 0.555832308986
['ib_var_2', 'ib_var_4', 'ib_var_6', 'ib_var_10', 'ib_var_11', 'ib_var_12', 'ib_var_13', 'ib_var_14', 'ib_var_15', 'ib_var_16', 'ib_var_17', 'ib_

Grade: 9.837, KS: 0.415108965887, GINI: 0.558170593074
['ib_var_2', 'ib_var_3', 'ib_var_4', 'ib_var_5', 'ib_var_6', 'ib_var_7', 'ib_var_8', 'ib_var_9', 'ib_var_10', 'ib_var_11', 'ib_var_13', 'ib_var_14', 'ib_var_15', 'ib_var_17', 'ib_var_18', 'ib_var_19', 'ib_var_20', 'ib_var_21', 'icn_var_23', 'icn_var_24', 'ico_var_26', 'ico_var_28', 'ico_var_30', 'ico_var_32', 'ico_var_33', 'ico_var_34', 'ico_var_36', 'ico_var_37', 'ico_var_39', 'ico_var_40', 'ico_var_43', 'ico_var_44', 'ico_var_45', 'ico_var_46', 'ico_var_48', 'ico_var_49', 'ico_var_50', 'ico_var_51', 'ico_var_53', 'ico_var_54', 'ico_var_55', 'ico_var_56', 'ico_var_57', 'ico_var_58', 'ico_var_60', 'ico_var_61', 'if_var_65', 'if_var_67', 'if_var_69', 'if_var_71', 'if_var_73', 'if_var_74', 'if_var_77', 'if_var_78', 'if_var_79']
Grade: 9.864, KS: 0.416275797373, GINI: 0.553060795386
['ib_var_2', 'ib_var_3', 'ib_var_4', 'ib_var_5', 'ib_var_6', 'ib_var_8', 'ib_var_10', 'ib_var_11', 'ib_var_13', 'ib_var_14', 'ib_var_15', 'ib_var_17', 'ib

KeyboardInterrupt: 

### Final Model with GA Feature Selection

In [18]:
final_variables = ['ib_var_2', 'ib_var_3', 'ib_var_6', 'ib_var_7', 'ib_var_8', 'ib_var_9', 'ib_var_10', 'ib_var_11', 'ib_var_12', 'ib_var_13', 'ib_var_14', 'ib_var_15', 'ib_var_17', 'ib_var_18', 'ib_var_19', 'ib_var_20', 'ib_var_21', 'icn_var_22', 'icn_var_23', 'icn_var_24', 'ico_var_26', 'ico_var_28', 'ico_var_30', 'ico_var_32', 'ico_var_33', 'ico_var_34', 'ico_var_35', 'ico_var_36', 'ico_var_37', 'ico_var_38', 'ico_var_39', 'ico_var_40', 'ico_var_43', 'ico_var_44', 'ico_var_45', 'ico_var_46', 'ico_var_48', 'ico_var_49', 'ico_var_51', 'ico_var_53', 'ico_var_54', 'ico_var_55', 'ico_var_56', 'ico_var_57', 'ico_var_58', 'ico_var_61', 'ico_var_63', 'ico_var_64', 'if_var_65', 'if_var_66', 'if_var_67', 'if_var_69', 'if_var_72', 'if_var_73', 'if_var_74', 'if_var_77', 'if_var_78', 'if_var_79', 'if_var_80']
output_var = 'ob_target'

X = df[final_variables]
y = df[output_var]
Xo = dfo[final_variables]

# Imputing NaN values OOT
imp = SimpleImputer(strategy='mean')
imp.fit(Xo)
Xo = imp.transform(Xo)

Final_Model= RandomForestClassifier(n_estimators=10000, oob_score=True, random_state=25 ,max_features='log2', bootstrap = 'True', min_samples_split = 2, min_samples_leaf = 1, criterion = 'entropy', n_jobs=5)
Final_Model.fit(X,y)
pred_dev = Final_Model.predict_proba(X)[:,1]
pred_oot = Final_Model.predict_proba(Xo)[:,1]

#### Model Evaluation

In [19]:
# CALCULATING GINI PERFORMANCE ON DEVELOPMENT SAMPLE
from sklearn.metrics import roc_auc_score
gini_score = 2*roc_auc_score(y, pred_dev)-1
print ("GINI DEVELOPMENT=", gini_score)

def KS(b,a):  
    """Function that received two parameters; first: a binary variable representing 0=good and 1=bad, 
    and then a second variable with the prediction of the first variable, the second variable can be continuous, 
    integer or binary - continuous is better. Finally, the function returns the KS Statistics of the two lists."""
    try:
        tot_bads=1.0*sum(b)
        tot_goods=1.0*(len(b)-tot_bads)
        elements = zip(*[a,b])
        elements = sorted(elements,key= lambda x: x[0])
        elements_df = pd.DataFrame({'probability': b,'gbi': a})
        pivot_elements_df = pd.pivot_table(elements_df, values='probability', index=['gbi'], aggfunc=[sum,len]).fillna(0)
        max_ks = perc_goods = perc_bads = cum_perc_bads = cum_perc_goods = 0
        for i in range(len(pivot_elements_df)):
            perc_goods =  (pivot_elements_df.iloc[i]['len'] - pivot_elements_df.iloc[i]['sum']) / tot_goods
            perc_bads = pivot_elements_df.iloc[i]['sum']/ tot_bads
            cum_perc_goods += perc_goods
            cum_perc_bads += perc_bads
            A = cum_perc_bads-cum_perc_goods
            if abs(A['probability']) > max_ks:
                max_ks = abs(A['probability'])
    except:
        max_ks = 0
    return max_ks


KS_score = KS(y,pred_dev)
print ("KS DEVELOPMENT=", KS_score) 

GINI DEVELOPMENT= 1.0
KS DEVELOPMENT= 0.9999999999999971


### Submit Predictions

In [20]:
# OOT Predictions
dfo['pred'] = pred_oot
dfo_tosend = dfo[list(['id','pred'])]

i=1
filename = "student_sub"+str(i)+".csv"
dfo_tosend.to_csv(filename, sep=',')

url = 'http://mfalonso.pythonanywhere.com/api/v1.0/uploadpredictions'

files = {'file': (filename, open(filename, 'rb'))}
rsub = requests.post(url, files=files, auth=HTTPBasicAuth('Diego.Cuartas', 'Cuartas3458!'))

resp_str = str(rsub.text)
print ("RESULT SUBMISSION: ", resp_str)

RESULT SUBMISSION:  KS2 = 0.425513405854; GINI = 0.560239; GRADE = 10.0
