## Titanic Survival Prediction Competition
Let's share our models to a centralized leaderboard, so that we can collaborate and learn from the model experimentation process...

**Instructions:**
1.   Get data in and set up X_train / X_test / y_train
2.   Preprocess data using Sklearn Column Transformer / Write and Save Preprocessor function
3. Fit model on preprocessed data and save preprocessor function and model 
4. Generate predictions from X_test data and submit model to competition
5. Repeat submission process to improve place on leaderboard



## 1. Get data in and set up X_train, X_test, y_train objects

In [1]:
#install aimodelshare library
#! pip install aimodelshare --upgrade

In [2]:
# Get competition data
from aimodelshare import download_data
#download_data('public.ecr.aws/x8e4b0t0/titanic_competition_data-repository:latest') 

In [3]:
# Separate data into X_train, y_train, and X_test
import pandas as pd
full_training_data=pd.read_csv("titanic_competition_data/training_data.csv")

X_train=full_training_data.iloc[:,full_training_data.columns!='survived']
X_test=pd.read_csv("titanic_competition_data/test_data.csv")
y_train=full_training_data['survived']

X_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,pclass,sex,age,fare,embarked
0,1117,1117,3,male,21.0,7.925,S
1,26,26,1,male,25.0,91.0792,C
2,1129,1129,3,male,25.0,7.775,S
3,1103,1103,3,male,2.0,39.6875,S
4,946,946,3,male,,56.4958,S


In [14]:
# import numpy as np

# data = pd.concat([X_train, X_test], sort=False)

# data['sex'].replace(['male','female'], [0, 1], inplace=True)

# data['embarked'].fillna(('S'), inplace=True)
# data['embarked'] = data['embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

# data['fare'].fillna(np.mean(data['fare']), inplace=True)

# age_avg = data['age'].mean()
# age_std = data['age'].std()

# data['age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

# X_train = data[:len(X_train)]
# X_test = data[len(X_train):]
# X_test.head()

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

##2.   Preprocess data using Sklearn Column Transformer / Write and Save Preprocessor function


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#Preprocess data using sklearn's Column Transformer approach

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['embarked', 'sex','age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), #'imputer' names the step
    ('scaler', StandardScaler())])

categorical_features = [ 'pclass']

# Replacing missing values with Modal value and then one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Final preprocessor object set up with ColumnTransformer...

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# fit preprocessor to your data
preprocess = preprocess.fit(X_train)

In [16]:
# Write function to transform data with preprocessor 
# In this case we use sklearn's Column transformer in our preprocessor function

def preprocessor(data):
    preprocessed_data=preprocess.transform(data)
    return preprocessed_data

##3. Fit model on preprocessed data and save preprocessor function and model 


In [17]:
from sklearn.linear_model import LogisticRegression

# abel
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
from sklearn.tree import DecisionTreeClassifier
# model = DecisionTreeClassifier()



# model = LogisticRegression(C=10, penalty='l1', solver = 'liblinear')
model.fit(preprocessor(X_train), y_train) # Fitting to the training set.
model.score(preprocessor(X_train), y_train) # Fit score, 0-1 scale.

# logisticRegress : 0.7822349570200573
# RandomForestClassifier 0.9694364851957975

0.8376313276026743

#### Save preprocessor function to local "preprocessor.zip" file

In [9]:
import aimodelshare as ai
ai.export_preprocessor(preprocessor,"") 

Your preprocessor is now saved to 'preprocessor.zip'


#### Save model to local ".onnx" file

In [10]:
# Save sklearn model to local ONNX file
from aimodelshare.aimsonnx import model_to_onnx

# Check how many preprocessed input features are there?
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 10]))]  # You need to insert correct number of features in preprocessed data

onnx_model = model_to_onnx(model, framework='sklearn',
                          initial_types=initial_type,
                          transfer_learning=False,
                          deep_learning=False)

with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

## 4. Generate predictions from X_test data and submit model to competition


In [11]:
#Set credentials using modelshare.org username/password

apiurl='https://r2okzbjyhh.execute-api.us-east-1.amazonaws.com/prod/m'

ai.set_credentials(apiurl=apiurl)


AI Modelshare Username:········
AI Modelshare Password:········
Credential confirmation unsuccessful. Check username & password and try again.


In [12]:
#Instantiate Competition

mycompetition= ai.Competition(apiurl)

In [13]:
#Submit Model 1: 

#-- Generate predicted values (a list of predicted labels "survived" or "died") (Model 1)
prediction_labels = model.predict(preprocessor(X_test))
print('prediction_labels=',prediction_labels)
print(len(prediction_labels))
# Submit Model 1 to Competition Leaderboard

# mycompetition.submit_model(model_filepath = "model.onnx",
#                                  preprocessor_filepath="preprocessor.zip",
#                                  prediction_submission=prediction_labels)

prediction_labels= ['died' 'died' 'died' 'survived' 'died' 'survived' 'died' 'survived'
 'died' 'died' 'died' 'died' 'survived' 'died' 'survived' 'died' 'died'
 'died' 'died' 'survived' 'survived' 'died' 'died' 'survived' 'survived'
 'died' 'died' 'died' 'survived' 'survived' 'died' 'died' 'survived'
 'died' 'died' 'died' 'died' 'survived' 'survived' 'survived' 'survived'
 'survived' 'died' 'died' 'survived' 'died' 'died' 'survived' 'died'
 'died' 'died' 'died' 'survived' 'died' 'died' 'survived' 'survived'
 'survived' 'died' 'died' 'died' 'died' 'died' 'died' 'died' 'died'
 'survived' 'died' 'survived' 'died' 'died' 'died' 'died' 'died'
 'survived' 'died' 'survived' 'survived' 'died' 'died' 'died' 'died'
 'died' 'died' 'survived' 'died' 'died' 'died' 'died' 'died' 'survived'
 'survived' 'died' 'died' 'died' 'died' 'died' 'died' 'survived'
 'survived' 'survived' 'died' 'died' 'died' 'survived' 'survived'
 'survived' 'survived' 'died' 'survived' 'survived' 'died' 'died' 'died'
 'died' '

In [None]:
# Get leaderboard to explore current best model architectures

# Get raw data in pandas data frame
data = mycompetition.get_leaderboard()

# Stylize leaderboard data
mycompetition.stylize_leaderboard(data)

## 5. Repeat submission process to improve place on leaderboard


In [22]:
# Train and submit model 2 using same preprocessor (note that you could save a new preprocessor, but we will use the same one for this example).
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=.01, penalty='l1', solver = 'liblinear')
model.fit(preprocessor(X_train), y_train) # Fitting to the training set.
model.score(preprocessor(X_train), y_train) # Fit score, 0-1 scale.

0.7182425978987583

In [23]:
# Save sklearn model to local ONNX file
from aimodelshare.aimsonnx import model_to_onnx

# Check how many preprocessed input features are there?
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 10]))]  # You need to insert correct number of features in preprocesed data

onnx_model = model_to_onnx(model, framework='sklearn',
                          initial_types=initial_type,
                          transfer_learning=False,
                          deep_learning=False)

with open("model2.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [24]:
#Submit Model 2: 

#-- Generate predicted values (a list of predicted labels "survived" or "died") (Model 1)
prediction_labels = model.predict(preprocessor(X_test))

# Submit Model 1 to Competition Leaderboard
mycompetition.submit_model(model_filepath = "model2.onnx",
                                 preprocessor_filepath="preprocessor.zip",
                                 prediction_submission=prediction_labels)

timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
100% [........................................................] 163894 / 163894timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 

Your model has been submitted as model version 365

To submit code used to create this model or to view current leade

In [25]:
# Compare differences between models
# (Experimental, Git-like Diffs for Model Architectures)
mycompetition.compare_models([1,2])

timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 


Unnamed: 0,param_name,model_default,Model_1,Model_2
0,C,1.0,10,10
1,class_weight,,,
2,dual,False,False,False
3,fit_intercept,True,True,True
4,intercept_scaling,1,1,1
5,l1_ratio,,,
6,max_iter,100,100,100
7,multi_class,auto,auto,auto
8,n_jobs,,,
9,penalty,l2,l1,l1


In [26]:
# Submit a third model using GridSearchCV

from sklearn.model_selection import GridSearchCV
import numpy as np

param_grid = {'C': np.arange(.1, 10, .1),'penalty':['l2']} #np.arange creates sequence of numbers for each k value

gridmodel = GridSearchCV(LogisticRegression(solver ='newton-cg'), param_grid=param_grid, cv=10)

#use meta model methods to fit score and predict model:
gridmodel.fit(preprocessor(X_train), y_train)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(gridmodel.best_score_))
print("best parameters: {}".format(gridmodel.best_params_))


best mean cross-validation score: 0.782
best parameters: {'C': 0.2, 'penalty': 'l2'}


In [27]:
# Save sklearn model to local ONNX file
from aimodelshare.aimsonnx import model_to_onnx

# Check how many preprocessed input features are there?
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 10]))]  # You need to insert correct number of features in preprocesed data

onnx_model = model_to_onnx(gridmodel, framework='sklearn',
                          initial_types=initial_type,
                          transfer_learning=False,
                          deep_learning=False)

with open("gridmodel.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [28]:
#Submit Model 3: 

#-- Generate predicted values (a list of predicted labels "survived" or "died")
prediction_labels = gridmodel.predict(preprocessor(X_test))

# Submit Model 1 to Competition Leaderboard
mycompetition.submit_model(model_filepath = "gridmodel.onnx",
                                 preprocessor_filepath="preprocessor.zip",
                                 prediction_submission=prediction_labels)

timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
100% [........................................................] 164618 / 164618timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 
timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 

Your model has been submitted as model version 366

To submit code used to create this model or to view current leade

In [29]:
# Get leaderboard

data = mycompetition.get_leaderboard()
mycompetition.stylize_leaderboard(data)

timeout= None ####################
timeout= 50 abel abel abel abel abel abel abel abel abel abel 


TypeError: 'float' object is not subscriptable