In [50]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from supervised.automl import AutoML
from sklearn import datasets
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Load the raw Titanic training data.
df = pd.read_csv('./data/train.csv')

# Optional first-pass exploratory analysis (uncomment to regenerate the report).
# profile = ProfileReport(df, title='Titanic raw data analysis', explorative=True)
# profile.to_widgets() # gross overview
# profile.to_file('./analysis/pandas-analysis.html')

# Based on that analysis:
# 1. PassengerId and Name are dropped because they contain only unique
#    values; Cabin is dropped because it brings low significance to the data.
#    `columns=` states the intent directly (the previous `axis=True` only
#    worked because True == 1).
df = df.drop(columns=['PassengerId', 'Name', 'Cabin'])

# 2. Sex is encoded as female -> 0, male -> 1 using exact matches.
#    A substring test would encode any string that merely contains "male"
#    (e.g. "Female") as 1 and would raise on missing values; with an exact
#    mapping, unexpected values become NaN and surface at the later int cast.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

# The analysis found duplicate rows; remove them.
df = df.drop_duplicates()

# We re-perform an explorative data analysis
# profile = ProfileReport(df, title='Titanic raw data analysis', explorative=True)
# # profile.to_widgets() # gross overview
# profile.to_file('./analysis/pandas-analysis-2.html')

# Per the analysis, Fare correlates most strongly with survival but has high
# cardinality, so each fare is replaced by a coarse price-band label.
fare_bins = [0, 10, 20, 50]
fare_names = ['<10', '10-20', '20-50', '50+']
# Bin numbers are looked up starting from 1, so the labels are keyed 1..4.
fare_d = {bin_no: label for bin_no, label in enumerate(fare_names, start=1)}
fare_bin_numbers = np.digitize(df['Fare'], fare_bins)
fare_label_lookup = np.vectorize(fare_d.get)
df['Fare'] = fare_label_lookup(fare_bin_numbers)

# Fill missing ages with the mean of the observed ages.
# The result is assigned back rather than calling fillna(inplace=True) on
# df['Age']: the in-place form mutates an intermediate object, which pandas
# deprecates and which leaves df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill missing Embarked values with the fixed placeholder 'Q'; the choice is
# arbitrary because the column is not considered significant.
df['Embarked'] = df['Embarked'].fillna('Q')

# Replace each age by a decade-wide band label to find a better significance.
age_bins = [0, 10, 20, 30, 40, 50]
age_names = ['<10', '10-20', '20-30', '30-40', '40-50', '50+']
# Bin numbers are looked up starting from 1, so the labels are keyed 1..6.
age_d = {bin_no: label for bin_no, label in enumerate(age_names, start=1)}
age_bin_numbers = np.digitize(df['Age'], age_bins)
age_label_lookup = np.vectorize(age_d.get)
df['Age'] = age_label_lookup(age_bin_numbers)



# Cast the numeric feature columns to integers in a single pass.
df = df.astype({'Pclass': int, 'Sex': int, 'SibSp': int, 'Parch': int})

# Renumber the rows 0..n-1 and discard the previous index.
df = df.reset_index(drop=True)

# We re-perform an explorative data analysis
# profile = ProfileReport(df, title='Titanic raw data analysis 3', explorative=True)
# profile.to_widgets() # gross overview
# profile.to_file('./analysis/pandas-analysis-3.html')

# Start training
# Features fed to the model. Survived is the target; SibSp, Parch and
# Embarked are left out.
feature_columns = ['Pclass', 'Sex', 'Age', 'Ticket', 'Fare']

# Select the features by name so the column labels can never drift out of
# step with the data (previously the labels were hard-coded separately from
# a positional drop). Going through to_numpy() on the mixed-type frame keeps
# every column as object dtype, matching the earlier `.values` behaviour.
X = pd.DataFrame(df[feature_columns].to_numpy(), columns=feature_columns)
y = pd.Series(df['Survived'].to_numpy(), name='Survived')

# Hold out 10% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)
print(X)

automl = AutoML(
    algorithms=[
        'Decision Tree',
        'Linear',
        'Random Forest',
        'Extra Trees',
        'CatBoost',
        'Neural Network',
        'Nearest Neighbors',
    ],
    total_time_limit=5 * 60,  # seconds
)
automl.fit(X_train, y_train)

y_predicted = automl.predict(X_test)
# Older mljar-supervised releases return a DataFrame with a 'label' column,
# newer ones return the labels directly; accept both shapes.
if isinstance(y_predicted, pd.DataFrame):
    predicted_labels = y_predicted['label']
else:
    predicted_labels = y_predicted
# Use one int-typed label array for every metric so they are computed on
# exactly the same values.
predicted_labels = np.asarray(predicted_labels).astype(int)
score = accuracy_score(y_test, predicted_labels)

# 'Score' is a scalar, so pandas repeats the overall accuracy on every row.
print(pd.DataFrame({'Predicted': predicted_labels, 'Survived': np.array(y_test), 'Score': score}))
# NOTE: for 0/1 class labels the MSE is simply the misclassification rate
# (1 - accuracy); it is reported here only for reference.
print("Test MSE:", mean_squared_error(y_test, predicted_labels))

    Pclass Sex    Age            Ticket   Fare
0        3   1  20-30         A/5 21171    <10
1        1   0  30-40          PC 17599    50+
2        3   0  20-30  STON/O2. 3101282    <10
3        1   0  30-40            113803    50+
4        3   1  30-40            373450    <10
..     ...  ..    ...               ...    ...
871      2   1  20-30            211536  10-20
872      1   0  10-20            112053  20-50
873      3   0  20-30        W./C. 6607  20-50
874      1   1  20-30            111369  20-50
875      3   1  30-40            370376    <10

[876 rows x 5 columns]
Create directory AutoML_18
AutoML task to be solved: binary_classification
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will optimize for metric: logloss


Passing parameters norm and vmin/vmax simultaneously is deprecated since 3.3 and will become an error two minor releases later. Please pass vmin/vmax directly to the norm when creating it.


1_DecisionTree final logloss 0.4434609622032429 time 5.18 seconds


Passing parameters norm and vmin/vmax simultaneously is deprecated since 3.3 and will become an error two minor releases later. Please pass vmin/vmax directly to the norm when creating it.


2_Linear final logloss 0.394748027609029 time 2.33 seconds


Passing parameters norm and vmin/vmax simultaneously is deprecated since 3.3 and will become an error two minor releases later. Please pass vmin/vmax directly to the norm when creating it.


3_Default_RandomForest final logloss 0.3920837335816453 time 2.91 seconds


Passing parameters norm and vmin/vmax simultaneously is deprecated since 3.3 and will become an error two minor releases later. Please pass vmin/vmax directly to the norm when creating it.


4_Default_ExtraTrees final logloss 0.4045916859071745 time 3.67 seconds
5_Default_CatBoost final logloss 0.3552850790389562 time 0.23 seconds
6_Default_NeuralNetwork final logloss 0.3919646383906114 time 1.78 seconds
Exception while producing SHAP explanations. 'NoneType' object has no attribute 'shap_values'
Continuing ...
7_Default_NearestNeighbors final logloss 0.7624444971539572 time 0.62 seconds
Ensemble final logloss 0.35528430783258796 time 0.53 seconds
    Predicted  Survived     Score
0           1         1  0.795455
1           0         0  0.795455
2           0         0  0.795455
3           1         1  0.795455
4           0         0  0.795455
..        ...       ...       ...
83          1         1  0.795455
84          0         0  0.795455
85          0         1  0.795455
86          0         1  0.795455
87          1         1  0.795455

[88 rows x 3 columns]
Test MSE: 0.20454545454545456
