## 5. Training models

- Logistic regression (perceptron)
- Naive Bayes
- Support vector machine (SVM)

In [None]:
# Feature matrix: every column of the training frame except the label "is_canceled"
X = df_train.drop(columns=["is_canceled"])

In [None]:
# Label vector: the "is_canceled" column on its own
y = df_train.loc[:, "is_canceled"]

In [None]:
# Split the dataset into train / validation / test = 60% / 20% / 20%
from sklearn.model_selection import train_test_split

# Step 1: keep 60% of the rows for training and hold out the other 40%.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=1) # 60/40

# Step 2: halve the hold-out part, giving 20% validation and 20% test overall.
# (Halving the training part instead would yield 30/30/40, contradicting the
# 60/20/20 layout stated in the comments.)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1) # 20/20

In [None]:
**Import algorithms for classification**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the L-BFGS solver.
# NOTE(review): the large iteration cap is presumably there so the solver can
# converge on these features - confirm.
lr_model = LogisticRegression(max_iter=10000, solver="lbfgs")
lr_model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with the library's default settings, fitted on the training split
nb_model = GaussianNB()
nb_model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with the library's default hyper-parameters,
# fitted on the training split
svm_model = SVC()
svm_model.fit(X=X_train, y=y_train)

## 6. Evaluating models

**Metrics:**

- Confusion matrix - `confusion_matrix()`
- Classification report - `classification_report()`
- Precision - `precision_score()`
- F1-score - `f1_score()`

In [None]:
# Accuracy of each fitted classifier on the validation split
# (`.score` on a scikit-learn classifier returns the mean accuracy)
print("Scores of the models")
for label, clf in (
    ("Logistic regression:", lr_model),
    ("Naive Bayes:", nb_model),
    ("SVM:", svm_model),
):
    print(label, clf.score(X_val, y_val))

In [None]:
# Predict labels for the validation split with each fitted model
lr_y_pred = lr_model.predict(X=X_val)
nb_y_pred = nb_model.predict(X=X_val)
svm_y_pred = svm_model.predict(X=X_val)

In [None]:
from sklearn.metrics import f1_score

# F1 on the validation split, computed from the validation predictions.
# f1_score is called with its defaults - assumes is_canceled is a binary 0/1
# label; confirm.
print("F1-scores of the models:")
for label, preds in (
    ("Logistic regression:", lr_y_pred),
    ("Naive Bayes:", nb_y_pred),
    ("Support Vector Machine:", svm_y_pred),
):
    print(label, f1_score(y_val, preds))

In [None]:
from sklearn.metrics import precision_score

# Precision on the validation split, computed from the validation predictions.
# precision_score is called with its defaults - assumes is_canceled is a binary
# 0/1 label; confirm.
print("precision of the models:")
for label, preds in (
    ("Logistic regression:", lr_y_pred),
    ("Naive Bayes:", nb_y_pred),
    ("Support Vector Machine:", svm_y_pred),
):
    print(label, precision_score(y_val, preds))

In [None]:
## 7. Testing the model

In [None]:
# Confusion matrices of the Naive Bayes model on the training and the test data

from sklearn import metrics

# One row, two panels with shared axes; squeeze=False keeps `a` two-dimensional,
# so the panels are addressed as a[0][0] and a[0][1].
f, a = plt.subplots(1, 2, sharex=True, sharey=True, squeeze=False)

# Score the model that was already fitted on the training split. Calling
# nb_model.fit(X_test, y_test) here would retrain the classifier on the test
# set: the "test" matrix would then describe data the model has seen, and every
# later use of nb_model would silently rely on that leaked fit.
predict_train = nb_model.predict(X_train)
predict_test = nb_model.predict(X_test)

# scikit-learn's confusion_matrix puts true labels on rows and predicted labels
# on columns, hence the axis labels below.
plot_0 = sns.heatmap(metrics.confusion_matrix(y_train, predict_train),
                     annot=True, fmt='.5g', cmap="YlGn", ax=a[0][0])
a[0][0].set_title('Training Data')
a[0][0].set_xlabel('Predicted label')
a[0][0].set_ylabel('True label')

plot_1 = sns.heatmap(metrics.confusion_matrix(y_test, predict_test),
                     annot=True, fmt='.5g', cmap="YlGn", ax=a[0][1])
a[0][1].set_title('Test Data')
a[0][1].set_xlabel('Predicted label');

In [None]:
# Per-class precision / recall / F1 summary for the test-set predictions
# (`predict_test` is produced by the confusion-matrix cell)

from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=predict_test))

In [None]:
**Use the model**

In [None]:
# Take a single row of the test features (positional row 10) and display it
# as a {column: value} dictionary
hotel = X_test.iloc[10, :]
hotel_dict = hotel.to_dict()
hotel_dict

In [None]:
# Probability of the second class in nb_model.classes_ for the selected row
# (presumably is_canceled == 1 - confirm against nb_model.classes_).
# The row is passed as a one-row DataFrame rather than a list holding a Series,
# so the column names reach the model; the list form makes scikit-learn warn
# that X has no valid feature names.
nb_model.predict_proba(hotel.to_frame().T)[0, 1]

In [None]:
# True label at the same positional row (10) of the test target, for comparison
# with the predicted probability
y_test.iat[10]