In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

<br>

### Data preprocessing

In [163]:
# Read the CSV without a header row, so columns are labelled 0, 1, 2, ...
# NOTE(review): the doubled ".csv.csv" extension looks accidental — confirm the file name on disk.
raw_data = pd.read_csv("./raw_data.csv.csv", header=None)

# Column 60 is used as the label; every other column is a candidate feature.
label_column = 60
y = raw_data[[label_column]]
X = raw_data.drop(columns=[label_column])

# Keep only the chosen subset of feature columns.
selected_features = [
    0, 2, 3, 6, 7, 8, 10, 11, 22, 23, 30, 35,
    36, 38, 39, 44, 45, 47, 48, 49, 50, 51, 53,
]
X = X[selected_features]

# Hold out 10% of the rows as a test set, stratified on the label,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Standardise the training features; the scaler learns its statistics from
# the training split only and is reused later for the test split.
scalar = StandardScaler()
scalar.fit(X_train)
X_train = scalar.transform(X_train)

# Collapse the single-column label frame into a flat 1-D array.
y_train = y_train.values.ravel()

In [164]:
# (rows, columns) of the full feature matrix after column selection;
# X still contains both the training and the test rows.
X.shape

(208, 23)

<br>

### Training model

In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score

In [180]:
# Create the (still unfitted) logistic-regression classifier.
# C is scikit-learn's inverse regularisation strength: smaller values regularise more.
logistic_regression = LogisticRegression(C=0.2)

In [181]:
# Fit the classifier on the standardised training features and the
# flattened training labels produced by the preprocessing cell.
logistic_regression.fit(X_train, y_train)

In [182]:
# Estimate accuracy with 5-fold cross-validation on the training split.
#
# The scaler runs inside a pipeline so that every fold is standardised with
# statistics from its own training part only. Scaling with statistics computed
# on all of X_train lets the validation part of each fold influence the
# preprocessing, which leaks information into the score.
#
# X_train was already standardised earlier; standardising it again per fold
# gives the same values as standardising the raw fold data, because
# standardisation is unaffected by an earlier per-feature shift and rescale.
from sklearn.pipeline import make_pipeline

# cross_val_score clones the pipeline for each fold, so the already fitted
# `logistic_regression` object is not modified here.
cv_pipeline = make_pipeline(StandardScaler(), logistic_regression)
scores = cross_val_score(cv_pipeline, X_train, y_train, scoring='accuracy', cv=5)
scores.mean()

0.8011379800853484

In [183]:
# Predict labels for the same samples the model was fitted on.
# `predictions` is reassigned further down with the test-set predictions,
# so the training-accuracy cell must run before that happens.
predictions = logistic_regression.predict(X_train)

In [184]:
# Accuracy on the training split. These are samples the model has already
# seen, so this is not an estimate of performance on new data.
accuracy_score(y_train, predictions)

0.8449197860962567

<br>

### Checking performance on test set

In [185]:
# Standardise the held-out features with the scaler fitted on the training split.
#
# With scikit-learn's default output configuration `scalar.transform` returns a
# NumPy array, so X_test is a DataFrame only while it is still unscaled. The
# guard makes this cell safe to re-run: without it a second run would scale the
# already scaled data again and silently corrupt every test metric.
if isinstance(X_test, pd.DataFrame):
    X_test = scalar.transform(X_test)

# Flatten the labels to a 1-D array (running this again changes nothing).
y_test = np.ravel(y_test)

In [187]:
# Predict labels for the held-out test features. X_test must already have been
# passed through `scalar.transform`. This overwrites the training-set
# predictions previously stored in `predictions`.
predictions = logistic_regression.predict(X_test)

In [188]:
# Sanity check: one predicted label per test sample.
predictions.shape

(21,)

In [190]:
# Accuracy on the held-out test split: share of test samples whose predicted
# label equals the true label.
accuracy_score(y_test, predictions)

0.8095238095238095

In [191]:
# Recall with 'M' as the positive class: of the test samples whose true label
# is 'M', the share that the model also predicted as 'M'.
recall_score(y_test, predictions, pos_label='M')

0.9090909090909091

In [192]:
# Precision with 'M' as the positive class: of the test samples predicted as
# 'M', the share whose true label is 'M'.
precision_score(y_test, predictions, pos_label='M')

0.7692307692307693

<br>

### Saving and loading model

In [194]:
import joblib

In [195]:
# Persist the fitted classifier as 'logistic_regression.pkl' (path relative to
# the working directory).
# Only the classifier is written here; the fitted `scalar` is not saved, so
# inputs for a reloaded model still have to be standardised the same way.
joblib.dump(logistic_regression, "logistic_regression.pkl")

['logistic_regression.pkl']

In [196]:
# Load the classifier back from the file written above.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
loaded_model = joblib.load("logistic_regression.pkl")

In [197]:
# Round-trip check: the reloaded model should reach the same test accuracy as
# the in-memory model it was saved from.
accuracy_score(y_test, loaded_model.predict(X_test))

0.8095238095238095