In [1]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from joblib import dump, load
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, train_test_split, cross_val_predict

In [3]:
def compute_avg_classification_report(model, X_train, y_train, groups, n_splits=5):
    """Build a classification report from grouped cross-validation predictions.

    Despite the name, nothing is averaged across folds: the out-of-fold
    predictions of all folds are pooled and a single report is computed
    against ``y_train``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_predict``.
    X_train : array-like
        Feature matrix.
    y_train : array-like
        Target labels; also used as the ground truth of the report.
    groups : array-like
        Group label per sample, forwarded to ``GroupKFold`` so that samples
        sharing a group are never split between train and test folds.
    n_splits : int, default=5
        Number of folds used by ``GroupKFold``.

    Returns
    -------
    dict
        Result of ``classification_report(..., output_dict=True)``.
    """
    cv = GroupKFold(n_splits=n_splits)

    # Each sample is predicted once, by a model fitted on the folds that exclude it.
    y_pred = cross_val_predict(model, X_train, y_train, groups=groups, cv=cv, n_jobs=-1)

    return classification_report(y_train, y_pred, output_dict=True)

In [4]:
# Load the prepared dataset from a local pickle file and display it.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
df_data = pd.read_pickle('df_data.pkl')
df_data

Unnamed: 0,image_id,x,y,0,1,2,3,4,5,6,...,429,430,431,432,433,434,435,436,437,label
0,129D1,42,234,0.183240,0.183240,0.177040,0.181723,0.176845,0.182095,0.176937,...,0.390600,0.394394,0.398004,0.402821,0.183240,0.183240,0.183240,0.183240,0.0,1
1,129D1,33,156,0.183240,0.183240,0.179713,0.177953,0.174137,0.177956,0.171644,...,0.415097,0.417611,0.421339,0.430918,0.183240,0.183240,0.183240,0.183240,0.0,1
2,129D1,78,78,0.183240,0.183240,0.177950,0.170932,0.168615,0.170199,0.166566,...,0.401058,0.406532,0.407522,0.410457,0.419562,0.183240,0.183240,0.183240,0.0,1
3,129D1,42,147,0.183240,0.183240,0.179855,0.182989,0.178902,0.182606,0.177878,...,0.423789,0.430713,0.436174,0.441805,0.183240,0.183240,0.183240,0.183240,0.0,1
4,129D1,42,235,0.183240,0.183240,0.172990,0.176540,0.171127,0.176303,0.171870,...,0.377796,0.379615,0.383085,0.388312,0.183240,0.183240,0.183240,0.183240,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586594,05814,59,147,0.161424,0.161424,0.165725,0.164123,0.161058,0.160869,0.160224,...,0.358010,0.359653,0.364118,0.368421,0.161424,0.161424,0.161424,0.161424,0.0,0
586595,05814,52,148,0.161424,0.161424,0.172172,0.170590,0.168009,0.167722,0.166930,...,0.395572,0.396083,0.399904,0.405905,0.161424,0.161424,0.161424,0.161424,0.0,0
586596,05814,57,148,0.161424,0.161424,0.166005,0.164945,0.161472,0.161788,0.160700,...,0.364860,0.364178,0.365183,0.368164,0.161424,0.161424,0.161424,0.161424,0.0,0
586597,05814,57,149,0.161424,0.161424,0.161848,0.160963,0.157673,0.158131,0.156798,...,0.362013,0.361641,0.362803,0.366092,0.161424,0.161424,0.161424,0.161424,0.0,0


In [5]:
# The columns labelled 0..437 are the ones compared when looking for duplicates.
feature_columns = [*range(438)]

# keep=False flags every member of a duplicated set, not only the repeats.
duplicates = df_data.duplicated(keep=False, subset=feature_columns)
num_duplicates = duplicates.sum()

print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 46


In [6]:
# Drop every flagged row. The mask was built with keep=False, so all copies
# of a duplicated feature vector are removed, not just the extra ones.
rows_to_keep = ~duplicates
df_data = df_data[rows_to_keep]

# Repeat the check on the filtered frame to confirm nothing is left.
duplicates = df_data.duplicated(keep=False, subset=feature_columns)
num_duplicates = duplicates.sum()

print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


In [7]:
# Select the rows whose x or y coordinate is exactly zero.
zero_mask = df_data['x'].eq(0) | df_data['y'].eq(0)
zero_coordinates = df_data[zero_mask]
num_zero_coordinates = len(zero_coordinates)

print(f'Number of zero coordinates rows: {num_zero_coordinates}')

Number of zero coordinates rows: 0


In [8]:
# Count null cells per column, then add the per-column counts together.
nulls_per_column = df_data.isna().sum()
missing_count = nulls_per_column.sum()

print(f'Number of missing values: {missing_count}')

Number of missing values: 0


In [9]:
# Columns holding a single distinct (non-null) value carry no information.
distinct_counts = df_data.nunique()
constant_features = distinct_counts.index[distinct_counts == 1].tolist()

print(f'Number of constant features: {len(constant_features)}')

Number of constant features: 1


In [10]:
# Number of distinct image ids present in the data at this point.
num_unique_elements = df_data['image_id'].nunique()
print(f'Number of images: {num_unique_elements}')

Number of images: 337


In [11]:
# Image ids excluded from the rest of the analysis.
# NOTE(review): the reason for excluding these four images is not recorded here — document it.
remove = ['0AA7D', '040FF', '09E4C', '0A425']

is_removed = df_data['image_id'].isin(remove)
df_data = df_data[~is_removed]

In [12]:
# Recount the distinct image ids to confirm the effect of the exclusion filter.
num_unique_elements = df_data['image_id'].nunique()
print(f'Number of images: {num_unique_elements}')

Number of images: 333


In [13]:
# Skip the first three columns (image_id, x, y in the frame displayed earlier);
# the last column is the label and everything in between is a feature.
features = df_data.columns[3:-1]
label = df_data.columns[-1]

# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on all rows, so cross-validating on X_data
# lets each fold see min/max values that include its held-out images.
scaler = MinMaxScaler()
X_data = scaler.fit_transform(df_data[features])

y_data = df_data[label].values

# image_id is the grouping key handed to the grouped cross-validation.
groups = df_data["image_id"].values

# Fixed seed so the shuffled row order is identical on every run.
X_data, y_data, groups = shuffle(X_data, y_data, groups, random_state=42)

In [14]:
# minlength=2 guarantees that both slots exist, so the lookups below cannot
# raise IndexError when a class is absent from y_data (or y_data is empty).
counts = np.bincount(y_data, minlength=2)
num_zeros = counts[0]
num_ones = counts[1]

print(f"Number of zeros: {num_zeros}")
print(f"Number of ones: {num_ones}")

Number of zeros: 572522
Number of ones: 318877


In [15]:
# Fit a logistic regression on the full scaled and shuffled dataset.
# NOTE(review): n_jobs may have no effect for a two-class problem with this solver — confirm against the installed scikit-learn version.
lr = LogisticRegression(solver='newton-cg', n_jobs=-1)

lr.fit(X_data, y_data)

In [16]:
# Persist the fitted model together with the fitted scaler; new data has to go
# through the same scaler before it is passed to the model.
dump(lr, 'logistic_regression_model.joblib')
dump(scaler, 'min_max_scaler.joblib')

['min_max_scaler.joblib']

In [None]:
# Grouped 5-fold cross-validation with image_id as the group key.
# NOTE(review): X_data was scaled with a scaler fitted on all rows, so the folds are not fully independent of the held-out images.
report = compute_avg_classification_report(lr, X_data, y_data, groups)

In [14]:
# NOTE(review): the recorded output below reports a total support of 909453, while the class counts
# printed earlier sum to 891399 — this looks like a stale result from another run; re-execute before relying on it.
report

{'0': {'precision': 0.9351687901419777,
  'recall': 0.8841803960612095,
  'f1-score': 0.9089601023510958,
  'support': 586576},
 '1': {'precision': 0.8085521291555238,
  'recall': 0.8886418047739542,
  'f1-score': 0.846707272448269,
  'support': 322877},
 'accuracy': 0.8857643000792784,
 'macro avg': {'precision': 0.8718604596487507,
  'recall': 0.8864111004175819,
  'f1-score': 0.8778336873996824,
  'support': 909453},
 'weighted avg': {'precision': 0.8902169260551879,
  'recall': 0.8857643000792784,
  'f1-score': 0.886858897604358,
  'support': 909453}}