In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
df_data = pd.read_pickle('df_data.pkl')
df_data

Unnamed: 0,image_id,x,y,0,1,2,3,4,5,6,...,429,430,431,432,433,434,435,436,437,label
0,129D1,42,234,0.183240,0.183240,0.177040,0.181723,0.176845,0.182095,0.176937,...,0.390600,0.394394,0.398004,0.402821,0.183240,0.183240,0.183240,0.183240,0.0,1
1,129D1,33,156,0.183240,0.183240,0.179713,0.177953,0.174137,0.177956,0.171644,...,0.415097,0.417611,0.421339,0.430918,0.183240,0.183240,0.183240,0.183240,0.0,1
2,129D1,78,78,0.183240,0.183240,0.177950,0.170932,0.168615,0.170199,0.166566,...,0.401058,0.406532,0.407522,0.410457,0.419562,0.183240,0.183240,0.183240,0.0,1
3,129D1,42,147,0.183240,0.183240,0.179855,0.182989,0.178902,0.182606,0.177878,...,0.423789,0.430713,0.436174,0.441805,0.183240,0.183240,0.183240,0.183240,0.0,1
4,129D1,42,235,0.183240,0.183240,0.172990,0.176540,0.171127,0.176303,0.171870,...,0.377796,0.379615,0.383085,0.388312,0.183240,0.183240,0.183240,0.183240,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586594,05814,59,147,0.161424,0.161424,0.165725,0.164123,0.161058,0.160869,0.160224,...,0.358010,0.359653,0.364118,0.368421,0.161424,0.161424,0.161424,0.161424,0.0,0
586595,05814,52,148,0.161424,0.161424,0.172172,0.170590,0.168009,0.167722,0.166930,...,0.395572,0.396083,0.399904,0.405905,0.161424,0.161424,0.161424,0.161424,0.0,0
586596,05814,57,148,0.161424,0.161424,0.166005,0.164945,0.161472,0.161788,0.160700,...,0.364860,0.364178,0.365183,0.368164,0.161424,0.161424,0.161424,0.161424,0.0,0
586597,05814,57,149,0.161424,0.161424,0.161848,0.160963,0.157673,0.158131,0.156798,...,0.362013,0.361641,0.362803,0.366092,0.161424,0.161424,0.161424,0.161424,0.0,0


In [4]:
feature_columns = list(range(0, 438))
duplicates = df_data.duplicated(subset=feature_columns, keep=False)

num_duplicates = duplicates.sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 46


In [5]:
df_data = df_data[~duplicates]

duplicates = df_data.duplicated(subset=feature_columns, keep=False)

num_duplicates = duplicates.sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


In [6]:
zero_coordinates = df_data[(df_data['x'] == 0) | (df_data['y'] == 0)]

num_zero_coordinates = len(zero_coordinates)

print(f'Number of zero coordinates rows: {num_zero_coordinates}')

Number of zero coordinates rows: 0


In [7]:
missing_count = df_data.isnull().sum().sum()

print(f'Number of missing values: {missing_count}')

Number of missing values: 0


In [8]:
constant_features = [col for col in df_data.columns if df_data[col].nunique() == 1]

print(f'Number of constant features: {len(constant_features)}')

Number of constant features: 1


In [9]:
num_unique_elements = df_data['image_id'].nunique()
print(f'Number of images: {num_unique_elements}')

Number of images: 337


In [46]:
# Extract unique image_ids
image_ids = df_data['image_id'].unique()

# Split the unique image_ids into training and testing sets
train_image_ids, test_image_ids = train_test_split(image_ids, test_size=0.2)

# Find the intersection of train_image_ids and test_image_ids
intersection = np.intersect1d(train_image_ids, test_image_ids)

# Check if there is any intersection between training set and test set
if intersection.size > 0:
    print("There is an intersection between training set and test set.")
else:
    print("There is no intersection between training set and test set.")

There is no intersection between training set and test set.


In [47]:
# Split the dataset using image_id

# Create the training dataset (train_df) by filtering rows with image_id that is in the train_image_ids array.
train_df = df_data[df_data['image_id'].isin(train_image_ids)]

# Create the testing dataset (test_df) by filtering rows with image_id that is in the test_image_ids array.
test_df = df_data[df_data['image_id'].isin(test_image_ids)]

In [56]:
test_df.to_pickle("test_df.pkl")

In [49]:
# The last column is the label and first three columns are 'image_id', 'x', 'y'
features = train_df.columns[3:-1]
label = train_df.columns[-1]

scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_df[features])
X_test = scaler.transform(test_df[features])

y_train = train_df[label].values
y_test = test_df[label].values

X_train, y_train = shuffle(X_train, y_train)

rus = RandomUnderSampler(random_state=42)  
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [44]:
print(Counter(y_train))
print(Counter(y_resampled))

Counter({0: 467609, 1: 257236})
Counter({0: 257236, 1: 257236})


In [50]:
lr = LogisticRegression(solver='newton-cg', n_jobs=-1)
lr.fit(X_resampled, y_resampled)

predictions = lr.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Confusion Matrix:
 [[91303 12373]
 [ 5028 60556]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.88      0.91    103676
           1       0.83      0.92      0.87     65584

    accuracy                           0.90    169260
   macro avg       0.89      0.90      0.89    169260
weighted avg       0.90      0.90      0.90    169260



In [51]:
lr = LogisticRegression(solver='newton-cg', n_jobs=-1)
lr.fit(X_train, y_train)

predictions = lr.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Confusion Matrix:
 [[95852  7824]
 [ 7113 58471]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.93    103676
           1       0.88      0.89      0.89     65584

    accuracy                           0.91    169260
   macro avg       0.91      0.91      0.91    169260
weighted avg       0.91      0.91      0.91    169260



In [52]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_resampled, y_resampled)

predictions = rf.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Confusion Matrix:
 [[36112 67564]
 [ 1603 63981]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.35      0.51    103676
           1       0.49      0.98      0.65     65584

    accuracy                           0.59    169260
   macro avg       0.72      0.66      0.58    169260
weighted avg       0.77      0.59      0.56    169260



In [53]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Confusion Matrix:
 [[37567 66109]
 [ 2087 63497]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.36      0.52    103676
           1       0.49      0.97      0.65     65584

    accuracy                           0.60    169260
   macro avg       0.72      0.67      0.59    169260
weighted avg       0.77      0.60      0.57    169260



In [54]:
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_resampled, y_resampled)

predictions = knn.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Confusion Matrix:
 [[53713 49963]
 [ 6944 58640]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.52      0.65    103676
           1       0.54      0.89      0.67     65584

    accuracy                           0.66    169260
   macro avg       0.71      0.71      0.66    169260
weighted avg       0.75      0.66      0.66    169260



In [55]:
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)

predictions = knn.predict(X_test)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Confusion Matrix:
 [[57956 45720]
 [ 8692 56892]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.56      0.68    103676
           1       0.55      0.87      0.68     65584

    accuracy                           0.68    169260
   macro avg       0.71      0.71      0.68    169260
weighted avg       0.75      0.68      0.68    169260

