In [1]:
import random
from pathlib import Path

import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import LinearSVC


In [2]:
# `font.names` has no header row; its first column lists the per-font CSV file names.
df_font_names = pd.read_csv("../../Datasets/CharacterFontImages/font.names", header=None)

# Read every listed file into its own frame. A comprehension keeps the name `df`
# reserved for the combined frame instead of rebinding it on each iteration.
frames: list[pd.DataFrame] = [
    pd.read_csv(f"../../Datasets/CharacterFontImages/{font_name}")
    for font_name in df_font_names[0]
]

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it each
# file's own 0..k labels are kept, so label-based lookups (`.loc`) would match
# several rows at once.
df = pd.concat(frames, ignore_index=True)


In [3]:
# One encoder instance, re-fitted from scratch on each object-dtype column in turn.
ordinal_encoder = OrdinalEncoder()

# Collect the object-dtype columns first, then replace each with its ordinal codes.
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in object_columns:
    # The encoder expects 2-D input, hence the reshape to a single-column matrix.
    column_2d = df[name].values.reshape(-1, 1)
    df[name] = ordinal_encoder.fit_transform(column_2d)

# Store the font codes as integers first, then mark the column as categorical.
font_codes = df["font"].astype("int64")
df["font"] = font_codes.astype("category")

df.head()


Unnamed: 0,font,fontVariant,m_label,strength,italic,orientation,m_top,m_left,originalH,originalW,...,r19c10,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19
0,0,0.0,64258,0.4,0,0.0,35,21,51,22,...,1,1,1,1,1,1,163,255,255,255
1,0,0.0,64257,0.4,0,0.0,35,21,51,22,...,1,1,1,1,1,1,163,255,255,255
2,0,0.0,61442,0.4,0,0.0,35,21,51,22,...,1,1,1,1,1,1,163,255,255,255
3,0,0.0,61441,0.4,0,0.0,35,21,51,22,...,1,1,1,1,1,1,163,255,255,255
4,0,0.0,9674,0.4,0,0.0,51,21,33,25,...,255,132,1,1,1,1,1,1,1,1


In [4]:
# Sampling configuration. A fixed seed makes the drawn rows/columns (and therefore
# every downstream result) identical on each Restart & Run All.
SAMPLE_SEED = 42
ROW_SAMPLE_FRACTION = 0.0001    # share of rows used to fit the selector
COLUMN_SAMPLE_FRACTION = 0.05   # share of columns offered as candidate features

# Dedicated generator: re-running this cell reproduces the same draw and does not
# depend on (or disturb) the global `random` state.
sampler = random.Random(SAMPLE_SEED)

n_rows, n_columns = df.shape
row_sample_indices = sampler.sample(range(n_rows), int(n_rows * ROW_SAMPLE_FRACTION))
# Position 0 is the target column (used for `y` below), so candidates start at 1.
column_sample_indices = sampler.sample(
    range(1, n_columns), int(n_columns * COLUMN_SAMPLE_FRACTION)
)

X = df.iloc[row_sample_indices, column_sample_indices]
# Kept as a single-column DataFrame (slice `:1`), matching what later cells receive.
y = df.iloc[row_sample_indices, :1]


In [5]:
import warnings

# NOTE(review): this silences every warning for the remainder of the kernel session,
# not just the ones raised while fitting below — confirm that is intended.
warnings.filterwarnings("ignore")

N_FEATURES_TO_SELECT = 17

# random_state pins the estimator's internal RNG so that, for a given X and y,
# the selector picks the same features on every run.
linear_svc = LinearSVC(random_state=42)
backward_selector = SequentialFeatureSelector(
    linear_svc,
    scoring="accuracy",
    n_features_to_select=N_FEATURES_TO_SELECT,
    direction="backward",
)
backward_selector.fit(X, y)

# The support mask is positional within X; translate it back to column positions
# in `df`, and put position 0 (the target column) first.
selected_features_mask = backward_selector.get_support()
selected_feature_indices = [0] + [
    column_index
    for column_index, is_selected in zip(column_sample_indices, selected_features_mask)
    if is_selected
]

# Slice once and reuse for both the shape report and the preview.
df_selected = df.iloc[:, selected_feature_indices]

print("Before Backward feature elimination:", df.shape)
print("After Backward feature elimination:", df_selected.shape)

df_selected.head()


Before Backward feature elimination: (738982, 412)
After Backward feature elimination: (738982, 18)


Unnamed: 0,font,r12c5,r15c17,orientation,r14c16,r3c12,strength,r14c10,r4c6,r3c19,r5c5,r3c10,r13c13,r1c16,r8c14,r5c11,r8c17,r4c7
0,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,1,1,255,70
1,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,255,1,255,70
2,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,1,1,255,70
3,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,255,1,255,70
4,0,181,1,0.0,1,255,0.4,1,255,1,255,98,205,1,75,1,255,255


In [6]:
# Target column plus the features kept by the backward selector.
df_new = df.iloc[:, selected_feature_indices]

# Create the output directory first: `to_csv` raises OSError when the parent
# directory is missing, which would fail this cell on a fresh checkout.
output_path = Path("../../DatasetsFeatureSelection/CharacterFontImages/character_font_images.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_new.to_csv(output_path, index=False)

df_new.head()


Unnamed: 0,font,r12c5,r15c17,orientation,r14c16,r3c12,strength,r14c10,r4c6,r3c19,r5c5,r3c10,r13c13,r1c16,r8c14,r5c11,r8c17,r4c7
0,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,1,1,255,70
1,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,255,1,255,70
2,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,1,1,255,70
3,0,255,255,0.0,163,1,0.4,1,255,255,255,1,1,216,255,1,255,70
4,0,181,1,0.0,1,255,0.4,1,255,1,255,98,205,1,75,1,255,255
