<div style="text-align: center; 
                font-size: 2em; 
                font-weight: bold; 
                background-color: #ffcc00; 
                color: #000000;
                margin: 20px;
                line-height: 1.5;
                padding: 20px;">
    <h1 style="margin: 20px; font-size: 1.5em; font-weight: bold;">
        Preprocessing the CSV files
    </h1>
    <p style="margin: 20px; font-size: 0.5em; color: #444444;">
        This notebook is used to preprocess the CSV files of plate numbers.
    </p>
</div>

### **Importing the libraries and the dataset**

In [61]:
import pandas as pd

In [62]:
train = pd.read_csv("../data/train_data/DataTrain.csv")
test = pd.read_csv("../data/test_data/DataTest.csv")

num_train = train.shape[0]
num_test = test.shape[0]

train.head()

Unnamed: 0,;Vehicleregistrationplate;NameofFile
0,0;A7814;DataTrain1.png
1,1;B1074QO;DataTrain2.png
2,2;B1031QO;DataTrain3.png
3,3;B187EDA;DataTrain4.png
4,4;B1089VD;DataTrain5.png


### **Preprocessing the labels**

In [63]:
train["img"] = train[";Vehicleregistrationplate;NameofFile"].apply(
    lambda x: x.split(";")[2]
)
train["label"] = train[";Vehicleregistrationplate;NameofFile"].apply(
    lambda x: x.split(";")[1]
)
train.drop([";Vehicleregistrationplate;NameofFile"], axis=1, inplace=True)

train.head()

Unnamed: 0,img,label
0,DataTrain1.png,A7814
1,DataTrain2.png,B1074QO
2,DataTrain3.png,B1031QO
3,DataTrain4.png,B187EDA
4,DataTrain5.png,B1089VD


In [64]:
test.head()

Unnamed: 0,;Name of File
0,0;DataTest1.png
1,1;DataTest2.png
2,2;DataTest3.png
3,3;DataTest4.png
4,4;DataTest5.png


In [65]:
test["img"] = test[";Name of File"].apply(lambda x: x.split(";")[1])
test.drop([";Name of File"], axis=1, inplace=True)

test.head()

Unnamed: 0,img
0,DataTest1.png
1,DataTest2.png
2,DataTest3.png
3,DataTest4.png
4,DataTest5.png


In [66]:
# show duplicated label in train data
print(train[train["label"].duplicated()])

                  img     label
27    DataTrain28.png  B1254TFX
44    DataTrain45.png   A8014VA
58    DataTrain59.png   A8014VA
65    DataTrain66.png   B1074QO
92    DataTrain93.png  B1946TKN
..                ...       ...
784  DataTrain785.png   A1398BC
785  DataTrain786.png   B1559UN
786  DataTrain787.png  B1913TIS
790  DataTrain791.png  B1802EOZ
799  DataTrain800.png  AB6315SE

[187 rows x 2 columns]


In [67]:
train.drop_duplicates(subset="label", keep="first", inplace=True)

In [68]:
# show duplicated label in train data
print(train[train["label"].duplicated()])

Empty DataFrame
Columns: [img, label]
Index: []


In [69]:
# label
# 0	DataTrain1.png	A7814

# split label menjadi 3 kolom sesuai dengan aturan plat nomor
# A7814 -> A 7814
# B3210DA -> B 3210 DA

def splitting_label(label):
    # pertama temukan index angka pertama dan terakhir yang muncul
    # misal A7814 -> 7, 4
    
    start = 0
    end = 0
    
    for i in range(len(label)):
        if label[i].isdigit():
            start = i
            break
    for i in range(len(label)-1, -1, -1):
        if label[i].isdigit():
            end = i
            break

    # kemudian split label menjadi 3 kolom
    # ketika index angka terakhir == index string terakhir, maka untuk kolom 3 diisi dengan np.nan

    first = label[:start]
    if end == len(label)-1:
        second = label[start:end+1]
        third = ''
    else:
        second = label[start:end+1]
        third = label[end+1:]

    return first, second, third

# split label menjadi 3 kolom
train['wilayah'], train['nopol'], train['kotanjenis'] = zip(*train['label'].map(splitting_label))

train.head()

Unnamed: 0,img,label,wilayah,nopol,kotanjenis
0,DataTrain1.png,A7814,A,7814,
1,DataTrain2.png,B1074QO,B,1074,QO
2,DataTrain3.png,B1031QO,B,1031,QO
3,DataTrain4.png,B187EDA,B,187,EDA
4,DataTrain5.png,B1089VD,B,1089,VD


In [70]:
print(train[train['wilayah'] == ''])
print(train[train['nopol'] == ''])
print(train[train['kotanjenis'] == ''])

                  img label wilayah nopol kotanjenis
126  DataTrain127.png  AIIS             A        IIS
Empty DataFrame
Columns: [img, label, wilayah, nopol, kotanjenis]
Index: []
                  img     label wilayah    nopol kotanjenis
0      DataTrain1.png     A7814       A     7814           
33    DataTrain34.png     B1157       B     1157           
117  DataTrain118.png     B1747       B     1747           
177  DataTrain178.png    AB2070      AB     2070           
257  DataTrain258.png      A317       A      317           
366  DataTrain367.png     B1713       B     1713           
448  DataTrain449.png  B1855SL0       B  1855SL0           
457  DataTrain458.png   B128000       B   128000           
464  DataTrain465.png     B1487       B     1487           
600  DataTrain601.png   B130900       B   130900           


In [71]:
# remove the row when wilayah or nopol or kotanjenis is empty
train = train[train['wilayah'] != '']
train = train[train['nopol'] != '']
train = train[train['kotanjenis'] != '']

print(train[train['wilayah'] == ''])
print(train[train['nopol'] == ''])
print(train[train['kotanjenis'] == ''])

train.head()

Empty DataFrame
Columns: [img, label, wilayah, nopol, kotanjenis]
Index: []
Empty DataFrame
Columns: [img, label, wilayah, nopol, kotanjenis]
Index: []
Empty DataFrame
Columns: [img, label, wilayah, nopol, kotanjenis]
Index: []


Unnamed: 0,img,label,wilayah,nopol,kotanjenis
1,DataTrain2.png,B1074QO,B,1074,QO
2,DataTrain3.png,B1031QO,B,1031,QO
3,DataTrain4.png,B187EDA,B,187,EDA
4,DataTrain5.png,B1089VD,B,1089,VD
5,DataTrain6.png,B1972RBP,B,1972,RBP


In [72]:
# drop kolom wilayah, nopol, kotanjenis.
train.drop(['wilayah', 'nopol', 'kotanjenis'], axis=1, inplace=True)
train.head()

Unnamed: 0,img,label
1,DataTrain2.png,B1074QO
2,DataTrain3.png,B1031QO
3,DataTrain4.png,B187EDA
4,DataTrain5.png,B1089VD
5,DataTrain6.png,B1972RBP


### **Saving the preprocessed dataset**

In [73]:
print("-"*50)
print("\033[91m" + "Before CSV Preprocessing" + "\033[0m")
print("-"*50)

print("Number of train data: {}".format(num_train))
print("Number of test data: {}".format(num_test))


print("-"*50)
print("\033[91m" + "After CSV Preprocessing" + "\033[0m")
print("-"*50)

print("Number of train data: {}".format(train.shape[0]))
print("Number of test data: {}".format(test.shape[0]))
print("-"*50)

--------------------------------------------------
[91mBefore CSV Preprocessing[0m
--------------------------------------------------
Number of train data: 800
Number of test data: 100
--------------------------------------------------
[91mAfter CSV Preprocessing[0m
--------------------------------------------------
Number of train data: 602
Number of test data: 100
--------------------------------------------------


In [74]:
train.to_csv("../data/out/DataTrain.csv", index=False)
test.to_csv("../data/out/DataTest.csv", index=False)