In [1]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from PIL import Image
import csv
from tqdm import tqdm

# 경로 설정

In [2]:
train = './archive/fashion-mnist_train.csv'
test = './archive/fashion-mnist_private_test2.csv'

# Data augmentation

In [8]:
original_train = pd.read_csv(train)
original_train_data = original_train.drop('label', axis=1)
original_train_labels = original_train['label']

In [9]:
image_list = [[] for i in range(10)]

for i in tqdm(original_train.iloc):
    image = np.array(i.drop('label')).reshape(28,28)
    image_list[i['label']].append(image)
    
# image_list.shape = (10, 6000, 28, 28)

60000it [00:17, 3452.27it/s]


In [5]:
random.seed(91)

aug_image_list = [[] for i in range(10)]

for i in range(len(image_list)):
    c = 0
    if i == 0 or i == 5 or i == 8:
        max_num = 4000
        rand_num = 7
    else :
        max_num = 3000
        rand_num = 6
    for j in range(len(image_list[0])) :
        num = random.randrange(1, 11)
        if c == max_num :
            break
        if num <= rand_num :
            image = image_list[i][j]
            temp_image = Image.fromarray(image.astype(np.uint8))
            # # 랜덤 증강
            new_image = temp_image.transform(temp_image.size, Image.AFFINE, (1, 0, -1, 0, 1, -1))
            new_image = np.array(new_image)
            aug_image_list[i].append(list(new_image))
            c += 1
        else :
            pass

In [6]:
aug_3000_list = []
aug_4000_list = []
for i in aug_image_list :
    if len(i) == 3000 :
        aug_3000_list.append(i)
    elif len(i) == 4000 :
        aug_4000_list.append(i)

aug_3000_list = np.array(aug_3000_list)
aug_4000_list = np.array(aug_4000_list)

In [7]:
labels4000 = [0, 5, 8]
labels3000 = [1, 2, 3, 4, 6, 7, 9]

In [8]:
new_data_set_3000 = []
new_data_set_4000 = []

for class_index, image_list in zip(labels3000, aug_3000_list) :
    for image in image_list :
        label_pixel = [class_index] + list(image.flatten())
        new_data_set_3000.append(label_pixel)
        

for class_index, image_list in zip(labels4000, aug_4000_list) :
    for image in image_list :
        label_pixel = [class_index] + list(image.flatten())
        new_data_set_4000.append(label_pixel)

In [9]:
new_data_set_3000 = np.array(new_data_set_3000)
new_data_set_4000 = np.array(new_data_set_4000)

np.array(new_data_set_3000).shape, np.array(new_data_set_4000).shape

((21000, 785), (12000, 785))

In [10]:
merged_aug_list = np.concatenate((new_data_set_3000, new_data_set_4000), axis=0)
print(merged_aug_list.shape)
merged_image_list = np.concatenate((np.array(original_train), merged_aug_list), axis=0)
print(merged_image_list.shape)

(33000, 785)
(93000, 785)


In [11]:
df = pd.DataFrame(columns=list(original_train.columns))

csv_file_path = './aug_train.csv'

df.to_csv(csv_file_path, index=False)
with open(csv_file_path, 'a', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # 각 행을 CSV 파일에 쓰기
    for row in merged_image_list:
        csv_writer.writerow(row)

In [12]:

with open('aug_train.csv', 'r') as file:
    reader = csv.reader(file)

    header = next(reader, None)

    rows = list(reader)

random.shuffle(rows)

with open('./aug_fashion-mnist_train.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    if header:
        writer.writerow(header)

    writer.writerows(rows)


# 0, 2, 4, 6 라벨 제외 1로 만들기

In [13]:
df = pd.read_csv('./aug_fashion-mnist_train.csv')  

labels_to_change = [0, 2, 4, 6]
df['label'] = df['label'].apply(lambda x: 1 if x not in labels_to_change else x)

df.to_csv('0246_fashion-mnist_train.csv', index=False)


# csv 불러오기

In [3]:
train0 = pd.read_csv('./90000_train.csv')
train1 = pd.read_csv('./aug_fashion-mnist_train.csv')
train2 = pd.read_csv('./046_fashion-mnist_train.csv')
public = pd.read_csv('/home/vision/gyuil/3-2/머신러닝/svm/archive/fashion-mnist_public_test.csv')
test = pd.read_csv(test)

#train
train0_data = train0.drop('label', axis=1)
train0_labels = train0['label']
train1_data = train1.drop('label', axis=1)
train1_labels = train1['label']
train2_data = train2.drop('label', axis=1)
train2_labels = train2['label']



#test
test_data = test.drop('label', axis=1)

#int -> float
train0_data = train0_data.astype('float32')
train1_data = train1_data.astype('float32')
train2_data = train2_data.astype('float32')
test_data = test_data.astype('float32')


public = pd.read_csv('/home/vision/gyuil/3-2/머신러닝/svm/archive/fashion-mnist_public_test.csv')
public_data = public.drop('label', axis=1)
public_labels = public['label']
public_data = public_data.astype('float32')

# Preprocessing, PCA

In [5]:
#normalization
from sklearn.preprocessing import Normalizer

norm = Normalizer()
normalization_train1_data = norm.fit_transform(train1_data)
normalization_train0_data = norm.transform(train0_data)
normalization_train2_data = norm.transform(train2_data)
normalization_test_data = norm.transform(test_data)

pca = PCA(n_components=400)
pca.fit(normalization_train1_data)

pca_norm_train1_data = pca.transform(normalization_train1_data)
pca_norm_train2_data = pca.transform(normalization_train2_data)
pca_norm_test_data = pca.transform(normalization_test_data)

# Train

In [None]:
# main model train
svc = SVC(gamma='scale',kernel='rbf',C=10)
svc.fit(pca_norm_train1_data,train1_labels)

# inference    
svc_preds1 = svc.predict(pca_norm_test_data)

In [7]:
# sub model train
svc = SVC(gamma='scale',kernel='rbf',C=10)
svc.fit(pca_norm_train2_data,train2_labels)

# inference    
svc_preds2 = svc.predict(pca_norm_test_data)

# Overwrite Ensemble

In [8]:
for i in range(len(svc_preds1)) :
    if svc_preds2[i] == 0 or svc_preds2[i] == 4 or svc_preds2[i] == 6 :
        svc_preds1[i] = svc_preds2[i]

# Inference

In [10]:
private_preds_dict = {image : label for image, label in enumerate(svc_preds1)}

with open('./final_private.txt', 'w') as file :
    for image, label in private_preds_dict.items():
        file.write(f'{image:05d} {label}\n')