In [1]:
# !pip install ftfy regex
# !pip install git+https://github.com/openai/CLIP.git

In [2]:
# Fix every random seed so that repeated runs are reproducible.
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn

# Python and NumPy generators
random.seed(0)
np.random.seed(0)

# PyTorch generators (CPU and CUDA)
torch.manual_seed(256)
torch.cuda.manual_seed(256)
torch.cuda.manual_seed_all(256)

# cuDNN: request deterministic algorithms and switch off the autotuner
cudnn.deterministic = True
cudnn.benchmark = False

In [3]:
import os
import clip
import torch
from torchvision import datasets
from sklearn.metrics import accuracy_score

In [4]:
# torchvision dataset classes that the evaluation loop further down iterates over.
Dataset = [datasets.CIFAR10]

In [5]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from tqdm import tqdm

# Load the model: run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# One "<dataset class name> : <accuracy>" string is appended per evaluated dataset.
ACC_2 = []

def get_features(dataset):
    """Encode every image of ``dataset`` with the module-level ``model``.

    The dataset is read in batches of 8 under ``torch.no_grad()``; each batch
    is moved to ``device`` and passed through ``model.encode_image``.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays, each concatenated
        across all batches in iteration order.
    """
    feature_batches = []
    label_batches = []
    loader = DataLoader(dataset, batch_size=8)

    with torch.no_grad():
        for batch_images, batch_labels in tqdm(loader):
            feature_batches.append(model.encode_image(batch_images.to(device)))
            label_batches.append(batch_labels)

    features = torch.cat(feature_batches).cpu().numpy()
    labels = torch.cat(label_batches).cpu().numpy()
    return features, labels

# Load the dataset
root = os.path.expanduser("~/.data")
for dataset_cls in Dataset:
    # The fallback covers dataset classes whose constructor takes `split=` instead
    # of `train=`. Only the TypeError raised for the rejected keyword triggers it;
    # any other failure (download error, interrupt, ...) surfaces unchanged instead
    # of being masked by a second, unrelated error.
    try:
        train = dataset_cls(root, download=True, train=True, transform=preprocess)
        test = dataset_cls(root, download=True, train=False, transform=preprocess)
    except TypeError:
        train = dataset_cls(root, download=True, split='train', transform=preprocess)
        test = dataset_cls(root, download=True, split='test', transform=preprocess)

    # Calculate the image features
    train_features, train_labels = get_features(train)
    print(train_features)
    print(train_labels)
    test_features, test_labels = get_features(test)

    # Perform logistic regression
    # NOTE(review): the recorded run hit max_iter=1000 before the solver converged
    # (see the warning in the cell output) -- consider raising max_iter.
    classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
    classifier.fit(train_features, train_labels)

    # Evaluate using the logistic regression classifier
    predictions = classifier.predict(test_features)
    # Builtin `float` replaces the `np.float` alias, which is deprecated since
    # NumPy 1.20 and absent from newer releases.
    accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
    ACC_2.append(f'{train.__class__.__name__} : {accuracy}')
    print(f"Accuracy = {accuracy:.3f}")

Files already downloaded and verified
Files already downloaded and verified


100%|███████████████████████████████████████| 6250/6250 [01:50<00:00, 56.38it/s]


[[-0.1821   0.0773  -0.269   ...  1.104   -0.1808  -0.274  ]
 [ 0.04443  0.0546  -0.12366 ...  0.951    0.10187  0.1378 ]
 [ 0.3115  -0.06003 -0.3264  ...  0.5947  -0.1338   0.0629 ]
 ...
 [ 0.2439   0.03134 -0.2125  ...  0.5327  -0.01223 -0.01997]
 [ 0.3125   0.2238  -0.1232  ...  1.244    0.2986   0.02827]
 [ 0.04782  0.203   -0.2725  ...  0.3862   0.00713  0.1323 ]]
[6 9 9 ... 9 1 1]


100%|███████████████████████████████████████| 1250/1250 [00:21<00:00, 57.51it/s]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy = 94.990


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   43.7s finished
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100.


In [51]:
import pandas as pd

# Wrap the training feature matrix in a DataFrame.
df = pd.DataFrame(data=train_features)

In [52]:
# Display the training-feature frame.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.182129,0.077271,-0.269043,-0.252197,0.340820,-0.206543,-0.022476,0.421875,-0.105896,0.204956,...,-0.348633,0.313965,0.542969,-0.122986,0.279785,0.118042,0.339600,1.103516,-0.180786,-0.273926
1,0.044434,0.054596,-0.123657,-0.042114,0.639648,0.252686,0.126221,0.460693,-0.091492,0.153687,...,0.209473,0.604492,-0.056641,-0.043518,0.264893,0.195557,0.008942,0.951172,0.101868,0.137817
2,0.311523,-0.060028,-0.326416,0.091553,0.375732,-0.092041,0.256348,0.528809,-0.168701,0.213379,...,0.182617,0.120544,0.006985,-0.364014,0.167480,0.406738,0.063354,0.594727,-0.133789,0.062927
3,0.075439,-0.199829,-0.111267,0.022995,0.520508,0.013527,0.333008,0.644531,-0.185059,0.332031,...,0.040955,-0.085754,0.267578,-0.140869,0.055328,0.227295,0.216309,0.521973,0.082581,0.078857
4,0.144287,0.273682,-0.015129,0.013245,-0.019623,0.052643,0.367676,0.298828,0.063599,-0.234131,...,-0.232788,0.072144,0.181152,-0.333008,0.103577,0.120300,0.172729,0.808594,-0.005161,0.482422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.384521,0.001780,0.010330,-0.373535,0.003527,-0.173828,0.265137,0.809570,-0.310791,0.321533,...,0.010033,0.174927,0.914551,-0.247437,0.102600,0.454102,0.049683,0.587891,0.343506,-0.193848
49996,0.326660,-0.057800,-0.433838,0.026611,0.405273,-0.307129,0.231445,0.496826,-0.186401,0.021179,...,0.096069,0.017242,0.181396,-0.292480,0.116516,0.266357,-0.109070,1.097656,0.289062,0.182129
49997,0.243896,0.031342,-0.212524,-0.059174,0.381104,0.453125,0.387207,0.718262,-0.110718,0.239990,...,0.255127,0.346436,0.281494,0.340820,0.271729,0.133545,-0.217529,0.532715,-0.012230,-0.019974
49998,0.312500,0.223755,-0.123230,-0.211792,-0.057159,0.066162,0.348145,0.208618,-0.187622,-0.088989,...,-0.075317,0.137817,0.528809,0.020004,0.259521,0.089355,-0.134033,1.244141,0.298584,0.028275


In [59]:
# Sum of all feature components per training sample; int() truncates the sum toward zero.
val = [int(np.sum(feature_row)) for feature_row in tqdm(train_features)]

100%|█████████████████████████████████| 50000/50000 [00:00<00:00, 168857.03it/s]


In [60]:
# Number of per-sample sums that were computed.
len(val)

50000

In [61]:
# Attach the class label and the truncated per-row feature sum as extra columns.
df = df.assign(label=train_labels, sum=val)

In [62]:
# Display the training frame, now including the `label` and `sum` columns.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,label,sum
0,-0.182129,0.077271,-0.269043,-0.252197,0.340820,-0.206543,-0.022476,0.421875,-0.105896,0.204956,...,0.542969,-0.122986,0.279785,0.118042,0.339600,1.103516,-0.180786,-0.273926,6,0
1,0.044434,0.054596,-0.123657,-0.042114,0.639648,0.252686,0.126221,0.460693,-0.091492,0.153687,...,-0.056641,-0.043518,0.264893,0.195557,0.008942,0.951172,0.101868,0.137817,9,15
2,0.311523,-0.060028,-0.326416,0.091553,0.375732,-0.092041,0.256348,0.528809,-0.168701,0.213379,...,0.006985,-0.364014,0.167480,0.406738,0.063354,0.594727,-0.133789,0.062927,9,-6
3,0.075439,-0.199829,-0.111267,0.022995,0.520508,0.013527,0.333008,0.644531,-0.185059,0.332031,...,0.267578,-0.140869,0.055328,0.227295,0.216309,0.521973,0.082581,0.078857,4,2
4,0.144287,0.273682,-0.015129,0.013245,-0.019623,0.052643,0.367676,0.298828,0.063599,-0.234131,...,0.181152,-0.333008,0.103577,0.120300,0.172729,0.808594,-0.005161,0.482422,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.384521,0.001780,0.010330,-0.373535,0.003527,-0.173828,0.265137,0.809570,-0.310791,0.321533,...,0.914551,-0.247437,0.102600,0.454102,0.049683,0.587891,0.343506,-0.193848,2,-15
49996,0.326660,-0.057800,-0.433838,0.026611,0.405273,-0.307129,0.231445,0.496826,-0.186401,0.021179,...,0.181396,-0.292480,0.116516,0.266357,-0.109070,1.097656,0.289062,0.182129,6,-10
49997,0.243896,0.031342,-0.212524,-0.059174,0.381104,0.453125,0.387207,0.718262,-0.110718,0.239990,...,0.281494,0.340820,0.271729,0.133545,-0.217529,0.532715,-0.012230,-0.019974,9,-5
49998,0.312500,0.223755,-0.123230,-0.211792,-0.057159,0.066162,0.348145,0.208618,-0.187622,-0.088989,...,0.528809,0.020004,0.259521,0.089355,-0.134033,1.244141,0.298584,0.028275,1,0


In [63]:
# Write the training frame (features, label, sum) to a CSV file in the working directory.
df.to_csv(path_or_buf='20221130-train.csv')

In [64]:
import pandas as pd

# Wrap the test feature matrix in a DataFrame.
df1 = pd.DataFrame(data=test_features)

In [65]:
# Sum of all feature components per test sample; int() truncates the sum toward zero.
val1 = [int(np.sum(feature_row)) for feature_row in tqdm(test_features)]

100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 130073.68it/s]


In [66]:
# Attach the class label and the truncated per-row feature sum as extra columns.
df1 = df1.assign(label=test_labels, sum=val1)

In [70]:
# Number of test labels.
len(test_labels)

10000

In [67]:
# Display the test frame, including the `label` and `sum` columns.
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,label,sum
0,0.234375,0.049835,-0.447998,-0.366211,0.076965,-0.219971,0.087219,0.945801,-0.408447,0.227539,...,0.521484,0.003767,0.355469,0.115234,-0.122864,0.534668,0.031082,0.232300,3,-3
1,0.481201,-0.031555,-0.068237,0.057007,0.151367,0.004078,0.489746,0.491943,0.161865,-0.114319,...,0.099182,-0.289062,0.007778,0.180420,0.270264,0.548828,-0.032654,-0.254150,8,3
2,0.143921,-0.001245,-0.244873,-0.132568,0.137695,0.089905,0.239380,0.753906,0.271240,0.041779,...,0.676758,0.124756,-0.083435,0.391113,0.112610,0.613770,0.012512,0.050690,8,0
3,0.363281,-0.215210,-0.086853,-0.565918,0.644043,-0.493164,0.090454,0.178345,-0.345947,0.257812,...,0.751465,-0.644043,0.096802,0.248657,0.049011,0.731934,-0.446045,-0.159790,0,-9
4,-0.024628,-0.201416,0.009354,-0.221313,-0.304443,0.156494,0.045044,0.273193,0.558594,0.360107,...,0.398438,-0.527344,0.247437,-0.036865,0.348633,0.628418,0.016861,-0.250244,6,-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.260986,0.012497,-0.256592,-0.437500,0.389160,-0.120605,0.325195,1.038086,-0.139893,0.066467,...,0.389893,-0.068420,0.320068,0.111633,0.011375,0.780762,-0.031891,0.270264,8,1
9996,0.180542,0.098511,-0.240723,0.151611,0.093933,-0.527832,0.167969,0.541016,0.087219,0.015373,...,0.459961,0.112549,0.020508,0.362305,-0.195801,0.966797,-0.016418,0.047699,3,-1
9997,0.005203,-0.169312,-0.412354,0.220215,0.468018,-0.075745,0.370117,0.578125,0.444336,0.109009,...,0.134277,0.187622,-0.135010,0.439453,-0.377197,0.774414,0.236328,0.064209,5,0
9998,0.500977,0.154541,-0.029984,-0.383789,0.081177,0.244507,0.395020,0.761230,-0.185425,0.252930,...,0.337158,-0.393799,0.098633,-0.054413,0.252930,0.390869,-0.014481,-0.016296,1,-5


In [71]:
# Write the test frame (features, label, sum) to a CSV file in the working directory.
df1.to_csv(path_or_buf='20221130-test.csv')

In [7]:
# Feature vectors of every training sample whose label is 0, in dataset order.
train_0 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 0
]

In [8]:
# Feature vectors of every training sample whose label is 1, in dataset order.
train_1 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 1
]

In [9]:
# Feature vectors of every training sample whose label is 2, in dataset order.
train_2 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 2
]

In [10]:
# Feature vectors of every training sample whose label is 3, in dataset order.
train_3 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 3
]

In [11]:
# Feature vectors of every training sample whose label is 4, in dataset order.
train_4 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 4
]

In [12]:
# Feature vectors of every training sample whose label is 5, in dataset order.
train_5 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 5
]

In [13]:
# Feature vectors of every training sample whose label is 6, in dataset order.
train_6 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 6
]

In [14]:
# Feature vectors of every training sample whose label is 7, in dataset order.
train_7 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 7
]

In [15]:
# Feature vectors of every training sample whose label is 8, in dataset order.
train_8 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 8
]

In [16]:
# Feature vectors of every training sample whose label is 9, in dataset order.
train_9 = [
    feature_row
    for feature_row, label in zip(train_features, train_labels)
    if label == 9
]

In [17]:
# Number of training samples collected for label 9.
len(train_9)

5000

In [18]:
def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The inner product of the inputs is divided by the product of their
    norms (``np.linalg.norm``) and the quotient is reduced with ``np.mean``;
    for two 1-D vectors this is the cosine of the angle between them.

    Returns:
        The similarity, or ``0.0`` when either input has zero norm. Without
        that guard the division is 0/0 and yields NaN (plus a RuntimeWarning),
        which would then poison any average taken over many pairs.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # The cosine is undefined for a zero vector; report "no similarity".
        return 0.0
    return np.mean(np.inner(a, b) / norm_product)

In [19]:
def f_norm(a, b):
    """Return the mean of ``sqrt((a - b) ** 2)`` taken over all elements.

    NOTE(review): despite the name, this is the mean absolute element-wise
    difference rather than a Frobenius norm (``sqrt(sum((a - b) ** 2))``);
    confirm which of the two is intended.
    """
    squared_diff = np.square(a - b)
    magnitudes = np.sqrt(squared_diff)
    return np.mean(magnitudes)

In [20]:
# Score of each label-0 vector: mean of cosine_similarity + f_norm against every
# label-0 vector (itself included).
# NOTE: evaluates len(train_0) ** 2 pairs with Python-level calls, so it is slow.
train_0_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_0])
    for anchor in tqdm(train_0)
]

100%|███████████████████████████████████████| 5000/5000 [25:27<00:00,  3.27it/s]


In [21]:
# Score of each label-1 vector: mean of cosine_similarity + f_norm against every
# label-1 vector (itself included).
# NOTE: evaluates len(train_1) ** 2 pairs with Python-level calls, so it is slow.
train_1_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_1])
    for anchor in tqdm(train_1)
]

100%|███████████████████████████████████████| 5000/5000 [25:26<00:00,  3.28it/s]


In [22]:
# Score of each label-2 vector: mean of cosine_similarity + f_norm against every
# label-2 vector (itself included).
# NOTE: evaluates len(train_2) ** 2 pairs with Python-level calls, so it is slow.
train_2_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_2])
    for anchor in tqdm(train_2)
]

100%|███████████████████████████████████████| 5000/5000 [25:24<00:00,  3.28it/s]


In [23]:
# Score of each label-3 vector: mean of cosine_similarity + f_norm against every
# label-3 vector (itself included).
# NOTE: evaluates len(train_3) ** 2 pairs with Python-level calls, so it is slow.
train_3_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_3])
    for anchor in tqdm(train_3)
]

100%|███████████████████████████████████████| 5000/5000 [25:25<00:00,  3.28it/s]


In [24]:
# Score of each label-4 vector: mean of cosine_similarity + f_norm against every
# label-4 vector (itself included).
# NOTE: evaluates len(train_4) ** 2 pairs with Python-level calls, so it is slow.
train_4_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_4])
    for anchor in tqdm(train_4)
]

100%|███████████████████████████████████████| 5000/5000 [25:10<00:00,  3.31it/s]


In [25]:
# Score of each label-5 vector: mean of cosine_similarity + f_norm against every
# label-5 vector (itself included).
# NOTE: evaluates len(train_5) ** 2 pairs with Python-level calls, so it is slow.
train_5_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_5])
    for anchor in tqdm(train_5)
]

100%|███████████████████████████████████████| 5000/5000 [25:03<00:00,  3.32it/s]


In [26]:
# Score of each label-6 vector: mean of cosine_similarity + f_norm against every
# label-6 vector (itself included).
# NOTE: evaluates len(train_6) ** 2 pairs with Python-level calls, so it is slow.
train_6_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_6])
    for anchor in tqdm(train_6)
]

100%|███████████████████████████████████████| 5000/5000 [25:19<00:00,  3.29it/s]


In [27]:
# Score of each label-7 vector: mean of cosine_similarity + f_norm against every
# label-7 vector (itself included).
# NOTE: evaluates len(train_7) ** 2 pairs with Python-level calls, so it is slow.
train_7_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_7])
    for anchor in tqdm(train_7)
]

100%|███████████████████████████████████████| 5000/5000 [25:29<00:00,  3.27it/s]


In [28]:
# Score of each label-8 vector: mean of cosine_similarity + f_norm against every
# label-8 vector (itself included).
# NOTE: evaluates len(train_8) ** 2 pairs with Python-level calls, so it is slow.
train_8_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_8])
    for anchor in tqdm(train_8)
]

100%|███████████████████████████████████████| 5000/5000 [25:25<00:00,  3.28it/s]


In [29]:
# Score of each label-9 vector: mean of cosine_similarity + f_norm against every
# label-9 vector (itself included).
# NOTE: evaluates len(train_9) ** 2 pairs with Python-level calls, so it is slow.
train_9_score = [
    np.mean([cosine_similarity(anchor, other) + f_norm(anchor, other) for other in train_9])
    for anchor in tqdm(train_9)
]

100%|███████████████████████████████████████| 5000/5000 [25:37<00:00,  3.25it/s]


In [30]:
# Sum of all feature components for each label-0 vector (not truncated).
train_0_val = [np.sum(feature_row) for feature_row in tqdm(train_0)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 158943.79it/s]


In [31]:
# Sum of all feature components for each label-1 vector (not truncated).
train_1_val = [np.sum(feature_row) for feature_row in tqdm(train_1)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 155793.51it/s]


In [32]:
# Sum of all feature components for each label-2 vector (not truncated).
train_2_val = [np.sum(feature_row) for feature_row in tqdm(train_2)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 172882.57it/s]


In [33]:
# Sum of all feature components for each label-3 vector (not truncated).
train_3_val = [np.sum(feature_row) for feature_row in tqdm(train_3)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 178067.30it/s]


In [34]:
# Sum of all feature components for each label-4 vector (not truncated).
train_4_val = [np.sum(feature_row) for feature_row in tqdm(train_4)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 183707.70it/s]


In [35]:
# Sum of all feature components for each label-5 vector (not truncated).
train_5_val = [np.sum(feature_row) for feature_row in tqdm(train_5)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 195327.38it/s]


In [36]:
# Sum of all feature components for each label-6 vector (not truncated).
train_6_val = [np.sum(feature_row) for feature_row in tqdm(train_6)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 177145.27it/s]


In [37]:
# Sum of all feature components for each label-7 vector (not truncated).
train_7_val = [np.sum(feature_row) for feature_row in tqdm(train_7)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 175654.11it/s]


In [38]:
# Sum of all feature components for each label-8 vector (not truncated).
train_8_val = [np.sum(feature_row) for feature_row in tqdm(train_8)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 181518.17it/s]


In [39]:
# Sum of all feature components for each label-9 vector (not truncated).
train_9_val = [np.sum(feature_row) for feature_row in tqdm(train_9)]

100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 185357.39it/s]
