In [1]:
# 구글드라이브를 /content/gdrive/'My Drive'에 마운트

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# 마운트된 구글 드라이브로 이동

%cd /content/gdrive/MyDrive/'ml-term-prj'

/content/gdrive/MyDrive/ml-term-prj


In [3]:
# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# sklearn
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits # 8x8 image
# numpy
import numpy as np
# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [4]:
import pandas as pd
import numpy as np

In [5]:
# train data
train = pd.read_csv('data/train_data/train_task_3_4.csv')
print(train.head(1))
print('-' * 50 + '\n')
print(train.describe())
print('-' * 50+ '\n')
print(train.info())

   QuestionId  UserId  AnswerId  IsCorrect  CorrectAnswer  AnswerValue
0         898    2111    280203          1              2            2
--------------------------------------------------

         QuestionId        UserId      AnswerId     IsCorrect  CorrectAnswer  \
count  1.382727e+06  1.382727e+06  1.382727e+06  1.382727e+06   1.382727e+06   
mean   4.682276e+02  3.036283e+03  7.544279e+05  5.373317e-01   2.390497e+00   
std    2.735797e+02  1.770599e+03  4.356190e+05  4.986046e-01   1.066479e+00   
min    0.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00   1.000000e+00   
25%    2.330000e+02  1.515000e+03  3.772935e+05  0.000000e+00   1.000000e+00   
50%    4.680000e+02  3.009000e+03  7.544530e+05  1.000000e+00   2.000000e+00   
75%    7.030000e+02  4.565000e+03  1.131772e+06  1.000000e+00   3.000000e+00   
max    9.470000e+02  6.147000e+03  1.508916e+06  1.000000e+00   4.000000e+00   

        AnswerValue  
count  1.382727e+06  
mean   2.437992e+00  
std    1.067624e+00

In [6]:
# validation set
val = pd.read_csv('data/test_data/quality_response_remapped_public.csv')
val.head(1)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,left,right,T1_ALR,T2_CL,T3_GF,T4_MQ,T5_NS
0,0,0,909,318,2,2,2,1.0,2


In [7]:
# test set
test = pd.read_csv('data/test_data/quality_response_remapped_private.csv')
test.head(1)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,left,right,T1_ALR,T2_CL,T3_GF,T4_MQ,T5_NS
0,15,15,145,762,2,2,2,2.0,2


In [8]:
# calc good question from val&test set
from collections import defaultdict
labels = pd.concat([val, test], ignore_index=True, axis=0)
qid = defaultdict(int)
def f(row):
    qids = [row['left'], row['right']]
    qids[row['T1_ALR']-1]
    qid[qids[row['T1_ALR']-1]] += 1
    qid[qids[row['T2_CL']-1]] += 1
    qid[qids[row['T3_GF']-1]] += 1
    qid[qids[row['T4_MQ']-1]] += 1
    qid[qids[row['T5_NS']-1]] += 1
labels = labels.dropna()
labels = labels.astype(dtype=np.int32)
labels.apply(f, axis=1)
qid

defaultdict(int,
            {318: 4,
             909: 1,
             609: 5,
             132: 2,
             271: 3,
             127: 4,
             704: 1,
             16: 2,
             586: 3,
             865: 4,
             56: 1,
             233: 4,
             478: 1,
             620: 5,
             246: 4,
             733: 1,
             668: 5,
             4: 3,
             691: 2,
             830: 4,
             403: 1,
             27: 3,
             404: 2,
             202: 1,
             761: 4,
             555: 5,
             876: 4,
             527: 1,
             558: 5,
             316: 1,
             121: 4,
             780: 4,
             363: 1,
             118: 5,
             676: 4,
             153: 6,
             129: 5,
             721: 3,
             327: 2,
             139: 2,
             158: 3,
             547: 5,
             762: 5,
             307: 4,
             208: 1,
             278: 5,
             100: 1,
 

In [9]:
# select best question
max_ = max(qid.values())
best_qid = None
for item in qid.items():
    if item[1] == max_:
        best_qid = item[0]

In [10]:
# calc percent of correct for each question
corr_rate = pd.DataFrame({'QuestionId':[], 'response_num':[], 'correct_rate':[]})
for group_label, group_df in train.groupby('QuestionId'):
    row = pd.DataFrame({'QuestionId':[group_label], 'response_num':[len(group_df)], 'correct_rate':[group_df['IsCorrect'].mean()]})
    corr_rate = pd.concat([corr_rate, row], ignore_index=True)
corr_rate = corr_rate.sort_values(by=['correct_rate'])
corr_rate

Unnamed: 0,QuestionId,response_num,correct_rate
944,944.0,189.0,0.142857
931,931.0,1702.0,0.160400
155,155.0,1675.0,0.164776
425,425.0,78.0,0.179487
718,718.0,658.0,0.183891
...,...,...,...
422,422.0,144.0,0.895833
924,924.0,2379.0,0.923497
841,841.0,4.0,1.000000
660,660.0,6.0,1.000000


In [11]:
from torch.nn.modules.activation import ReLU
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import random
# GPU 사용 가능하면 사용하고 아니면 CPU 사용
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# for reproducibility
random.seed(777)
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)
# define network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)
        )

    def forward(self, x):
        x = self.conv1(x) # conv
        x = torch.mean(x.view(x.size(0), -1), dim=1) # GAP
        return x

net = Net().to(device)

In [12]:
import cv2
for gl, gdf in corr_rate.groupby('QuestionId'):
    # imread
    q_img = cv2.imread('data/images/' + str(int(gl)) + '.jpg')
    q_img = cv2.cvtColor(q_img, cv2.COLOR_BGR2RGB)
    q_img = np.transpose(q_img, (2,0,1)).astype(np.float32)
    # convert img to vector using cnn
    t = torch.from_numpy(q_img).to(device)
    vector = net(t)
    vector = vector.to('cpu').detach().numpy().astype(np.float32)
    corr_rate.loc[corr_rate['QuestionId']==gl, 'img'] = vector.mean()

In [63]:
corr_rate.to_csv('corr_rate.csv')
corr_rate

Unnamed: 0,QuestionId,response_num,correct_rate,img
0,944.0,189.0,0.142857,65.142624
1,931.0,1702.0,0.160400,64.713997
2,155.0,1675.0,0.164776,65.219406
3,425.0,78.0,0.179487,65.588799
4,718.0,658.0,0.183891,65.472786
...,...,...,...,...
943,422.0,144.0,0.895833,65.056740
944,924.0,2379.0,0.923497,65.048607
945,841.0,4.0,1.000000,65.226456
946,660.0,6.0,1.000000,65.319534


In [90]:
#
corr_rate = pd.read_csv('corr_rate.csv')
corr_rate = corr_rate.drop(columns=['Unnamed: 0'])
corr_rate

Unnamed: 0,QuestionId,response_num,correct_rate,img
0,944.0,189.0,0.142857,65.142624
1,931.0,1702.0,0.160400,64.713997
2,155.0,1675.0,0.164776,65.219406
3,425.0,78.0,0.179487,65.588799
4,718.0,658.0,0.183891,65.472786
...,...,...,...,...
943,422.0,144.0,0.895833,65.056740
944,924.0,2379.0,0.923497,65.048607
945,841.0,4.0,1.000000,65.226456
946,660.0,6.0,1.000000,65.319534


In [91]:
best_q = corr_rate[corr_rate['QuestionId'] == best_qid]
best_q

Unnamed: 0,QuestionId,response_num,correct_rate,img
878,153.0,4.0,0.75,65.096687


In [92]:
def f(row):
    return abs(best_q['img'] - row['img'])
# corr_rate = corr_rate.astype(dtype=np.int32)
distances = corr_rate.apply(f, axis=1)
corr_rate = pd.concat([corr_rate, distances], axis=1)
corr_rate = corr_rate.rename(columns={878:'distance'})
corr_rate = corr_rate.sort_values(by=['distance'])
corr_rate = corr_rate.reset_index()
corr_rate

Unnamed: 0,index,QuestionId,response_num,correct_rate,img,distance
0,878,153.0,4.0,0.750000,65.096687,0.000000
1,341,55.0,1487.0,0.434432,65.096703,0.000015
2,732,585.0,2546.0,0.632364,65.097519,0.000832
3,15,752.0,104.0,0.230769,65.097595,0.000908
4,202,504.0,115.0,0.373913,65.095695,0.000992
...,...,...,...,...,...,...
943,540,898.0,2162.0,0.526364,62.184769,2.911919
944,671,781.0,2597.0,0.594147,62.183212,2.913475
945,572,389.0,2504.0,0.541134,62.181999,2.914688
946,303,895.0,167.0,0.419162,62.050354,3.046333
