# Feature Extraction - t-SNE 

#### 참고 - https://woosikyang.github.io/first-post.html

1. 적절한 이웃 반경 설정하기 - 원하는 수준의 엔트로피에 맞춰서
- $x_i$ 는 원래 차원에서 가우시안 분포에 따라 분포한다 했을 때, 표준편차의 크기에 따라 $x_j$ 를 유의미한 이웃으로 고려할 수도 아닐 수도 있음
  
  - <=> radius(=표준편차)에 $p_{j|i}$ 값과 엔트로피는 비례한다
- 따라서 우리가 원하는 엔트로피 정도에 따라 반경(radius)를 계산한다.


2. 저차원 표현에 대한 비용 함수 - Kullback Leiber divergence - 설정하여, gradient 값 구하기 

3. $y_i$에 대한 gradient 값을 기반으로 gradient descent 방법을 통해 y값 근접시키기. 

In [2]:
# 데이터 사용 및 라이브러리 설치 

import numpy as np
import pandas as pd
import random
import heapq

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression 
from collections import defaultdict

boston = load_boston()
X = boston.data 
y = boston.target
model = LinearRegression()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

### 1. 적절한 이웃 반경 설정하기 

**구현해야하는 것** 
- $p_{j|i}$ : 원래 차원(D)에서 객체 i가 j를 이웃으로 선택할 확률
> $\frac{e^{-\frac{||x_i - x_j||^2}{2 \sigma_i^2}}}{\sum_{k \neq i} e^{-\frac{||x_i - x_j||^2}{2 \sigma_i^2}}}$

- $p_{ij}$ : $\frac{p_{i|j} + p_{j|i}}{2n}$

- $q_{j|i}$ : 축소된 차원(d)에서 객체 i가 j를 이웃으로 선택할 확율 
> $\frac{(1+ ||y_i - y_j||^2)^{-1}}{\sum_{k \neq i}(1+ ||y_i - y_j||^2)^{-1}}$

- $h(P_i)$ : 엔트로피. 
> $\sum p_{j|i}\log_2 p_{i|j}$ 

- $per(P_i)$ : 복잡도. 
> $2^{h(P_i)}$ 

- 적정한 $\sigma_i$ 값을 찾는 함수 


**필요한 것**
- X : 입력 데이터 
- y : 랜덤한 데이터
- n : 샘플 개수 

**함수의 형태**
- def __init__(self, X,s)  

- def softmax(self, vector) :=>  softmax 값을 가진 list

- def find_sigma(self, matrix, target_per) : => 적정한 sigma list
> 필요한 것 : binary_search 
- def binary_search(self,fn, target, tol=1e-10, max_iter=10000, lower=1e-20, upper=1000.):
> 필요한 것 : fn = per 함수

- def per(self, vector, s) : =>  $per(P_i)$
> 필요한 것 : p 함수 

- def p(self, vector, s) : => 한 행의 $p_{j|i}$

- def p_matrix(self) : =>  P 매트릭스 

- def new_p(self) : => $p_{ij}$ => $p_{ij}$ 로 조정된 매트릭스 

- def q(self) : => $q_{j|i}$ 매트릭스 




In [3]:
a = [[1,2,3], [4,5,6], [7,8,9]]
np.sum(a, 1)


array([ 6, 15, 24])

In [19]:
class tSNE() : 
    def __init__(self, X, target_p): 
        self.X = X
        self.n = np.shape(X)[0]
        self.y = np.random.rand(self.n, self.n)
        self.target_p = target_p # 목표 복잡도 
        self.s = self.find_sigma(self.X, self.target_p)
        
    def softmax(self, vector) :
        vector = np.array(vector)
        return np.exp(vector) / np.exp(vector).sum() 
    
    def p(self, vector, s) : # => 한 벡터만 값을 반환해야 하나?  
        vector = [(np.square(vector -self.X[i])) for i in range(self.n)] 
        vector = vector / (-2*s**2) 
        s_vector = self.softmax(vector)
        return np.array(s_vector) 
    
    def p_matrix(self) : 
        p_matrix = [] 
        for i in range(self.n) : 
            vector = self.p(self.X[i], self.s[i])
            p_matrix.append(vector)
        return np.array(p_matrix)
        
    def q_matrix(self) : 
        q_matrix = [] 
        for i in range(self.n) : 
            vector = np.array([-np.sum(np.square(self.y[i] - self.y[j])) for j in range(self.n)])
            s_vector = self.softmax(vector)
            q_matrix.append(s_vector)
        return np.array(q_matrix)
    
    def new_p_matrix(self) : 
        matrix = self.p_matrix() 
        return (matrix + matrix.T)/2
    
    def per(self, vector, s) : # vector에는 추후 p,q matrix의 각 행을 넣을 것 
        vector = self.p(vector, s)
        entropy = np.sum([vector[i]*np.log2(vector)[i] for i in range(vector.shape[1])])
        return 2**entropy 
    
    def find_sigma(self, matrix, target_per) : # 적정한 sigma list 반환 
        sigmas = []
        for i in range(matrix.shape[0]) : 
            fn = lambda s : self.per(matrix[i:i+1, :],np.array(s))
            correct_sigma = binary_search(fn, target_per) 
            sigmas.append(correct_sigma)
        return np.array(sigmas) 
        
    def binary_search(self, fn, target, tol=1e-10, max_iter=10000, lower=1e-20, upper=1000.):
        for i in range(max_iter):
            guess = (lower + upper) / 2.
            val = fn(guess)
            if val > target:
                upper = guess
            else:
                lower = guess
            if np.abs(val - target) <= tol:
                break
    
    
        
    

In [20]:
test = tSNE(X,20) 

test.p_matrix()


KeyboardInterrupt: 

### 2. 저차원 표현에 대한 비용 함수 - Kullback Leiber divergence - 설정하여, gradient 값 구하기 

### 3. $y_i$에 대한 gradient 값을 기반으로 gradient descent 방법을 통해 y값 근접시키기. 

**구현해야하는 것** 
- Kullback Leiber 함수 
- $\frac {\delta C}{\delta y_i}$ 

**필요한 것**
- l_rate : 학습율 
- p matrix 
- q matrix 
- y 값 

**함수의 형태**
- def kullback(self, p_matrix, q_matrix) : => 단일 값 반환  

- def gradient(self) : => (n x 1) 벡터 반환

- def tsne_goal(self) : => y matrix 반환 
> 비용함수를 


In [24]:
class tSNE() : 
    def __init__(self, X, target_p, d, learing_rate): 
        self.X = X
        self.n = np.shape(X)[0]
        self.y = np.random.rand(self.n, self.d)
        
        self.l = learning_rate
        self.target_p = target_p # 목표 복잡도 
        self.s = self.find_sigma(self.X, self.target_p)
        
    def softmax(self, vector) :
        vector = np.array(vector)
        return np.exp(vector) / np.exp(vector).sum() 
    
    def p(self, vector, s) : # => 한 벡터만 값을 반환해야 하나?  
        vector = [(np.square(vector -self.X[i])) for i in range(self.n)] 
        vector = vector / (-2*s**2) 
        s_vector = self.softmax(vector)
        return np.array(s_vector) 
    
    def p_matrix(self) : 
        p_matrix = [] 
        for i in range(self.n) : 
            vector = self.p(self.X[i], self.s[i])
            p_matrix.append(vector)
        return np.array(p_matrix)
        
    def q_matrix(self) : 
        q_matrix = [] 
        for i in range(self.n) : 
            vector = np.array([-np.sum(np.square(self.y[i] - self.y[j])) for j in range(self.n)])
            s_vector = self.softmax(vector)
            q_matrix.append(s_vector)
        return np.array(q_matrix)
    
    def new_p_matrix(self) : 
        matrix = self.p_matrix() 
        return (matrix + matrix.T)/2
    
    def per(self, vector, s) : # vector에는 추후 p,q matrix의 각 행을 넣을 것 
        vector = self.p(vector, s)
        entropy = np.sum([vector[i]*np.log2(vector)[i] for i in range(vector.shape[1])])
        return 2**entropy 
    
    def find_sigma(self, matrix, target_per) : # 적정한 sigma list 반환 
        sigmas = []
        for i in range(matrix.shape[0]) : 
            fn = lambda s : self.per(matrix[i:i+1, :],np.array(s))
            correct_sigma = binary_search(fn, target_per) 
            sigmas.append(correct_sigma)
        return np.array(sigmas) 
        
    def binary_search(self, fn, target, tol=1e-10, max_iter=10000, lower=1e-20, upper=1000.):
        for i in range(max_iter):
            guess = (lower + upper) / 2.
            val = fn(guess)
            if val > target:
                upper = guess
            else:
                lower = guess
            if np.abs(val - target) <= tol:
                break
    
    def kullback(self, new_p_matrix, q_matrix) : 
        return np.sum(new_p_matrix * np.log(new_p_matrix / q_matrix)) 
    
    def gradient(self, new_p_matrix, q_matrix) : 
        gradient_lst = [] 
        for i in range(self.n ) : 
            gradient = np.sum(4*[np.dot((self.y[i]-self.y[j]), (new_p_matrix[i,j] - q_matrix[i,j])) / (1 + np.square(self.y[i] -self.y[j])) for j in range(len(new_p_matrix))]) 
            gradient_lst.append(gradient)
        return gradient_lst
    
    def tsne_goal(self, num_iter) : 
        new_p_matrix = self.new_p_matrix()
        
        for i in range(num_iter) : 
            q_matrix = self.q_matrix() 
            gradient_vector = self.gradient(new_p_matrix, q_matrix)
            self.y = self.y + self.l*gradient_vector.T
        return self.y
                              
            

In [23]:
a = np.array([1,2,3])
b= np.array([2,3,4]) 

np.dot(a,b)

20

In [22]:
a = np.array(a)
a.sum()

45