<h3 style = "font-family: Cambria;"> Data </h3>

In [1]:
import pandas, numpy, time

# Read Data.csv with the "Ticker" column as the index and discard every row
# that contains a missing value.
data = pandas.read_csv("Data.csv", index_col = "Ticker").dropna()

print(data)

           Inventory  Working Capital  Total Assets  Current Assets  \
Ticker                                                                
A       8.626667e+08     1.985333e+09  1.028800e+10    3.664000e+09   
AA      1.927000e+09     1.936000e+09  1.488033e+10    4.932000e+09   
AAL     1.896000e+09    -4.457000e+09  6.439700e+10    1.456667e+10   
AAN     7.530493e+08     5.378673e+08  1.551062e+09    8.538473e+08   
AAON    1.371427e+08     1.653597e+08  6.376970e+08    2.624823e+08   
...              ...              ...           ...             ...   
ZIM     1.206457e+08     1.329239e+09  8.097114e+09    3.519343e+09   
ZTO     5.885600e+07     8.905596e+09  6.683356e+10    2.182200e+10   
ZTS     1.965333e+09     4.637667e+09  1.414467e+10    7.015667e+09   
ZWS     2.895667e+08     5.038000e+08  2.522933e+09    8.322000e+08   
ZYXI    1.095833e+07     5.373233e+07  1.073050e+08    7.175700e+07   

        Current Liabilities  Cash And Cash Equivalents    Total Debt  \
Tick

<h3 style = "font-family: Cambria;"> Normalization </h3>

In [2]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column with MinMaxScaler. fit_transform hands back a plain
# array, so the ticker index is re-attached here; the column labels are not,
# which is why the printed frame shows 0, 1, 2, ... as column names.
scaler = MinMaxScaler()

train = pandas.DataFrame(scaler.fit_transform(data), index = data.index)

print(train)

              0         1             2         3             4   \
Ticker                                                             
A       0.000102  0.489336  3.866041e-05  0.000067  2.423393e-05   
AA      0.000228  0.489335  5.594979e-05  0.000091  4.328614e-05   
AAL     0.000225  0.489124  2.423718e-04  0.000268  2.750896e-04   
AAN     0.000089  0.489289  5.767295e-06  0.000016  4.525782e-06   
AAON    0.000016  0.489276  2.328629e-06  0.000005  1.360512e-06   
...          ...       ...           ...       ...           ...   
ZIM     0.000014  0.489315  3.041209e-05  0.000065  3.163069e-05   
ZTO     0.000007  0.489564  2.515450e-04  0.000402  1.867621e-04   
ZTS     0.000233  0.489424  5.318012e-05  0.000129  3.434818e-05   
ZWS     0.000034  0.489287  9.426230e-06  0.000015  4.705409e-06   
ZYXI    0.000001  0.489273  3.317910e-07  0.000001  2.165405e-07   

                  5             6         7             8             9   \
Ticker                                 

<h3 style = "font-family: Cambria;"> Frank-Wolfe Algorithm </h3>

In [3]:
def matrix_y(mat_xtx, mat_xxz, mat_zzt, mat_y, n_iter = 100):
    """Run Frank-Wolfe style updates on ``mat_y`` and return the result.

    At every iteration the matrix

        G = 2 * (mat_xtx @ Y @ mat_zzt - mat_xxz) - log(Y)

    is evaluated, where the logarithm is taken only at strictly positive
    entries of ``Y`` and counts as 0 everywhere else. For each column the row
    with the smallest entry of ``G`` is selected, ``Y`` is scaled by
    ``1 - gamma`` and ``gamma`` is added at the selected entries, with
    ``gamma = 2 / (i + 2)`` for ``i = 0, 1, ...``.

    Because the first step uses ``gamma = 1`` the values of the starting
    matrix only influence the first gradient; after that step every column is
    a unit vector, and later steps keep each column non-negative with sum 1
    (up to rounding).

    Parameters
    ----------
    mat_xtx, mat_xxz, mat_zzt : numpy.ndarray
        Must be conformable so that ``mat_xtx @ mat_y @ mat_zzt - mat_xxz``
        has the same shape as ``mat_y``.
    mat_y : numpy.ndarray, two-dimensional
        Starting matrix. It is not modified; a new array is returned.
    n_iter : int, optional
        Number of iterations. Defaults to 100, the previously fixed count.

    Returns
    -------
    numpy.ndarray
        The updated matrix, same shape as ``mat_y``.
    """

    index = numpy.arange(mat_y.shape[1])

    for i in range(n_iter):

        gamma = 2 * ((i + 2) ** -1)

        # numpy.log(..., where = mask) without an explicit ``out`` leaves the
        # masked-out entries as uninitialised memory, which made the gradient
        # (and therefore the argmin) depend on garbage values. Writing into a
        # zero-filled buffer makes those entries contribute exactly 0.
        log_y = numpy.zeros(mat_y.shape)

        numpy.log(mat_y, out = log_y, where = mat_y > 0)

        mat_g = 2 * (numpy.dot(numpy.dot(mat_xtx, mat_y), mat_zzt) - mat_xxz) - log_y

        argmn = numpy.argmin(mat_g, axis = 0)

        # The multiplication allocates a new array, so the caller's matrix is
        # never written to by the indexed assignment below.
        mat_y = mat_y * (1 - gamma)

        mat_y[argmn,index] = mat_y[argmn,index] + gamma

    return mat_y

In [4]:
def matrix_z(mat_mtm, mat_mtx, mat_z):
    """Refine ``mat_z`` with 100 Frank-Wolfe style steps.

    Each step evaluates ``mat_mtm @ mat_z - mat_mtx``, finds for every column
    the row holding the smallest value, scales the whole matrix by
    ``1 - step`` and adds ``step`` at the selected entries. The step size is
    ``2 / d`` for ``d = 2, 3, ..., 101``.

    The array passed in as ``mat_z`` is left untouched; the updated matrix is
    returned as a new array of the same shape.
    """

    columns = numpy.arange(mat_z.shape[1])

    for denominator in range(2, 102):

        step = 2 * (denominator ** -1)

        gradient = numpy.dot(mat_mtm, mat_z) - mat_mtx

        rows = numpy.argmin(gradient, axis = 0)

        # Scaling produces a fresh array, so the in-place bump below never
        # reaches the caller's matrix.
        shrunk = (1 - step) * mat_z

        shrunk[rows, columns] += step

        mat_z = shrunk

    return mat_z

<h3 style = "font-family: Cambria;"> Archetypal Analysis for K-means Clustering </h3>

In [5]:
def archetypal_analysis(mat_x, k, max_iter = 300, tol = 10 ** -5, rng = None):
    """Alternate between ``matrix_z`` and ``matrix_y`` to factor ``mat_x``.

    The loop repeatedly (1) updates ``mat_z`` for the current ``mat_m``,
    (2) updates ``mat_y`` for the current ``mat_z`` and (3) rebuilds
    ``mat_m = mat_x @ mat_y``. It stops once the Frobenius norm of
    ``mat_x - mat_m @ mat_z`` changes by less than ``tol`` between two
    consecutive iterations, or after ``max_iter`` iterations. A final
    ``matrix_z`` pass is then run against the last ``mat_m``.

    Parameters
    ----------
    mat_x : numpy.ndarray of shape (m, n)
        Input matrix; the initial ``mat_m`` is built from ``k`` of its
        columns, drawn without replacement.
    k : int
        Number of columns of ``mat_m`` (and rows of ``mat_z``).
    max_iter : int, optional
        Upper bound on the number of alternations (default 300).
    tol : float, optional
        Threshold on the change of the residual norm (default 1e-5).
    rng : optional
        Object providing ``choice(a, size, replace = ...)``, for example a
        ``numpy.random.Generator`` or ``numpy.random.RandomState``. When
        omitted, the global ``numpy.random`` state is used as before, so the
        result differs from call to call unless that state is seeded.

    Returns
    -------
    tuple
        ``(mat_m, mat_y, mat_z)`` with shapes ``(m, k)``, ``(n, k)`` and
        ``(k, n)``.
    """

    sampler = numpy.random if rng is None else rng

    mat_m = mat_x[:,sampler.choice(mat_x.shape[1], k, replace = False)] # m, n = mat_x.shape

    mat_y = numpy.zeros((mat_x.shape[1], k)); mat_y[0] = 1

    mat_z = numpy.zeros((k, mat_x.shape[1])); mat_z[0] = 1

    error = numpy.inf

    # X^T X never changes, so it is formed once outside the loop.
    m_xtx = numpy.dot(numpy.transpose(mat_x), mat_x)

    for i in range(max_iter):

        mat_mtm, mat_mtx = numpy.dot(numpy.transpose(mat_m), mat_m), numpy.dot(numpy.transpose(mat_m), mat_x)

        mat_z = matrix_z(mat_mtm, mat_mtx, mat_z)

        mat_zzt, mat_xxz = numpy.dot(mat_z, numpy.transpose(mat_z)), numpy.dot(m_xtx, numpy.transpose(mat_z))

        mat_y = matrix_y(m_xtx, mat_xxz, mat_zzt, mat_y)

        mat_m = numpy.dot(mat_x, mat_y)

        # Evaluate the residual once per iteration and reuse it for both the
        # stopping test and the stored error.
        residual = numpy.linalg.norm(mat_x - numpy.dot(mat_m, mat_z))

        if numpy.abs(error - residual) < tol:

            break

        error = residual

    # mat_m was rebuilt after the last mat_z update, so refresh mat_z once
    # more to make the returned pair consistent.
    mat_mtm, mat_mtx = numpy.dot(numpy.transpose(mat_m), mat_m), numpy.dot(numpy.transpose(mat_m), mat_x)

    mat_z = matrix_z(mat_mtm, mat_mtx, mat_z)

    return mat_m, mat_y, mat_z

<h3 style = "font-family: Cambria;"> Result </h3>

In [6]:
# Run the randomly initialised archetypal analysis 500 times with 7 archetypes
# and score each run by the size-weighted sum of per-cluster variances.
# perf_counter is used because it is meant for measuring elapsed intervals.
length, result = time.perf_counter(), {}

# The transposed feature matrix is the same for every run, so build it once
# instead of once per iteration.
train_t = numpy.array(numpy.transpose(train))

for i in range(500):

    temps = archetypal_analysis(train_t, 7)[2]

    # Hard assignment: each column goes to the row with the largest weight.
    index = numpy.argmax(temps, axis = 0)

    dataframes = pandas.concat([train, pandas.DataFrame(index, columns = ["Clusters"], index = train.index)], axis = 1)

    dataframes = dataframes.groupby("Clusters").var(ddof = 0)

    # numpy.unique returns the labels in ascending order, which matches the
    # sorted group order of the groupby above, so counts and variances align.
    count = numpy.unique(index, return_counts = True)[1]

    result[i] = [numpy.sum(count @ dataframes), index]

# Elapsed time of the whole loop, in seconds.
length = time.perf_counter() - length

In [7]:
# Pick the run with the smallest score. The position returned by argmin is
# used directly as the dictionary key, which relies on the keys of ``result``
# being 0, 1, 2, ... in insertion order.
scores = [entry[0] for entry in result.values()]

index = numpy.argmin(scores)

colum = ["Clusters"]

frame = pandas.DataFrame(result[index][1], columns = colum, index = train.index)

print(frame)

        Clusters
Ticker          
A              5
AA             4
AAL            4
AAN            4
AAON           4
...          ...
ZIM            0
ZTO            4
ZTS            5
ZWS            5
ZYXI           4

[1152 rows x 1 columns]
