# Práctica modulo 3

## Preparación de ambiente

### Carga de módulos

In [1]:
!pip install geopandas
!pip install keplergl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Data Wrangling
import pandas as pd
import geopandas as gpd

# Data Visualization
import cufflinks as cf
from keplergl import KeplerGl
from plotly.figure_factory import create_dendrogram

# Data Preprocessing
from sklearn.manifold import MDS
from sklearn.preprocessing import MinMaxScaler

# Unsupervised Learning
import scipy.cluster.hierarchy as sch
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import AgglomerativeClustering

# Environment setup
cf.go_offline()
pd.set_option('display.float_format', lambda x: "{:,.2f}".format(x))
pd.set_option('display.max_columns', None)

### Funciones relevantes

In [3]:

def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''))

In [4]:
def pivot_categories(df, cluster_column, categories):
    aux = df.copy()
    aux["dum"] = 1
    for i, col in enumerate(categories):
        agg_data = aux[[cluster_column, col, "dum"]].pivot_table(index = cluster_column, columns=col, aggfunc="sum", fill_value=0)
        agg_data.columns = [f"{col}_{x}" for x in agg_data.columns.droplevel(0)]
        total = agg_data.sum(axis=1)
        for col in agg_data:
            agg_data[col] /= total
        if i == 0:
            final = agg_data.copy()
        else:
            final = final.merge(agg_data, left_index = True, right_index = True, how = "inner")
    return final

## Datos

### Lectura de sets

In [5]:
account = pd.read_csv("/content/account.asc",delimiter=';').drop(['date'],axis=True)#
card = pd.read_csv("/content/card.asc",delimiter=';').drop(['issued'],axis=True)#
client = pd.read_csv("/content/client.asc",delimiter=';')
disp = pd.read_csv("/content/disp.asc",delimiter=';')#

#Dejamos las variables que creemos importantes
district = pd.read_csv("/content/district.asc",delimiter=';')[['A1','A11']]#
loan = pd.read_csv("/content/loan.asc",delimiter=';').rename(columns={'amount':'monto_credito','date':'fecha_credito','duration':'duracion_credito'})
order = pd.read_csv("/content/order.asc",delimiter=';').drop(['bank_to'],axis=True)#

#Quitamos los valores que creemos que no aportan informacion
trans = pd.read_csv("/content/trans.asc",delimiter=';',low_memory=False).drop(['bank','account'],axis=True).rename(columns={'date':'fecha_transaccion'})#

### Limpieza de los datos

In [6]:
#Obtenemos las columnas las cuales sí cumplen con tener el 80% de los datos
Nulos1 = (1 - account.isnull().sum()/account.shape[0] > 0.8).values
Nulos2 = (1 - card.isnull().sum()/card.shape[0] > 0.8).values
Nulos3 = (1 - client.isnull().sum()/client.shape[0] > 0.8).values
Nulos4 = (1 - disp.isnull().sum()/disp.shape[0] > 0.8).values
Nulos5 = (1 - district.isnull().sum()/district.shape[0] > 0.8).values
Nulos6 = (1 - loan.isnull().sum()/loan.shape[0] > 0.8).values
Nulos7 = (1 - order.isnull().sum()/order.shape[0] > 0.8).values
Nulos8 = (1 - trans.isnull().sum()/trans.shape[0] > 0.8).values

#Obtenemos las filas las cuales sí cumplen con tener el 80% de los datos
Nulos1_ = (1 - account.isnull().sum(axis=1)/account.shape[1] > 0.8).values
Nulos2_ = (1 - card.isnull().sum(axis=1)/card.shape[1] > 0.8).values
Nulos3_ = (1 - client.isnull().sum(axis=1)/client.shape[1] > 0.8).values
Nulos4_ = (1 - disp.isnull().sum(axis=1)/disp.shape[1] > 0.8).values
Nulos5_ = (1 - district.isnull().sum(axis=1)/district.shape[1] > 0.8).values
Nulos6_ = (1 - loan.isnull().sum(axis=1)/loan.shape[1] > 0.8).values
Nulos7_ = (1 - order.isnull().sum(axis=1)/order.shape[1] > 0.8).values
Nulos8_ = (1 - trans.isnull().sum(axis=1)/trans.shape[1] > 0.8).values

account = account.loc[Nulos1_,Nulos1]
card = card.loc[Nulos2_,Nulos2]
client = client.loc[Nulos3_,Nulos3]
disp = disp.loc[Nulos4_,Nulos4]
district = district.loc[Nulos5_,Nulos5]
loan = loan.loc[Nulos6_,Nulos6]
order = order.loc[Nulos7_,Nulos7]
trans = trans.loc[Nulos8_,Nulos8]

### Union de las tablas

In [7]:
#Observamos si la tabla de t5 posee valores nulos
print('Valores nulos de Account')
print(account.isnull().sum())
print('_'*64)
print('Valores nulos de Card')
print(card.isnull().sum())
print('_'*64)
print('Valores nulos de Client')
print(client.isnull().sum())
print('_'*64)
print('Valores nulos de Disp')
print(disp.isnull().sum())
print('_'*64)

print('Valores nulos de District')
print(district.isnull().sum())
print('_'*64)
print('Valores nulos de Loan')
print(loan.isnull().sum())
print('_'*64)
print('Valores nulos de Order')
print(order.isnull().sum())
print('_'*64)
print('Valores nulos de Trans')
print(trans.isnull().sum())
print('_'*64)

Valores nulos de Account
account_id     0
district_id    0
frequency      0
dtype: int64
________________________________________________________________
Valores nulos de Card
card_id    0
disp_id    0
type       0
dtype: int64
________________________________________________________________
Valores nulos de Client
client_id       0
birth_number    0
district_id     0
dtype: int64
________________________________________________________________
Valores nulos de Disp
disp_id       0
client_id     0
account_id    0
type          0
dtype: int64
________________________________________________________________
Valores nulos de District
A1     0
A11    0
dtype: int64
________________________________________________________________
Valores nulos de Loan
loan_id             0
account_id          0
fecha_credito       0
monto_credito       0
duracion_credito    0
payments            0
status              0
dtype: int64
________________________________________________________________
Valores nul

In [8]:
#Rellenamos con las columnas con valores faltantes de trans
values = {'operation':trans['operation'].mode()[0]}

#Revisamos que se hayan llenado el DataFrame
trans.fillna(value=values,inplace=True)
print(trans.isnull().sum())

trans_id             0
account_id           0
fecha_transaccion    0
type                 0
operation            0
amount               0
balance              0
dtype: int64


In [9]:
loan

Unnamed: 0,loan_id,account_id,fecha_credito,monto_credito,duracion_credito,payments,status
0,5314,1787,930705,96396,12,8033.00,B
1,5316,1801,930711,165960,36,4610.00,A
2,6863,9188,930728,127080,60,2118.00,A
3,5325,1843,930803,105804,36,2939.00,A
4,7240,11013,930906,274740,60,4579.00,A
...,...,...,...,...,...,...,...
677,4989,105,981205,352704,48,7348.00,C
678,5221,1284,981205,52512,12,4376.00,C
679,6402,6922,981206,139488,24,5812.00,C
680,5346,1928,981206,55632,24,2318.00,C


In [10]:
account

Unnamed: 0,account_id,district_id,frequency
0,576,55,POPLATEK MESICNE
1,3818,74,POPLATEK MESICNE
2,704,55,POPLATEK MESICNE
3,2378,16,POPLATEK MESICNE
4,2632,24,POPLATEK MESICNE
...,...,...,...
4495,124,55,POPLATEK MESICNE
4496,3958,59,POPLATEK MESICNE
4497,777,30,POPLATEK MESICNE
4498,1573,63,POPLATEK MESICNE


In [11]:
card

Unnamed: 0,card_id,disp_id,type
0,1005,9285,classic
1,104,588,classic
2,747,4915,classic
3,70,439,classic
4,577,3687,classic
...,...,...,...
887,125,694,gold
888,674,4360,classic
889,322,2063,classic
890,685,4467,classic


In [12]:
disp

Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT
3,4,4,3,OWNER
4,5,5,3,DISPONENT
...,...,...,...,...
5364,13647,13955,11349,OWNER
5365,13648,13956,11349,DISPONENT
5366,13660,13968,11359,OWNER
5367,13663,13971,11362,OWNER


In [13]:
order

Unnamed: 0,order_id,account_id,account_to,amount,k_symbol
0,29401,1,87144583,2452.00,SIPO
1,29402,2,89597016,3372.70,UVER
2,29403,2,13943797,7266.00,SIPO
3,29404,3,83084338,1135.00,SIPO
4,29405,3,24485939,327.00,
...,...,...,...,...,...
6466,46334,11362,70641225,4780.00,SIPO
6467,46335,11362,78507822,56.00,
6468,46336,11362,40799850,330.00,POJISTNE
6469,46337,11362,20009470,129.00,


In [14]:
trans

Unnamed: 0,trans_id,account_id,fecha_transaccion,type,operation,amount,balance
0,695247,2378,930101,PRIJEM,VKLAD,700.00,700.00
1,171812,576,930101,PRIJEM,VKLAD,900.00,900.00
2,207264,704,930101,PRIJEM,VKLAD,1000.00,1000.00
3,1117247,3818,930101,PRIJEM,VKLAD,600.00,600.00
4,579373,1972,930102,PRIJEM,VKLAD,400.00,400.00
...,...,...,...,...,...,...,...
1056315,3626622,2906,981231,PRIJEM,VYBER,62.30,13729.40
1056316,3627616,2935,981231,PRIJEM,VYBER,81.30,19544.90
1056317,3625403,2869,981231,PRIJEM,VYBER,60.20,14638.20
1056318,3626683,2907,981231,PRIJEM,VYBER,107.50,23453.00


In [15]:
district

Unnamed: 0,A1,A11
0,1,12541
1,2,8507
2,3,8980
3,4,9753
4,5,9307
...,...,...
72,73,8746
73,74,10673
74,75,8819
75,76,8369


In [16]:
t1 = account.merge(loan,left_on='account_id',right_on='account_id').drop(['fecha_credito','monto_credito','duracion_credito'],axis=True)
t2 = card.merge(disp,left_on='disp_id',right_on='disp_id')
t3 = t1.merge(trans,left_on='account_id',right_on='account_id').drop(['fecha_transaccion'],axis=True)
t4 = t3.merge(t2,left_on='account_id',right_on='account_id')
t5 = t4.merge(order,left_on='account_id',right_on='account_id')
t6 = t5.merge(district,left_on='district_id',right_on='A1')
t6.drop(['trans_id'],axis=True)
t6

Unnamed: 0,account_id,district_id,frequency,loan_id,payments,status,trans_id,type,operation,amount_x,balance,card_id,disp_id,type_x,client_id,type_y,order_id,account_to,amount_y,k_symbol,A1,A11
0,5891,54,POPLATEK MESICNE,6202,5432.00,A,1736607,PRIJEM,VKLAD,900.00,900.00,874,7127,gold,7127,OWNER,38118,95617645,5432.30,UVER,54,9897
1,5891,54,POPLATEK MESICNE,6202,5432.00,A,1736609,PRIJEM,PREVOD Z UCTU,32594.00,33494.00,874,7127,gold,7127,OWNER,38118,95617645,5432.30,UVER,54,9897
2,5891,54,POPLATEK MESICNE,6202,5432.00,A,1736950,VYDAJ,VYBER,4500.00,28994.00,874,7127,gold,7127,OWNER,38118,95617645,5432.30,UVER,54,9897
3,5891,54,POPLATEK MESICNE,6202,5432.00,A,3673340,PRIJEM,VYBER,21.60,29015.60,874,7127,gold,7127,OWNER,38118,95617645,5432.30,UVER,54,9897
4,5891,54,POPLATEK MESICNE,6202,5432.00,A,1736610,PRIJEM,PREVOD Z UCTU,32594.00,61609.60,874,7127,gold,7127,OWNER,38118,95617645,5432.30,UVER,54,9897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104899,10243,44,POPLATEK MESICNE,7091,3151.00,C,3088187,PRIJEM,VKLAD,33098.00,119158.40,1165,12291,classic,12599,OWNER,44578,33131796,3151.30,UVER,44,8254
104900,10243,44,POPLATEK MESICNE,7091,3151.00,C,3088310,VYDAJ,VYBER,34800.00,84358.40,1165,12291,classic,12599,OWNER,44578,33131796,3151.30,UVER,44,8254
104901,10243,44,POPLATEK MESICNE,7091,3151.00,C,3088311,VYDAJ,VYBER,34900.00,49458.40,1165,12291,classic,12599,OWNER,44578,33131796,3151.30,UVER,44,8254
104902,10243,44,POPLATEK MESICNE,7091,3151.00,C,3088241,VYDAJ,VYBER KARTOU,3900.00,45558.40,1165,12291,classic,12599,OWNER,44578,33131796,3151.30,UVER,44,8254


In [17]:
#Quitamos los ids que no aportan
t6.drop(['card_id','disp_id','district_id','loan_id','account_id'],axis=1,inplace=True)

#Quitamos las columnas que solo poseen un valor en la Data completa
col_1_valor = []
for j in t6.columns:
    if t6[j].nunique() == 1:
        col_1_valor.append(j)
t6.drop(col_1_valor,axis=1,inplace=True)
t6

Unnamed: 0,frequency,payments,status,trans_id,type,operation,amount_x,balance,type_x,client_id,order_id,account_to,amount_y,k_symbol,A1,A11
0,POPLATEK MESICNE,5432.00,A,1736607,PRIJEM,VKLAD,900.00,900.00,gold,7127,38118,95617645,5432.30,UVER,54,9897
1,POPLATEK MESICNE,5432.00,A,1736609,PRIJEM,PREVOD Z UCTU,32594.00,33494.00,gold,7127,38118,95617645,5432.30,UVER,54,9897
2,POPLATEK MESICNE,5432.00,A,1736950,VYDAJ,VYBER,4500.00,28994.00,gold,7127,38118,95617645,5432.30,UVER,54,9897
3,POPLATEK MESICNE,5432.00,A,3673340,PRIJEM,VYBER,21.60,29015.60,gold,7127,38118,95617645,5432.30,UVER,54,9897
4,POPLATEK MESICNE,5432.00,A,1736610,PRIJEM,PREVOD Z UCTU,32594.00,61609.60,gold,7127,38118,95617645,5432.30,UVER,54,9897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104899,POPLATEK MESICNE,3151.00,C,3088187,PRIJEM,VKLAD,33098.00,119158.40,classic,12599,44578,33131796,3151.30,UVER,44,8254
104900,POPLATEK MESICNE,3151.00,C,3088310,VYDAJ,VYBER,34800.00,84358.40,classic,12599,44578,33131796,3151.30,UVER,44,8254
104901,POPLATEK MESICNE,3151.00,C,3088311,VYDAJ,VYBER,34900.00,49458.40,classic,12599,44578,33131796,3151.30,UVER,44,8254
104902,POPLATEK MESICNE,3151.00,C,3088241,VYDAJ,VYBER KARTOU,3900.00,45558.40,classic,12599,44578,33131796,3151.30,UVER,44,8254


In [18]:
X1 = t6.select_dtypes('number').groupby(['client_id']).median().reset_index()
X1

Unnamed: 0,client_id,payments,trans_id,amount_x,balance,order_id,account_to,amount_y,A1,A11
0,116,8573.00,30061.50,1436.00,40773.30,29561.00,69820374.00,1436.00,74.00,10673.00
1,127,7348.00,32726.00,4700.00,24915.00,29578.00,58251345.00,7348.00,21.00,9104.00
2,132,4516.00,34230.00,3050.00,45009.00,29585.00,18149984.00,3050.00,36.00,9198.00
3,158,7370.00,40561.50,7370.20,48969.30,29614.00,80542558.00,7370.20,40.00,9317.00
4,272,9112.00,68006.00,6500.00,49345.50,29743.50,34898033.50,9020.00,70.00,10177.00
...,...,...,...,...,...,...,...,...,...,...
165,13620,8192.00,3336078.50,8191.50,72823.60,45885.00,40114671.00,8191.50,16.00,8427.00
166,13690,3745.00,3353880.00,10097.00,70915.70,45974.00,73215025.00,3744.70,70.00,10177.00
167,13694,3745.00,3354544.50,5157.50,59824.95,45978.00,68096905.00,3744.70,1.00,12541.00
168,13750,6541.00,3369348.00,6541.20,64097.40,46055.50,74118154.50,8175.60,12.00,8754.00


In [19]:
X2 = t6.select_dtypes('object')
X2 = pd.concat((X2,t6['client_id']),axis=1)
X2 = X2.groupby(['client_id']).agg(lambda x:pd.Series.mode(x)[0]).reset_index()
for j in range(X2.shape[0]):
    if X2.iloc[j,-1] == ' ':
        X2.iloc[j,-1] = 'VACI'

X2 = pd.get_dummies(X2,drop_first=True)
X2

Unnamed: 0,client_id,frequency_POPLATEK PO OBRATU,frequency_POPLATEK TYDNE,status_B,status_C,status_D,type_VYDAJ,operation_VYBER,type_x_gold,type_x_junior,k_symbol_UVER,k_symbol_VACI
0,116,0,0,0,0,0,1,0,0,0,0,1
1,127,0,0,0,1,0,0,1,0,0,1,0
2,132,0,0,0,1,0,1,1,0,0,0,1
3,158,1,0,0,0,0,1,1,0,0,1,0
4,272,0,0,0,1,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
165,13620,0,0,0,0,0,1,1,0,0,1,0
166,13690,0,1,0,0,0,1,1,0,0,1,0
167,13694,0,1,0,0,0,1,1,0,0,1,0
168,13750,0,0,0,1,0,1,1,0,1,0,0


In [20]:
data = X2.merge(X1,left_on='client_id',right_on='client_id')
data

Unnamed: 0,client_id,frequency_POPLATEK PO OBRATU,frequency_POPLATEK TYDNE,status_B,status_C,status_D,type_VYDAJ,operation_VYBER,type_x_gold,type_x_junior,k_symbol_UVER,k_symbol_VACI,payments,trans_id,amount_x,balance,order_id,account_to,amount_y,A1,A11
0,116,0,0,0,0,0,1,0,0,0,0,1,8573.00,30061.50,1436.00,40773.30,29561.00,69820374.00,1436.00,74.00,10673.00
1,127,0,0,0,1,0,0,1,0,0,1,0,7348.00,32726.00,4700.00,24915.00,29578.00,58251345.00,7348.00,21.00,9104.00
2,132,0,0,0,1,0,1,1,0,0,0,1,4516.00,34230.00,3050.00,45009.00,29585.00,18149984.00,3050.00,36.00,9198.00
3,158,1,0,0,0,0,1,1,0,0,1,0,7370.00,40561.50,7370.20,48969.30,29614.00,80542558.00,7370.20,40.00,9317.00
4,272,0,0,0,1,0,1,1,0,0,0,0,9112.00,68006.00,6500.00,49345.50,29743.50,34898033.50,9020.00,70.00,10177.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,13620,0,0,0,0,0,1,1,0,0,1,0,8192.00,3336078.50,8191.50,72823.60,45885.00,40114671.00,8191.50,16.00,8427.00
166,13690,0,1,0,0,0,1,1,0,0,1,0,3745.00,3353880.00,10097.00,70915.70,45974.00,73215025.00,3744.70,70.00,10177.00
167,13694,0,1,0,0,0,1,1,0,0,1,0,3745.00,3354544.50,5157.50,59824.95,45978.00,68096905.00,3744.70,1.00,12541.00
168,13750,0,0,0,1,0,1,1,0,1,0,0,6541.00,3369348.00,6541.20,64097.40,46055.50,74118154.50,8175.60,12.00,8754.00


In [21]:
y = data.iloc[:,0]
X = data.iloc[:,1:]

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
sc = StandardScaler()
X_std = sc.fit_transform(X)
X_std

array([[-0.19127301, -0.35472217, -0.10910895, ..., -1.3913165 ,
         1.45036414,  0.83107616],
       [-0.19127301, -0.35472217, -0.10910895, ...,  1.19167823,
        -0.62817585, -0.3663275 ],
       [-0.19127301, -0.35472217, -0.10910895, ..., -0.68614845,
        -0.03990981, -0.29459012],
       ...,
       [-0.19127301,  2.81910773, -0.10910895, ..., -0.3826291 ,
        -1.41253056,  2.25666572],
       [-0.19127301, -0.35472217, -0.10910895, ...,  1.55326253,
        -0.98113547, -0.63343475],
       [-0.19127301, -0.35472217, -0.10910895, ..., -0.05162828,
         0.94053358, -0.58764493]])

In [36]:
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralCoclustering, SpectralClustering

In [26]:
Kmedias = KMeans(n_clusters=4,random_state=101)
Kmedias.fit(X_std)
cluster1 = Kmedias.predict(X_std)
cluster1

array([1, 3, 1, 2, 2, 2, 1, 2, 1, 1, 1, 0, 2, 1, 2, 2, 2, 3, 2, 2, 1, 1,
       1, 2, 1, 0, 1, 2, 1, 2, 1, 0, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,
       1, 1, 1, 1, 1, 2, 1, 0, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 0,
       2, 2, 2, 0, 2, 2, 0, 0, 1, 3, 2, 2, 1, 1, 2, 2, 2, 0, 2, 1, 1, 1,
       2, 1, 2, 2, 1, 1, 1, 2, 3, 1, 0, 0, 0, 1, 2, 2, 2, 3, 2, 0, 1, 2,
       2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 0, 2, 1, 0, 0, 0, 1, 3, 3, 2, 1, 1,
       1, 0, 3, 0, 1, 0, 2, 1, 3, 1, 0, 0, 2, 1, 2, 2, 0, 2, 0, 1, 2, 1,
       2, 2, 0, 2, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 0, 1], dtype=int32)

In [29]:
AG = AgglomerativeClustering(n_clusters=4)
cluster2 = AG.fit_predict(X_std)
cluster2

array([2, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 3, 1, 0, 0, 0, 1, 0, 0, 1, 2,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 3, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 2, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])

In [43]:
cluster =  pd.DataFrame({'Kmedias':cluster1,'Agglomerative':cluster2})
for j in range(cluster.shape[0]):
    cluster.iloc[j,0] = str(cluster.iloc[j,0])
    cluster.iloc[j,1] = str(cluster.iloc[j,1])
cluster

Unnamed: 0,Kmedias,Agglomerative
0,1,2
1,3,1
2,1,1
3,2,0
4,2,0
...,...,...
165,2,0
166,2,1
167,0,1
168,0,0


In [45]:
c = pd.get_dummies(cluster).values
Kmedias2 = KMeans(n_clusters=4,random_state=101)
Kmedias2.fit(c)
preds = Kmedias2.predict(c)
preds

array([3, 0, 3, 2, 2, 0, 3, 2, 3, 3, 3, 0, 2, 3, 2, 2, 2, 0, 2, 2, 3, 3,
       3, 2, 3, 0, 3, 2, 3, 2, 3, 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 1, 2, 2,
       1, 3, 3, 3, 3, 2, 3, 0, 0, 3, 0, 2, 2, 2, 3, 2, 2, 3, 2, 0, 3, 0,
       2, 0, 0, 0, 2, 2, 0, 0, 1, 0, 2, 2, 3, 3, 2, 0, 2, 0, 2, 3, 3, 1,
       0, 3, 0, 2, 3, 3, 1, 0, 0, 3, 0, 0, 0, 3, 2, 2, 2, 0, 2, 1, 3, 2,
       0, 3, 2, 2, 3, 3, 3, 0, 3, 3, 0, 2, 3, 1, 1, 0, 3, 0, 0, 0, 1, 3,
       3, 0, 0, 0, 3, 0, 0, 3, 1, 3, 1, 0, 2, 3, 2, 2, 1, 2, 0, 3, 2, 3,
       2, 2, 1, 2, 1, 0, 1, 3, 3, 0, 2, 2, 0, 0, 1, 3], dtype=int32)

In [47]:
data['clusters'] = preds
data

Unnamed: 0,client_id,frequency_POPLATEK PO OBRATU,frequency_POPLATEK TYDNE,status_B,status_C,status_D,type_VYDAJ,operation_VYBER,type_x_gold,type_x_junior,k_symbol_UVER,k_symbol_VACI,payments,trans_id,amount_x,balance,order_id,account_to,amount_y,A1,A11,clusters
0,116,0,0,0,0,0,1,0,0,0,0,1,8573.00,30061.50,1436.00,40773.30,29561.00,69820374.00,1436.00,74.00,10673.00,3
1,127,0,0,0,1,0,0,1,0,0,1,0,7348.00,32726.00,4700.00,24915.00,29578.00,58251345.00,7348.00,21.00,9104.00,0
2,132,0,0,0,1,0,1,1,0,0,0,1,4516.00,34230.00,3050.00,45009.00,29585.00,18149984.00,3050.00,36.00,9198.00,3
3,158,1,0,0,0,0,1,1,0,0,1,0,7370.00,40561.50,7370.20,48969.30,29614.00,80542558.00,7370.20,40.00,9317.00,2
4,272,0,0,0,1,0,1,1,0,0,0,0,9112.00,68006.00,6500.00,49345.50,29743.50,34898033.50,9020.00,70.00,10177.00,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,13620,0,0,0,0,0,1,1,0,0,1,0,8192.00,3336078.50,8191.50,72823.60,45885.00,40114671.00,8191.50,16.00,8427.00,2
166,13690,0,1,0,0,0,1,1,0,0,1,0,3745.00,3353880.00,10097.00,70915.70,45974.00,73215025.00,3744.70,70.00,10177.00,0
167,13694,0,1,0,0,0,1,1,0,0,1,0,3745.00,3354544.50,5157.50,59824.95,45978.00,68096905.00,3744.70,1.00,12541.00,0
168,13750,0,0,0,1,0,1,1,0,1,0,0,6541.00,3369348.00,6541.20,64097.40,46055.50,74118154.50,8175.60,12.00,8754.00,1
