In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Section 0: create dummy dataset

In [2]:
# Initialize dummy embeddings: 10 random matrices, each of shape (30, 128).
# Seed NumPy's global RNG first so that "Restart Kernel -> Run All" regenerates
# the same dummy dataset; later np.random.* draws executed in order also become
# deterministic as a consequence.
SEED = 0
np.random.seed(SEED)
embeddings = list(np.random.rand(10, 30, 128))  # list() splits along axis 0 into 10 arrays
print(len(embeddings), 'matrices')
print('Each of size', embeddings[0].shape)

10 matrices
Each of size (30, 128)


In [3]:
# Draw one dummy cluster label per company: integers in the half-open range [0, 6)
n_companies = 10
n_clusters = 6
clusters = np.random.randint(0, n_clusters, size=n_companies)
clusters

array([1, 3, 3, 1, 3, 5, 1, 3, 2, 3])

In [4]:
# Assemble company names, embedding matrices and cluster labels into one table.
# Each 'Embeddings' cell holds a whole 2-D array, so that column is object-typed.
company_names = [f'C{i}' for i in range(1, 11)]  # 'C1' ... 'C10'
dummy_data = dict(
    Company=company_names,
    Embeddings=embeddings,
    Cluster=clusters,
)

df = pd.DataFrame(dummy_data)
df

Unnamed: 0,Company,Embeddings,Cluster
0,C1,"[[0.8258112807923055, 0.7093365711600838, 0.69...",1
1,C2,"[[0.44706242179775046, 0.7119396749264167, 0.4...",3
2,C3,"[[0.6327397616895103, 0.49454607076609125, 0.2...",3
3,C4,"[[0.3520376338713338, 0.9940108795963187, 0.75...",1
4,C5,"[[0.9799358560660345, 0.5865449070453463, 0.48...",3
5,C6,"[[0.8506060550501765, 0.7980685609749092, 0.15...",5
6,C7,"[[0.5591508977811575, 0.9793517068709138, 0.33...",1
7,C8,"[[0.6948356182372877, 0.7009451736365836, 0.23...",3
8,C9,"[[0.14963114698496716, 0.45959908528823923, 0....",2
9,C10,"[[0.5129708058756499, 0.18101267339440674, 0.4...",3


In [None]:
# Persist the dataframe as a pickle file, then drop the in-memory copy so the
# following load cell has to read it back from disk
pd.to_pickle(df, 'dummy_dataset.pkl')
del df

In [None]:
# Load back from disk
# NOTE: unpickling can execute arbitrary code. This is acceptable here only
# because the file is the one this notebook wrote in its own save cell; do not
# point read_pickle at files from untrusted sources.
df = pd.read_pickle("dummy_dataset.pkl")
df

Unnamed: 0,Company,Embeddings,Cluster
0,C1,"[[0.6108360116243844, 0.47280295280953244, 0.4...",1
1,C2,"[[0.40764337130391215, 0.9127904109881451, 0.1...",0
2,C3,"[[0.7955283670949934, 0.17904580845690865, 0.1...",0
3,C4,"[[0.4674340486126384, 0.5832719196271765, 0.59...",4
4,C5,"[[0.2012101348417974, 0.9501009440517605, 0.27...",5
5,C6,"[[0.9335873569339934, 0.3644047243175257, 0.27...",1
6,C7,"[[0.31356810911294763, 0.39852582188462327, 0....",1
7,C8,"[[0.6280905106976582, 0.72680347623623, 0.7878...",1
8,C9,"[[0.6226514063978892, 0.7770111278478028, 0.87...",2
9,C10,"[[0.36758646381578575, 0.2987434066766984, 0.1...",3


# Section 1: MLP - fully connected

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

In [5]:
# Build the MLP input: gather the per-company matrices into one 3-D array,
# then flatten every matrix into a single feature vector (one row per company)
embedding_matrices = df['Embeddings'].tolist()
X = np.array(embedding_matrices)
print(X.shape)

n_rows = X.shape[0]
X = X.reshape(n_rows, -1)
print(X.shape)

(10, 30, 128)
(10, 3840)


In [12]:
# Build the MLP targets: integer cluster ids -> one-hot rows of width 6
cluster_ids = df['Cluster'].to_numpy()
print(cluster_ids)

y = tf.one_hot(cluster_ids, depth=6)
print(y)

[1 3 3 1 3 5 1 3 2 3]
tf.Tensor(
[[0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]], shape=(10, 6), dtype=float32)


In [13]:
# Create MLP: flattened embedding vector -> 512 -> 128 -> 32 -> softmax over clusters
mlp = Sequential([
    # `shape` must be a tuple. `(X.shape[1])` is only a parenthesized int, which
    # newer Keras versions refuse to interpret as a shape; the trailing comma
    # makes it a proper 1-tuple.
    Input(shape=(X.shape[1],)),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense( 32, activation='relu'),
    # One output unit per column of the one-hot target matrix y
    Dense(y.shape[1], activation='softmax'),
])

mlp.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               1966592   
                                                                 
 dense_1 (Dense)             (None, 128)               65664     
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 6)                 198       
                                                                 
Total params: 2,036,582
Trainable params: 2,036,582
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Define training parameters
# Adam optimizer; categorical cross-entropy pairs with the softmax output layer
# of the model and the one-hot encoded targets in y.
mlp.compile(optimizer='adam', loss='categorical_crossentropy')

In [15]:
# Fit the MLP: 2 epochs, mini-batches of 4, with 10% of the samples
# held out for validation
mlp_fit_options = {
    'batch_size': 4,
    'epochs': 2,
    'validation_split': 0.1,
}
mlp.fit(X, y, **mlp_fit_options)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd0ed385c90>

# Section 2: Conv NN

In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten

In [17]:
# Build the CNN input: gather the matrices into one 3-D array, then append a
# trailing axis of length 1 so each sample has shape (rows, cols, 1)
embedding_matrices = df['Embeddings'].tolist()
X = np.array(embedding_matrices)
print(X.shape)

n_samples, n_rows, n_cols = X.shape
X = X.reshape(n_samples, n_rows, n_cols, 1)
print(X.shape)

(10, 30, 128)
(10, 30, 128, 1)


In [18]:
# Build the CNN targets: integer cluster ids -> one-hot rows of width 6
cluster_ids = df['Cluster'].to_numpy()
print(cluster_ids)

y = tf.one_hot(cluster_ids, depth=6)
print(y)

[1 3 3 1 3 5 1 3 2 3]
tf.Tensor(
[[0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]], shape=(10, 6), dtype=float32)


In [25]:
# Create CNN: four (3x3 conv -> max-pool) stages with a doubling filter count,
# followed by a small dense classifier over the flattened feature maps
cnn = Sequential()
cnn.add(Input(shape=X.shape[1:]))  # (rows, cols, channels) of one sample
for n_filters in (64, 128, 256, 512):
    cnn.add(Conv2D(n_filters, 3, padding='same', activation='relu'))
    cnn.add(MaxPool2D())
cnn.add(Flatten())
cnn.add(Dense(32, activation='relu'))
# One output unit per column of the one-hot target matrix y
cnn.add(Dense(y.shape[1], activation='softmax'))
cnn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_13 (Conv2D)          (None, 30, 128, 64)       640       
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 15, 64, 64)       0         
 2D)                                                             
                                                                 
 conv2d_14 (Conv2D)          (None, 15, 64, 128)       73856     
                                                                 
 max_pooling2d_8 (MaxPooling  (None, 7, 32, 128)       0         
 2D)                                                             
                                                                 
 conv2d_15 (Conv2D)          (None, 7, 32, 256)        295168    
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 3, 16, 256)      

In [27]:
# Define training parameters
# Adam optimizer; categorical cross-entropy pairs with the softmax output layer
# of the model and the one-hot encoded targets in y.
cnn.compile(optimizer='adam', loss='categorical_crossentropy')

In [28]:
# Fit the CNN: 2 epochs, mini-batches of 4, with 10% of the samples
# held out for validation
cnn_fit_options = {
    'batch_size': 4,
    'epochs': 2,
    'validation_split': 0.1,
}
cnn.fit(X, y, **cnn_fit_options)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd0e7b461d0>

# Section 3: LSTM

In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, LSTM

In [30]:
# Build the LSTM input: gather the per-company matrices into one 3-D array.
# No reshape is applied here; the model's Input layer takes the last two axes as-is.
embedding_matrices = df['Embeddings'].tolist()
X = np.array(embedding_matrices)
print(X.shape)

(10, 30, 128)


In [31]:
# Build the LSTM targets: integer cluster ids -> one-hot rows of width 6
cluster_ids = df['Cluster'].to_numpy()
print(cluster_ids)

y = tf.one_hot(cluster_ids, depth=6)
print(y)

[1 3 3 1 3 5 1 3 2 3]
tf.Tensor(
[[0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]], shape=(10, 6), dtype=float32)


In [35]:
# Create LSTM: two stacked recurrent layers followed by a small dense classifier
lstm = Sequential([
    Input(shape=X.shape[1:]),  # the two trailing axes of X describe one sample
    # return_sequences=True keeps the per-step outputs for the next LSTM layer
    LSTM(256, return_sequences=True),
    LSTM(512),
    Dense(32, activation='relu'),
    # One output unit per column of the one-hot target matrix y
    Dense(y.shape[1], activation='softmax'),
])
lstm.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 30, 256)           394240    
                                                                 
 lstm_3 (LSTM)               (None, 512)               1574912   
                                                                 
 dense_6 (Dense)             (None, 32)                16416     
                                                                 
 dense_7 (Dense)             (None, 6)                 198       
                                                                 
Total params: 1,985,766
Trainable params: 1,985,766
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Define training parameters
# Adam optimizer; categorical cross-entropy pairs with the softmax output layer
# of the model and the one-hot encoded targets in y.
lstm.compile(optimizer='adam', loss='categorical_crossentropy')

In [37]:
# Fit the LSTM: 2 epochs, mini-batches of 4, with 10% of the samples
# held out for validation
lstm_fit_options = {
    'batch_size': 4,
    'epochs': 2,
    'validation_split': 0.1,
}
lstm.fit(X, y, **lstm_fit_options)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd0e8ed4990>