In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
import numpy as np
import pickle
import json

In [2]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# X is later passed to the TF-IDF vectorizer, y is the target.
X = _read_pickle('data/cleaned.pkl')
y = _read_pickle('data/target.pkl')

In [3]:
# Build TF-IDF features from unigrams and bigrams.
# NOTE(review): the vectorizer is fitted on all of X, before train_test_split is
# called, so the vocabulary and IDF weights also reflect the rows that end up in
# the test split. Fitting on the training rows only would avoid that.
# NOTE(review): X is rebound from the raw inputs to the TF-IDF matrix, so
# re-running this cell without reloading the data feeds the matrix back in.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(X)

In [4]:
# Expand the TF-IDF matrix into a dense NumPy array for the steps that follow.
# The isinstance guard keeps the cell safe to re-run: once X is already an
# ndarray there is nothing to convert, and ndarray has no .toarray() method.
# Anything else (e.g. un-vectorised input) still fails loudly here.
# NOTE(review): the dense copy stores one float per vocabulary entry per row,
# zeros included, so this is by far the largest allocation in the notebook.
if not isinstance(X, np.ndarray):
    X = X.toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
# Turn the targets into a float32 column vector of shape (n_rows, 1).
y = np.asarray(y)
y = y.astype('float32')
y = y.reshape(-1, 1)
y

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [6]:
# Size of the fitted vocabulary, i.e. one feature per learned n-gram.
n_features = len(vectorizer.vocabulary_)
n_features

67721

In [7]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [10]:
# Write the held-out features and targets to disk as .npy files.
for npy_path, held_out in (('data/x_test.npy', x_test),
                           ('data/y_test.npy', y_test)):
    np.save(npy_path, held_out)

In [11]:
# Number of rows in the training split.
n_samples = len(x_train)
n_samples

5329

In [12]:
# Make fresh ndarray copies of both target splits.
y_train, y_test = np.array(y_train), np.array(y_test)

In [13]:
# Number of hidden Dense layers; also determines how many layer widths and
# dropout rates are generated in the following cells.
n_hidden_layers = 5

In [14]:
# Layer widths shrink geometrically from n_features down to 1. The two
# endpoints are dropped because they correspond to the input and output sizes.
geometric_widths = np.geomspace(n_features, 1, n_hidden_layers + 2)[1:-1]
n_hidden_nodes = [int(width) for width in geometric_widths]
n_hidden_nodes

[10607, 1661, 260, 40, 6]

In [15]:
# One dropout rate per hidden layer, decreasing linearly from 0.25 to 0.05.
dropouts = np.linspace(start=.25, stop=.05, num=n_hidden_layers)
dropouts

array([0.25, 0.2 , 0.15, 0.1 , 0.05])

In [16]:
# Fully connected binary classifier: one Dense(relu) + Dropout pair per hidden
# layer, followed by a single sigmoid output unit.
my_classifier = Sequential()

# Input expects a shape tuple that excludes the batch axis. Passing it as an
# explicit 1-tuple keyword argument follows the documented signature rather
# than relying on a bare int being accepted.
my_classifier.add(Input(shape=(x_train.shape[1],)))

# Pair each hidden-layer width with its dropout rate.
for layer_units, drop_rate in zip(n_hidden_nodes, dropouts):
    my_classifier.add(Dense(units=layer_units,
                            kernel_initializer='uniform',
                            activation='relu'))
    my_classifier.add(Dropout(drop_rate))

# Single sigmoid unit, matching the binary cross-entropy loss below.
my_classifier.add(Dense(units=1,
                        kernel_initializer='uniform',
                        activation='sigmoid'))

my_classifier.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

Metal device set to: Apple M1 Pro


2022-01-24 17:14:44.127202: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-24 17:14:44.127356: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [17]:
# Print the per-layer table of output shapes and parameter counts.
my_classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10607)             718327254 
                                                                 
 dropout (Dropout)           (None, 10607)             0         
                                                                 
 dense_1 (Dense)             (None, 1661)              17619888  
                                                                 
 dropout_1 (Dropout)         (None, 1661)              0         
                                                                 
 dense_2 (Dense)             (None, 260)               432120    
                                                                 
 dropout_2 (Dropout)         (None, 260)               0         
                                                                 
 dense_3 (Dense)             (None, 40)                1

In [18]:
# Each mini-batch is 20% of the training-split row count (counted before the
# validation rows are carved off by validation_split).
batch_size = int(n_samples * 0.2)

# Train for 20 epochs, holding back 20% of the training rows for validation.
history = my_classifier.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=20,
    validation_split=0.2,
)

# Persist the trained model in HDF5 format.
my_classifier.save('model.h5')

2022-01-24 17:14:46.489799: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/20


2022-01-24 17:14:46.885840: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-01-24 17:15:25.252093: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
# Store the per-epoch metrics recorded by fit() as JSON.
with open('history.json', 'w') as history_file:
    history_file.write(json.dumps(history.history))