### Preparing training dataset in 2-column form

In [None]:
import pandas as pd

# Load the raw training data; the file is colon-delimited.
f = pd.read_csv("training_data.csv", sep=':')

# Sanity checks on the raw frame. A notebook only renders the value of the
# last expression in a cell, so each intermediate result is printed
# explicitly instead of being computed and silently discarded.
f.info()
print(f.dtypes)
print(f.describe())
# Rows whose 'post' value is None or an empty string.
print(f[f['post'].isin([None, ''])])
# Total number of missing cells across the whole frame.
print(f.isnull().sum().sum())
# Rows whose 'post' value is missing.
nan_rows = f[f['post'].isnull()]
print(nan_rows)

# Keep only the 'gender' and 'post' columns and write them out with the same
# ':' delimiter, without the index column.
keep_col = ['gender', 'post']
new_f = f[keep_col]
new_f.to_csv("training_data_two_columns.csv", index=False, sep=':')


In [None]:
# NOTE(review): column_name is not used by any cell visible here; it is kept
# in case a later cell relies on it.
column_name = ['gender']

# Sanity checks on the two-column frame. A notebook only renders the value of
# the last expression in a cell, so each intermediate result is printed
# explicitly instead of being computed and silently discarded.
new_f.info()
print(new_f.dtypes)
print(new_f.describe())
# Rows whose 'post' value is None or an empty string.
print(new_f[new_f['post'].isin([None, ''])])
# Total number of missing cells across the whole frame.
print(new_f.isnull().sum().sum())
# Rows whose 'post' value is missing.
nan_rows = new_f[new_f['post'].isnull()]
print(nan_rows)

### Install libraries

In [None]:
# Use the %pip magic rather than shelling out to pip3: %pip installs into the
# environment of the running kernel, whereas `!pip3` uses whichever pip3
# happens to be first on PATH.
%pip install tensorflow==1.15.4
%pip install pandas

In [None]:
# Get the BERT layer (bert_experimental provides BertLayer).
# %pip installs into the running kernel's environment. --no-deps installs the
# package without its dependencies -- presumably so the TensorFlow version
# pinned in this notebook is not replaced; confirm.
%pip install bert-experimental --no-deps

### Model construction

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from bert_experimental.finetuning.bert_layer import BertLayer

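#### Preparing parameters for 3 different models

Each of the three data-kit cells below assigns the same variables (`df`, `X`, `Y`, `trX`, ...), so run only the one for the dataset you want to train on.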

In [None]:
# Base data kit: colon-delimited file holding the 'gender' and 'post' columns

df = pd.read_csv("training_data_two_columns.csv", sep=":")

# Encode every gender value as an integer class id
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['gender'].tolist())
n_classes = df['label'].unique().size

# Posts, integer labels and the one-hot form of those labels
X = df['post'].values
L = df['label'].values
Y = tf.keras.utils.to_categorical(L)

# Hold out 10% of the rows with a fixed seed; X, Y and L are split together
(trX, tsX,
 trY, tsY,
 trL, tsL) = train_test_split(X, Y, L, test_size=0.1, random_state=34)

In [None]:
# Middle data kit (for Facebook only): colon-delimited file holding the
# 'gender' and 'post' columns

df = pd.read_csv("concat_data_two_columns.csv", sep=":")

# Encode every gender value as an integer class id
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['gender'].tolist())
n_classes = df['label'].unique().size

# Posts, integer labels and the one-hot form of those labels
X = df['post'].values
L = df['label'].values
Y = tf.keras.utils.to_categorical(L)

# Hold out 10% of the rows with a fixed seed; X, Y and L are split together
(trX, tsX,
 trY, tsY,
 trL, tsL) = train_test_split(X, Y, L, test_size=0.1, random_state=34)

In [None]:
# Big data kit (all, except Facebook): colon-delimited file holding the
# 'gender' and 'post' columns

df = pd.read_csv("concat_super_data_two_columns.csv", sep=":")

# Encode every gender value as an integer class id
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['gender'].tolist())
n_classes = df['label'].unique().size

# Posts, integer labels and the one-hot form of those labels
X = df['post'].values
L = df['label'].values
Y = tf.keras.utils.to_categorical(L)

# Hold out 10% of the rows with a fixed seed; X, Y and L are split together
(trX, tsX,
 trY, tsY,
 trL, tsL) = train_test_split(X, Y, L, test_size=0.1, random_state=34)

#### Model constructor

In [None]:
# Text classifier: raw string input -> BERT encoder -> dense head -> one
# probability per class.
classification_model = None  # rebind the name before (re)building the model

# Path of the BERT module handed to BertLayer.
# NOTE(review): the name suggests the cased 24-layer / 1024-hidden BERT -- confirm.
module_path = "cased_L-24_H-1024_A-16_tf_module"

# One string per example; the encoder layer receives the raw text.
input_string = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
# NOTE(review): presumably mean-pools token embeddings, fine-tunes 24 encoder
# layers and limits inputs to 256 tokens -- confirm against BertLayer's docs.
bert_encoder = BertLayer(module_path, pooling='mean', n_tune_layers=24, seq_len=256)
encoded_string = bert_encoder(input_string)

# Classification head on top of the encoded text.
dense_hidden = tf.keras.layers.Dense(1024, activation='relu')(encoded_string)
dense_dropout = tf.keras.layers.Dropout(0.25)(dense_hidden)

# The targets are one-hot vectors (tf.keras.utils.to_categorical) of mutually
# exclusive classes, so the output is a softmax distribution trained with
# categorical cross-entropy. The previous sigmoid + binary_crossentropy pair
# treated every class as an independent yes/no decision and made Keras report
# per-output binary accuracy instead of per-example classification accuracy.
dense_output = tf.keras.layers.Dense(n_classes, activation='softmax')(dense_dropout)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
classification_model = tf.keras.Model(inputs=[input_string], outputs=[dense_output])
classification_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
classification_model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the monitored 'val_acc' value has gone two epochs
# without improving.
early_stopping_callback = EarlyStopping(
    patience=2,
    monitor='val_acc',
)

In [None]:
# Train on the training split and evaluate on the held-out split after every
# epoch. validation_data is passed as a tuple (x_val, y_val), the form Keras
# documents; a list can be interpreted differently by other TensorFlow versions.
classification_model.fit(trX, trY, validation_data=(tsX, tsY), batch_size=32, epochs=3,
                         callbacks=[early_stopping_callback])

#### Save different models

In [None]:
# Base model: writes whatever classification_model currently holds, so run
# this cell only after training on the base data kit.

classification_model.save(filepath='cls_model_256.h5')

In [None]:
# Middle model: writes whatever classification_model currently holds, so run
# this cell only after training on the middle data kit.

classification_model.save(filepath='cls_middle_model_256.h5')

In [None]:
# Big model: writes whatever classification_model currently holds, so run
# this cell only after training on the big data kit.

classification_model.save(filepath='cls_big_model_256.h5')
