In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout



In [3]:
# Read the two inputs: a tab-separated feature table and a header-less,
# tab-separated labels file whose columns are named here explicitly.
big_table_by_average = pd.read_csv(
    '../frameaxis/big_table_by_average.tsv',
    sep='\t',
)
dev_labels = pd.read_csv(
    '../data/data/en/dev-labels-subtask-2.txt',
    sep='\t',
    header=None,
    names=['article_id', 'frames'],
)

# Join features and labels on the shared article identifier
# (default inner join: only articles present in both inputs are kept).
merged_data = big_table_by_average.merge(dev_labels, on='article_id')

# Preview the first rows of the joined table
merged_data.head()

Unnamed: 0,article_id,"('absorbent', 'nonabsorbent')","('atypical', 'typical')","('born', 'unborn')","('broad-minded', 'narrow-minded')","('inspiring', 'uninspiring')","('jawed', 'jawless')","('made', 'unmade')","('negative', 'positive')","('pointed', 'pointless')","('scrupulous', 'unscrupulous')",frames
0,813452859,0.080125,0.015796,-0.09131,0.022744,-0.070299,0.082165,-0.090066,-0.033324,0.056182,-0.122622,"Political,External_regulation_and_reputation,P..."
1,813494037,0.074769,0.024556,-0.083949,0.022723,-0.08087,0.076549,-0.08203,-0.036595,0.055352,-0.118175,"Political,Crime_and_punishment,External_regula..."
2,813547724,0.084561,0.007195,-0.087211,0.011782,-0.078827,0.083936,-0.094876,-0.032868,0.05555,-0.116927,"Political,Quality_of_life,External_regulation_..."
3,813552066,0.080229,0.019845,-0.095069,0.01693,-0.075646,0.080472,-0.088424,-0.026648,0.064943,-0.124149,"Public_opinion,Policy_prescription_and_evaluat..."
4,813601978,0.068588,0.02688,-0.090958,0.022179,-0.074495,0.081351,-0.082409,-0.025719,0.052067,-0.11209,"Political,Morality,Cultural_identity,Crime_and..."


In [4]:
# Transform the frames column to binary (multi-hot) format.
# Only values that are still raw comma-separated strings are split, so
# re-running this cell after the column already holds lists is a no-op.
# (`.str.split` would turn the existing lists into NaN and break the
# binarizer on a second run.)
merged_data['frames'] = merged_data['frames'].map(
    lambda value: value.split(',') if isinstance(value, str) else value
)

# Fit the binarizer on the per-article label lists; `mlb.classes_` holds the
# label names in the same order as the columns of `binary_labels`.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(merged_data['frames'])

In [5]:
# Features: every merged column except the identifier and the label lists
X = merged_data.drop(['article_id', 'frames'], axis=1)

# Targets: one 0/1 column per label, named after the binarizer's classes
y = pd.DataFrame(data=binary_labels, columns=mlb.classes_)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Seed TensorFlow's global random generator so that operations drawing from
# it (e.g. weight initialisation) start from a fixed state on every run.
tf.random.set_seed(42)

# Build the neural network model: two ReLU hidden layers with dropout,
# followed by one sigmoid unit per label column in y_train (multi-label output).
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='sigmoid')
])

# Compile the model.
# The string 'accuracy' is resolved by Keras from the output shape; with more
# than one output unit it becomes categorical (argmax) accuracy, which is not
# meaningful for multi-label targets. Use an explicit per-label binary
# accuracy instead, keeping the logged metric name 'accuracy'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [7]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1408      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 14)                910       
                                                                 
Total params: 10,574
Trainable params: 10,574
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train the model.
# Hyperparameters are named so they are easy to find and tune.
EPOCHS = 20
BATCH_SIZE = 32

# Keep the returned History object: it holds the per-epoch training and
# validation metrics, and assigning it avoids a bare object repr as the
# cell's output.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x29f51dcf7c0>