In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
df1=pd.read_csv("Poem_classification - test_data.csv")
df2=pd.read_csv("Poem_classification - train_data.csv")

In [3]:
df1.shape, df2.shape

((150, 2), (841, 2))

In [4]:
df1.head()

Unnamed: 0,Genre,Poem
0,Music,A woman walks by the bench I’m sitting onwith ...
1,Music,"Because I am a boy, the untouchability of beau..."
2,Music,"Because today we did not leave this world,We n..."
3,Music,"Big Bend has been here, been here. Shouldn’t i..."
4,Music,"I put shells there, along the lip of the road...."


In [5]:
df1.sample(5)

Unnamed: 0,Genre,Poem
114,Affection,"It was easy enough to bend them to my wish, it..."
6,Music,"I was afraid the past would catch up with me,w..."
67,Affection,DEAR MISS: Notwithstanding the cloud of doubts...
115,Affection,"It's all I have to bring today—This, and my he..."
84,Affection,I am yours as the summer air at evening isPoss...


In [6]:
df1['Genre'].unique()

array(['Music', 'Death', 'Affection', 'Environment'], dtype=object)

In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Genre   150 non-null    object
 1   Poem    150 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB


In [8]:
df1.isnull().sum()

Genre    0
Poem     0
dtype: int64

In [9]:
df2.head()

Unnamed: 0,Genre,Poem
0,Music,
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...


In [10]:
df2.isnull().sum()    

Genre    0
Poem     4
dtype: int64

In [11]:
df=df1.append(df2)  

In [12]:
df.shape

(991, 2)

In [13]:
df.head()

Unnamed: 0,Genre,Poem
0,Music,A woman walks by the bench I’m sitting onwith ...
1,Music,"Because I am a boy, the untouchability of beau..."
2,Music,"Because today we did not leave this world,We n..."
3,Music,"Big Bend has been here, been here. Shouldn’t i..."
4,Music,"I put shells there, along the lip of the road...."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 991 entries, 0 to 840
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Genre   991 non-null    object
 1   Poem    987 non-null    object
dtypes: object(2)
memory usage: 23.2+ KB


In [15]:
df.isnull().sum()

Genre    0
Poem     4
dtype: int64

In [16]:
df = df.dropna(axis=0)

In [17]:
df.isnull().sum()

Genre    0
Poem     0
dtype: int64

In [18]:
df.Genre.value_counts()

Environment    252
Music          250
Death          244
Affection      241
Name: Genre, dtype: int64

In [19]:
df = df.replace({'Environment':0,'Music':1,'Death':2,'Affection':3})

In [20]:
df

Unnamed: 0,Genre,Poem
0,1,A woman walks by the bench I’m sitting onwith ...
1,1,"Because I am a boy, the untouchability of beau..."
2,1,"Because today we did not leave this world,We n..."
3,1,"Big Bend has been here, been here. Shouldn’t i..."
4,1,"I put shells there, along the lip of the road...."
...,...,...
836,0,Why make so much of fragmentary blue In here a...
837,0,"Woman, I wish I didn't know your name. What co..."
838,0,"Yonder to the kiosk, beside the creek, Paddle ..."
839,0,You come to fetch me from my work to-night Whe...


In [22]:
text=df["Poem"]
labels=df["Genre"]

In [24]:
text = np.array(text)
labels = np.array(labels)
text.shape, labels.shape

((987,), (987,))

In [25]:
training_text = text[0:901]
testing_text = text[901:]
training_labels = labels[0:901]
testing_labels = labels[901:]

In [27]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(training_text)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=200, padding='post', truncating='post')

testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=200, padding='post', truncating='post')

In [28]:
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [29]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(10000,16,input_length=200))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(50,activation='relu'))
model.add(tf.keras.layers.Dense(25,activation='relu'))
model.add(tf.keras.layers.Dense(4,activation='softmax'))
opt = tf.keras.optimizers.Adam()
model.compile(loss='sparse_categorical_crossentropy',optimizer=opt,metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 flatten (Flatten)           (None, 16)                0         
                                                                 
 dense (Dense)               (None, 50)                850       
                                                                 
 dense_1 (Dense)             (None, 25)                1275      
                                                                 
 dense_2 (Dense)             (None, 4)                 104       
                                                        

In [30]:
num_epochs = 100
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/100
29/29 - 8s - loss: 1.3837 - accuracy: 0.2775 - val_loss: 1.4541 - val_accuracy: 0.0000e+00 - 8s/epoch - 285ms/step
Epoch 2/100
29/29 - 1s - loss: 1.3798 - accuracy: 0.2775 - val_loss: 1.5161 - val_accuracy: 0.0000e+00 - 680ms/epoch - 23ms/step
Epoch 3/100
29/29 - 1s - loss: 1.3766 - accuracy: 0.2775 - val_loss: 1.5717 - val_accuracy: 0.0000e+00 - 618ms/epoch - 21ms/step
Epoch 4/100
29/29 - 1s - loss: 1.3741 - accuracy: 0.2775 - val_loss: 1.5828 - val_accuracy: 0.0000e+00 - 563ms/epoch - 19ms/step
Epoch 5/100
29/29 - 1s - loss: 1.3716 - accuracy: 0.2775 - val_loss: 1.6356 - val_accuracy: 0.0000e+00 - 524ms/epoch - 18ms/step
Epoch 6/100
29/29 - 0s - loss: 1.3682 - accuracy: 0.3119 - val_loss: 1.6783 - val_accuracy: 0.0000e+00 - 361ms/epoch - 12ms/step
Epoch 7/100
29/29 - 0s - loss: 1.3659 - accuracy: 0.2786 - val_loss: 1.6886 - val_accuracy: 0.0000e+00 - 492ms/epoch - 17ms/step
Epoch 8/100
29/29 - 1s - loss: 1.3590 - accuracy: 0.3274 - val_loss: 1.6709 - val_accuracy: 0.0000e

Epoch 66/100
29/29 - 0s - loss: 0.1890 - accuracy: 0.9046 - val_loss: 1.7559 - val_accuracy: 0.4186 - 487ms/epoch - 17ms/step
Epoch 67/100
29/29 - 0s - loss: 0.1866 - accuracy: 0.9123 - val_loss: 2.3021 - val_accuracy: 0.3140 - 354ms/epoch - 12ms/step
Epoch 68/100
29/29 - 0s - loss: 0.1948 - accuracy: 0.8935 - val_loss: 2.0666 - val_accuracy: 0.3837 - 369ms/epoch - 13ms/step
Epoch 69/100
29/29 - 0s - loss: 0.1815 - accuracy: 0.9079 - val_loss: 1.8964 - val_accuracy: 0.4070 - 378ms/epoch - 13ms/step
Epoch 70/100
29/29 - 0s - loss: 0.1842 - accuracy: 0.9023 - val_loss: 2.4153 - val_accuracy: 0.2791 - 450ms/epoch - 16ms/step
Epoch 71/100
29/29 - 0s - loss: 0.1793 - accuracy: 0.9068 - val_loss: 1.9178 - val_accuracy: 0.4070 - 399ms/epoch - 14ms/step
Epoch 72/100
29/29 - 0s - loss: 0.1816 - accuracy: 0.9145 - val_loss: 2.1198 - val_accuracy: 0.3605 - 370ms/epoch - 13ms/step
Epoch 73/100
29/29 - 0s - loss: 0.1790 - accuracy: 0.9201 - val_loss: 1.8230 - val_accuracy: 0.3837 - 363ms/epoch - 13