<h1><strong>Poetry Classification</strong></h1>

In this project, I try to classify poems by the literary age they belong to (Renaissance or Modern), using only the text of each poem.

In [5]:
#First, I am importing the dependencies. The data is from kaggle.com

import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud, STOPWORDS
from keras.preprocessing.text import Tokenizer
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow.keras.layers import GRU, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Activation, Dense, Bidirectional
import gc
import re

In [7]:
#Loading the poems dataset from a CSV file in the working directory into a DataFrame
df=pd.read_csv("poems.csv")

In [8]:
#Exploring the data: first rows here, then its shape and the unique values of several columns in the following cells
df.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [9]:
#Number of rows and columns in the loaded DataFrame
df.shape

(573, 5)

In [12]:
#Distinct author names that appear in the data
df["author"].unique()

array(['WILLIAM SHAKESPEARE', 'DUCHESS OF NEWCASTLE MARGARET CAVENDISH',
       'THOMAS BASTARD', 'EDMUND SPENSER', 'RICHARD BARNFIELD',
       'SIR WALTER RALEGH', 'QUEEN ELIZABETH I', 'JOHN DONNE',
       'JOHN SKELTON', 'CHRISTOPHER MARLOWE', 'LADY MARY WROTH',
       'ROBERT SOUTHWELL, SJ', 'WILLIAM BYRD', 'GEORGE GASCOIGNE',
       'HENRY VIII, KING OF ENGLAND', 'SIR THOMAS WYATT', 'EN JONSON',
       'ORLANDO GIBBONS', 'THOMAS NASHE', 'SIR PHILIP SIDNEY',
       'SECOND BARON VAUX OF HARROWDEN THOMAS, LORD VAUX',
       'HENRY HOWARD, EARL OF SURREY', 'GEORGE CHAPMAN', 'THOMAS CAMPION',
       'ISABELLA WHITNEY', 'SAMUEL DANIEL', 'THOMAS HEYWOOD',
       'GIOVANNI BATTISTA GUARINI', 'SIR EDWARD DYER', 'THOMAS LODGE',
       'JOHN FLETCHER', 'EDGAR LEE MASTERS', 'WILLIAM BUTLER YEATS',
       'FORD MADOX FORD', 'IVOR GURNEY', 'CARL SANDBURG', 'EZRA POUND',
       'ELINOR WYLIE', 'GEORGE SANTAYANA', 'LOUISE BOGAN',
       'KENNETH SLESSOR', 'HART CRANE', 'D. H. LAWRENCE',
       'H

In [13]:
#Distinct values of the "age" column, which is the label used as y later on
df["age"].unique()

array(['Renaissance', 'Modern'], dtype=object)

In [14]:
#Distinct values of the "type" column
df["type"].unique()

array(['Mythology & Folklore', 'Nature', 'Love'], dtype=object)

In [17]:
#I'm cleaning the text here by removing the noisy special characters and (by default) the digits, which are not useful for the next steps

def character_normalization(content, remove_digits=True):
    """Strip noisy characters from a poem's text.

    Letters (a-z, A-Z), full stops and whitespace (including the line
    breaks inside a poem) are always kept; every other character is deleted.

    Args:
        content: The text to clean.
        remove_digits: When True (the default) digits are deleted as well;
            when False they are kept.

    Returns:
        The cleaned text.
    """
    # Raw strings keep \d and \s as regex classes instead of invalid string escapes.
    if remove_digits:
        pattern = r'[^a-zA-Z.\s]'
    else:
        pattern = r'[^a-zA-Z.\d\s]'
    return re.sub(pattern, '', content)
#Cleaning every poem in place, one element at a time
df["content"]=df["content"].map(character_normalization)

In [19]:
#The "age" column has two labels, so I encode them as 0 and 1 to get a binary target.
#LabelEncoder numbers the labels in sorted order: Modern becomes 0 and Renaissance becomes 1.

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le.fit(df["age"])
df["age"]=le.transform(df["age"])
df

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,1,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,Sir Charles into my chamber coming in\r\nWhen ...,An Epilogue to the Above,1,Mythology & Folklore
2,THOMAS BASTARD,Our vice runs beyond all that old men saw\r\nA...,"Book 7, Epigram 42",1,Mythology & Folklore
3,EDMUND SPENSER,Lo I the man whose Muse whilome did maske\r\nA...,"from The Faerie Queene: Book I, Canto I",1,Mythology & Folklore
4,RICHARD BARNFIELD,Long have I longd to see my love againe\r\nSti...,Sonnet 16,1,Mythology & Folklore
...,...,...,...,...,...
568,SARA TEASDALE,With the man I love who loves me not\r\nI walk...,Union Square,0,Love
569,HART CRANE,Hart Crane Voyages I II III IV V VI from The C...,Voyages,0,Love
570,WILLIAM BUTLER YEATS,When you are old and grey and full of sleep\r\...,When You Are Old,0,Love
571,CARL SANDBURG,Give me hunger\r\nO you gods that sit and give...,At a Window,0,Love


In [23]:
#Dropping the columns that are not used for the classification (author, poem name and type)

df2=df.drop(["author", "poem name","type"], axis=1)

In [24]:
#####THIS PART CAN BE DELETED (it only displays df2)
df2

Unnamed: 0,content,age
0,Let the bird of loudest lay\r\nOn the sole Ara...,1
1,Sir Charles into my chamber coming in\r\nWhen ...,1
2,Our vice runs beyond all that old men saw\r\nA...,1
3,Lo I the man whose Muse whilome did maske\r\nA...,1
4,Long have I longd to see my love againe\r\nSti...,1
...,...,...
568,With the man I love who loves me not\r\nI walk...,0
569,Hart Crane Voyages I II III IV V VI from The C...,0
570,When you are old and grey and full of sleep\r\...,0
571,Give me hunger\r\nO you gods that sit and give...,0


In [29]:
#Tokenizing the poems: the vocabulary is built first, then every poem is encoded in two ways,
#as a 0/1 word-presence matrix (tokenized_df) and as a list of word ids (sequences).

tokenizer=Tokenizer()
tokenizer.fit_on_texts(df2["content"])
word_index=tokenizer.word_index
tokenized_df=tokenizer.texts_to_matrix(df2["content"])
sequences=tokenizer.texts_to_sequences(df2["content"])

In [28]:
#Inspecting the matrix returned by texts_to_matrix (one row per poem)
tokenized_df

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 1., 1., 1.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

In [30]:
#Now, I'm splitting the data as X and y, which represent independent and dependent variables of the data
X=tokenized_df
y=df["age"]

In [32]:
#Splitting the data as train and test sets
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.2)

In [33]:
X_train=tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=300)
X_test=tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=300)

In [37]:
#Constructing my model
max_features=10
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
model = tf.keras.Sequential([Embedding(input_dim=100, output_dim=128),
                            LSTM(64,activation='relu', dropout=0.05, return_sequences=True),
                            LSTM(64, activation="relu",dropout=0.05,recurrent_dropout=0.05, return_sequences=True),
                            LSTM(32, activation="relu",dropout=0.05,recurrent_dropout=0.05),
                            Dense(2, activation="relu"),
                            Dense(1, activation="sigmoid")])
opt=tf.keras.optimizers.RMSprop()
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["acc"])

In [38]:
#Training the model; 10% of the training data is held out as the validation set watched by the early-stopping callback
model.fit(
    x=X_train,
    y=y_train.values,
    batch_size=40,
    epochs=100,
    validation_split=0.1,
    callbacks=[callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fbdf1f2a710>

In [39]:
#Scoring the model on the held-out test set; the result is [loss, accuracy]
model.evaluate(x=X_test, y=y_test)



[0.6843602657318115, 0.6000000238418579]