In [1]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
data = pd.read_csv('Final_Dataset.csv')

In [3]:
data.reset_index(drop=True, inplace=True)

In [4]:
#removing unncessary first column
data = data.drop(['Unnamed: 0'], axis = 1)

In [5]:
#checking if there is any duplicate poem in the dataset
data.duplicated(subset=['Poem']).sum()

0

Extracting Poem and Category column in a new dataframe so that we can work with it as these two are essential in classification

In [6]:
df = data[['Poem','Category']]

In [7]:
df.head()

Unnamed: 0,Poem,Category
0,\r\r\nThe Ant thinks about her nieces and neph...,Nature
1,Unveiled in the light of the rising dawn itsel...,Nature
2,\r\r\r\n\twithin the woods I wandered\r\r\r\n\...,Nature
3,\r\r\nWe shall sail through the air a thousand...,Nature
4,\r\r\n→ Show additional poem info\r\r\r\n\tWe ...,Nature


In [8]:
df.describe()

Unnamed: 0,Poem,Category
count,558,559
unique,558,4
top,\r\r\nDon’t buy from Messrs Wood & co\r\r\nThe...,Nature
freq,1,174


In [9]:
#checking the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Data columns (total 2 columns):
Poem        558 non-null object
Category    559 non-null object
dtypes: object(2)
memory usage: 8.8+ KB


In [10]:
#checking a single random poem to get the idea of what things we need to remove
df["Poem"][134]

'\r\r\r\n\tThis year the Murray came down in flood\r\r\r\n\tFor years we\'ve waited for this good.\r\r\r\n\tThe river red gums stand and wait\r\r\r\n\tfor the waters to reach our state.\r\r\r\n\tVictoria gets theirs and will divert\r\r\r\n\twater to wetlands far up stream.\r\r\r\n\tThese patient trees wait looking inert,\r\r\r\n\twaiting for the water of their dreams.\r\r\r\n\tSo many of them were chopped down\r\r\r\n\tin the paddle steamer days felled\r\r\r\n\tto transport goods to rail heads,\r\r\r\n\tthe river water is muddy brown.\r\r\r\n\tCockles in far off days were shelled\r\r\r\n\tas the aborigines foraged in reed beds.\r\r\r\n\tThe old gum trees remember those days\r\r\r\n\tas native and feral animals graze\r\r\r\n\tWater for pasture and orange groves\r\r\r\n\tsome of their wood harvested for the stoves.\r\r\r\n\tIn the hollow of the old gum tree\r\r\r\n\tsqwarks baby galahs waiting for tea.\r\r\r\n\tThe parents nest in a deep hollow\r\r\r\n\tthat sometimes a possum will want 

>As we can see here that there are several newline character and other things such as date, genre names, Copyright symbols etc.
We need to remove these unwanted characters because they are not useful in classification.
Now we are going to preprocess this data

# PreProcessing data

In [11]:
#removing all unwanted characters
chars_to_replace = ['\r\n', '\xa0','(\xA9|&copy;)\s+\w{2}\s+\w{3}\s+\d{2}\s+\d{4}\s+\d{2}:\d{2}\s+\w[am|pm|AM|PM]','\r\n\r\r\n\t','\t']
df['Poem'] = df.Poem.replace(['\r\n', 
                              '\xa0',
                              '(\xA9|&copy;)\s+\w{2}\s+\w{3}\s+\d{2}\s+\d{4}\s+\d{2}:\d{2}\s+\w[am|pm|AM|PM]',
                              '\r\n\r\r\n\t',
                              '\t',
                              '(\r)+',
                              '(\s){2,}',
                              '.((\w)+•\s)+(\w)+',
                              '((\s+)?\w+\s+(\•)?\s+\w+)+',
                              '(•)+',
                              '(\w)+•\s+\W*\w*',
                              '\d+',
                              '^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$'], ' ', regex=True)

#Removing Punctuations
df.Poem = df['Poem'].str.replace('[{}]'.format(string.punctuation), '')

#converting to lower case
df = df.apply(lambda x: x.astype(str).str.lower())

#removing stop words
stop_words = set(stopwords.words('english'))
df['Poem'] = df['Poem'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [12]:
#checking a single random poem after preprocessing
df["Poem"][134]

'year murray came flood years weve waited good river red gums stand wait waters reach state victoria gets divert water wetlands far stream patient trees wait looking inert waiting water dreams many chopped paddle steamer days felled transport goods rail heads river water muddy brown cockles far days shelled aborigines foraged reed beds old gum trees remember days native feral animals graze water pasture orange groves wood harvested stoves hollow old gum tree sqwarks baby galahs waiting tea parents nest deep hollow sometimes possum want borrow bark spiders hide larder making birds eating work much harder old gum trees home many would sad werent audrey christophersen year murray came flood years weve waited good river red gums stand wait waters reach state victoria gets divert water wetlands far stream patient trees wait looking inert waiting water dreams many chopped paddle steamer days felled transport goods rail heads river water muddy brown cockles far days shelled aborigines foraged

> Now we have preprocessed our data, we can move to the most important part of classification
Here I'm just performing a simple classification and that's why we can expect a very low accuracy.
For improving this accuracy we can use several Advance NLP techniques. 

In [13]:
#Importing Required Libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit

In [18]:
train_X, test_X, train_y, test_y = train_test_split(df['Poem'], df['Category'], test_size = 0.3)

In [19]:
#checking the shape of our training and testing data
print('Shape of training data(Poems) : ' + str(train_X.shape))
print('Shape of training data(Category) : ' + str(train_y.shape))
print('Shape of testing data(Poems) : ' + str(test_X.shape))
print('Shape of training data(Category) : ' + str(test_y.shape))

Shape of training data(Poems) : (391,)
Shape of training data(Category) : (391,)
Shape of testing data(Poems) : (168,)
Shape of training data(Category) : (168,)


# Multinomial Naive Bayes

In [20]:
#creating a pipeline so that all the steps can be combined
clf = Pipeline([ 
                    ('vect',TfidfVectorizer()),
                    ('clf', MultinomialNB()) 
                      ])

In [21]:
clf.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [22]:
predicted = clf.predict(test_X)

In [23]:
predicted

array(['nature', 'nature', 'humor', 'nature', 'nature', 'humor', 'humor',
       'nature', 'nature', 'nature', 'nature', 'humor', 'humor', 'nature',
       'nature', 'nature', 'nature', 'nature', 'humor', 'nature',
       'nature', 'nature', 'nature', 'nature', 'nature', 'nature',
       'nature', 'nature', 'humor', 'nature', 'nature', 'nature', 'humor',
       'nature', 'humor', 'nature', 'nature', 'humor', 'humor', 'nature',
       'nature', 'nature', 'nature', 'nature', 'nature', 'nature',
       'nature', 'nature', 'nature', 'nature', 'nature', 'nature',
       'nature', 'nature', 'nature', 'humor', 'nature', 'nature',
       'nature', 'nature', 'humor', 'nature', 'nature', 'nature', 'humor',
       'nature', 'nature', 'nature', 'nature', 'nature', 'nature',
       'nature', 'nature', 'nature', 'nature', 'nature', 'nature',
       'nature', 'nature', 'humor', 'humor', 'nature', 'nature', 'humor',
       'nature', 'nature', 'humor', 'nature', 'humor', 'nature', 'nature',
       'nat

In [24]:
print('Accuracy  is '+ str(np.mean(predicted == test_y)))
print(classification_report(test_y, predicted))

Accuracy  is 0.4583333333333333
              precision    recall  f1-score   support

       humor       0.77      0.56      0.65        43
        love       0.00      0.00      0.00        40
      nature       0.39      1.00      0.56        53
   spiritual       0.00      0.00      0.00        32

   micro avg       0.46      0.46      0.46       168
   macro avg       0.29      0.39      0.30       168
weighted avg       0.32      0.46      0.34       168



  'precision', 'predicted', average, warn_for)


# Using Support Vector Machine

In [25]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(train_X, train_y)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [26]:
y_pred = sgd.predict(test_X)
print('accuracy %s' % accuracy_score(y_pred, test_y))
print(classification_report(test_y, y_pred))

accuracy 0.5416666666666666
              precision    recall  f1-score   support

       humor       0.59      0.67      0.63        43
        love       0.50      0.30      0.37        40
      nature       0.51      0.79      0.62        53
   spiritual       0.62      0.25      0.36        32

   micro avg       0.54      0.54      0.54       168
   macro avg       0.55      0.50      0.50       168
weighted avg       0.55      0.54      0.51       168

