In [1]:
import os
import random
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import string
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords , wordnet
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.stem import WordNetLemmatizer

About the Dataset :-
The dataset contains the Poem itself , Name of the Author , Poem Name , Age & Genre 
The dataset contains 573 Poems.
The total number of words excluding stop words are about 42,000

In [2]:
data = pd.read_csv('all.csv')

In [3]:
# print(data.loc[:,'content'][0].lower())

In [4]:
Y = data.iloc[:,4]

In [5]:
targets = []
for genre in Y:
    if (genre == "Love"):
        targets.append(1)
    elif (genre == "Nature"):
        targets.append(2)
    else : targets.append(0)

In [6]:
# Data in form of target values
targets = np.array(targets)
targets.shape

(573,)

In [7]:
# Raw poem data without any preprocessing
X = data.iloc[:,1]
X = np.array(X)
X.shape

(573,)

In [8]:
# Fetching stopwords 
stops = stopwords.words('english')
punctations = list(string.punctuation)
stops += punctations

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

In [11]:
def clean_poems(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [12]:
tokenized = []
for i in range(len(X)):
    tokenized.append(word_tokenize(X[i]))

In [13]:
documents = []
for i in range(len(tokenized)):
    documents.append((tokenized[i],targets[i]))

In [14]:
#CLeaning all poems
documents = [(clean_poems(document) , y) for document ,y in documents]

In [15]:
X_clean = [ " ".join(document) for document,category in documents]

In [76]:
x_train, x_test, y_train, y_test = train_test_split(X_clean, targets, random_state= 42 ,test_size=0.20)

In [101]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state= 1,test_size=0.25)

In [109]:
count_vec = CountVectorizer(max_features = 800 , ngram_range = (1,2))
x_train_features = count_vec.fit_transform(x_train)

In [110]:
x_test_features = count_vec.transform(x_test)
x_val_features = count_vec.transform(x_val)

In [111]:
# Classification using SkLearn Classifiers.
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [112]:
rfc= RandomForestClassifier(n_estimators=50, max_depth= 6)
rfc.fit(x_train_features, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [113]:
print("Training Score: " , end = "")
print(rfc.score(x_train_features, y_train))
print("Validation Score: " , end = "")
print(rfc.score(x_val_features, y_val))
print("Testing Score: " , end = "")
print(rfc.score(x_test_features, y_test))

Training Score: 0.7738095238095238
Validation Score: 0.6140350877192983
Testing Score: 0.7043478260869566


In [114]:
mnb = MultinomialNB()
mnb.fit(x_train_features, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [108]:
print("Training Score: " , end = "")
print(mnb.score(x_train_features, y_train))
print("Validation Score: " , end = "")
print(mnb.score(x_val_features, y_val))
print("Testing Score: " , end = "")
print(mnb.score(x_test_features, y_test))

Training Score: 0.8928571428571429
Validation Score: 0.6491228070175439
Testing Score: 0.7391304347826086
