#### Importing the libraries

In [20]:
import pandas as pd
import numpy as np

In [21]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [23]:
from collections import defaultdict

Set the random seed, in order to guarentee reproducability across runs (consistency of results)

In [24]:
#Set Random seed
np.random.seed(7)

#### Loading the corpus

In [25]:
Corpus = pd.read_csv("corpus.csv",encoding='latin-1')

In [38]:
print(type(Corpus))
print(Corpus.shape)
print(Corpus[:10])

Corpus = Corpus[:1000]
print(Corpus.shape)

<class 'pandas.core.frame.DataFrame'>
(1000, 3)
                                                text        label  \
0  [stuning, even, for, the, non-gamer, :, this, ...  __label__2    
1  [the, best, soundtrack, ever, to, anything, .,...  __label__2    
2  [amazing, !, :, this, soundtrack, is, my, favo...  __label__2    
3  [excellent, soundtrack, :, i, truly, like, thi...  __label__2    
4  [remember, ,, pull, your, jaw, off, the, floor...  __label__2    
5  [an, absolute, masterpiece, :, i, am, quite, s...  __label__2    
6  [buyer, beware, :, this, is, a, self-published...  __label__1    
7  [glorious, story, :, i, loved, whisper, of, th...  __label__2    
8  [a, five, star, book, :, i, just, finished, re...  __label__2    
9  [whispers, of, the, wicked, saints, :, this, w...  __label__2    

                                          text_final  
0  ['stun', 'even', 'sound', 'track', 'beautiful'...  
1  ['best', 'soundtrack', 'ever', 'anything', 're...  
2  ['amaze', 'soundtrack', 

#### Data Pre-processing
This will help in getting better results through the classification algorithms

In [39]:
# Step - 1a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)

In [40]:
# Step - 1b : Change all the text to lower case. This is a normal procedure as it helps "normalizing" the text.
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

AttributeError: 'list' object has no attribute 'lower'

In [41]:
# Step - 1c : Tokenization : Each sample (text chunk) from the corpus is broken into a set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

TypeError: expected string or bytes-like object

In [None]:
# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. 
#By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [42]:
first_sample = Corpus['text'][0]
print(first_sample)

['stuning', 'even', 'for', 'the', 'non-gamer', ':', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 'hate', 'video', 'game', 'music', '!', 'i', 'have', 'played', 'the', 'game', 'chrono', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'it', 'backs', 'away', 'from', 'crude', 'keyboarding', 'and', 'takes', 'a', 'fresher', 'step', 'with', 'grate', 'guitars', 'and', 'soulful', 'orchestras', '.', 'it', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen', '!', '^_^']


In [43]:
Final_words=[]
# Initializing WordNetLemmatizer()
word_Lemmatized = WordNetLemmatizer()
# pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
pos_tag_result = pos_tag(first_sample)
print(pos_tag_result)
for word, tag in pos_tag_result:
    # Below condition is to check for Stop words and consider only alphabets
    if word not in stopwords.words('english') and word.isalpha():
        word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
        Final_words.append(word_Final)
# The final processed set of words for each iteration will be stored in 'text_final'
print('text_final',str(Final_words))

[('stuning', 'VBG'), ('even', 'RB'), ('for', 'IN'), ('the', 'DT'), ('non-gamer', 'JJ'), (':', ':'), ('this', 'DT'), ('sound', 'NN'), ('track', 'NN'), ('was', 'VBD'), ('beautiful', 'JJ'), ('!', '.'), ('it', 'PRP'), ('paints', 'VBZ'), ('the', 'DT'), ('senery', 'NN'), ('in', 'IN'), ('your', 'PRP$'), ('mind', 'NN'), ('so', 'RB'), ('well', 'RB'), ('i', 'VB'), ('would', 'MD'), ('recomend', 'VB'), ('it', 'PRP'), ('even', 'RB'), ('to', 'TO'), ('people', 'NNS'), ('who', 'WP'), ('hate', 'VBP'), ('video', 'NNS'), ('game', 'NN'), ('music', 'NN'), ('!', '.'), ('i', 'NN'), ('have', 'VBP'), ('played', 'VBN'), ('the', 'DT'), ('game', 'NN'), ('chrono', 'NN'), ('cross', 'NN'), ('but', 'CC'), ('out', 'IN'), ('of', 'IN'), ('all', 'DT'), ('of', 'IN'), ('the', 'DT'), ('games', 'NNS'), ('i', 'VBP'), ('have', 'VBP'), ('ever', 'RB'), ('played', 'VBN'), ('it', 'PRP'), ('has', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('music', 'NN'), ('!', '.'), ('it', 'PRP'), ('backs', 'VBZ'), ('away', 'RB'), ('from', 'IN'), ('c

In [44]:
#Check if the dataset was already processed before, as it takes several minutes. This is a good practice.

In [45]:
import pickle
import os
import os.path
if os.path.isfile("processed_corpus.pickle"):
    with open('processed_corpus.pickle', 'rb') as f:
        Corpus = pickle.load(f)
else:
    for index,entry in tqdm(enumerate(Corpus['text'])):
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = []
        # Initializing WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
        # The final processed set of words for each iteration will be stored in 'text_final'
        Corpus.loc[index,'text_final'] = str(Final_words)
        
    with open("processed_corpus.pickle", "wb") as f:
        pickle.dump(Corpus, f)

NameError: name 'tqdm' is not defined

In [46]:
print(Corpus['text_final'].head())

0    ['stun', 'even', 'sound', 'track', 'beautiful'...
1    ['best', 'soundtrack', 'ever', 'anything', 're...
2    ['amaze', 'soundtrack', 'favorite', 'music', '...
3    ['excellent', 'soundtrack', 'truly', 'like', '...
4    ['remember', 'pull', 'jaw', 'floor', 'hear', '...
Name: text_final, dtype: object


In [47]:
print(Corpus[:5])

                                                text        label  \
0  [stuning, even, for, the, non-gamer, :, this, ...  __label__2    
1  [the, best, soundtrack, ever, to, anything, .,...  __label__2    
2  [amazing, !, :, this, soundtrack, is, my, favo...  __label__2    
3  [excellent, soundtrack, :, i, truly, like, thi...  __label__2    
4  [remember, ,, pull, your, jaw, off, the, floor...  __label__2    

                                          text_final  
0  ['stun', 'even', 'sound', 'track', 'beautiful'...  
1  ['best', 'soundtrack', 'ever', 'anything', 're...  
2  ['amaze', 'soundtrack', 'favorite', 'music', '...  
3  ['excellent', 'soundtrack', 'truly', 'like', '...  
4  ['remember', 'pull', 'jaw', 'floor', 'hear', '...  


In [48]:
# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

In [49]:
# Step - 3: Label encode the target variable
#This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [50]:
# Step - 4: Vectorize the words by using TF-IDF Vectorizer
#This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [51]:
# Step - 5: Run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  81.33333333333333


In [55]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.LinearSVC()
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
distances = SVM.decision_function(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(list(zip(predictions_SVM,distances))[:5])

SVM Accuracy Score ->  80.0
[(1, 0.1377201131403633), (1, 0.690135622866267), (0, -0.19818765024275947), (0, -0.44803649213405816), (0, -0.7789136954150734)]


Code largely based on https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34