In [1]:
#importing important libraries
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tit4n/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
#importing dataset
reviews = load_files('txt_sentoken/')
#note it will loop through all the different directories contained in this txt_sentoken folder and generate different
#classes

In [3]:
#now we need to seperate the class and the documents i.e., we need documents in a seperate list and we also need
#corresponding classes to be in another list
X,y = reviews.data,reviews.target

In [5]:
y
#here 0 is for neg class because it was first folder in txt_senttoken so data collection is done
#note reviews are in X and different classes are in y

array([0, 1, 1, ..., 1, 0, 0])

In [11]:
#note if you use load_files for a very large dataset it will take more time for that what we can do is we can store
#this X and y as pickle file in python
#storing as pickel files
with open('X.pickle','wb') as f: #note these are byte type files that is why wb
    pickle.dump(X,f)
#note this will generate a X.pickle file in the working directory

In [12]:
with open('Y.pickle','wb') as f:
    pickle.dump(y,f)
#note this is how you persist the different py objects as pickle file and we are going to do the same thing when we
#are going to build our classifier because we want to persist our classifier as well for using that at a later point
#of time

In [14]:
#now we will see how to retrieve object stores in the pickle file
#unpickling a data set
with open('X.pickle','rb') as f:
    X = pickle.load(f)
    
with open('Y.pickle','rb') as f:
    y = pickle.load(f)
#right now nothing is going to happen because we alreaded have these files in the memory but we can pretty much load 

In [15]:
#preprocessing the data from dataset
corpus = [] #this will be a list of documents
for i in range (0,len(X)):
    review = re.sub(r'\W',' ',str(X[i])) #substituting all non word characters with ' '
    review = review.lower() #lowering everything
    review = re.sub(r'\s+[a-z]\s+',' ',review) #removing all single characters beacuse they are not important
    review = re.sub(r'^[a-z]\s+',' ',review) #if a sentence starts with single character it will remove it
    review = re.sub(r'\s+',' ',review) #this is to remove any extra spaces we generated
    corpus.append(review)

In [16]:
#creating the model
#we are going to create the BOW model and then convert it into TFIDF model
from sklearn.feature_extraction.text import CountVectorizer #to create a simple BOW model
#creating the object of CountVectorizer
vectorizer = CountVectorizer(max_features = 2000, min_df=3, max_df=0.6, stop_words = stopwords.words('english'))
#when we build the BOW after creating histogram we have to filter it and select the top n features/words and when we 
#will be building some real world models those models will contain a lot of words so we choose 2000 here it means we
#want 2000 most frequent words as features and we will exclude all the rest.
#min_df = min document frequency when it will select words from histogram then it will exclude all those words that
#appear in 3 documents or less than 3 documents. max_df =max document frequency here we have specified it as percent
#0.6 means we are going to exclude all the different words that appear in 60% of the documents or more than that
#so we are only focused towards getting the most important words from corpus. stop_words are also specified here we
#are telling the CountVectorizer that you will exclude all the words that are contained in this list of stop words.
#so after creating the histogram, when it is filtering then it will do these 3 things.

X = vectorizer.fit_transform(corpus).toarray()

In [17]:
print(X)
#we can see in variable explorer it is a 2000*2000 array
#2000 rows is because we have 2000 positive or negative documents in the corpus and 2000 columns mean 2000 diffrent 
#features. Note: max_features was specified to be 2000 hence 2000 columns

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [2 0 1 ... 0 0 0]]


In [18]:
#now we will transform this to TFIDF model using prebuilt class
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12007883, 0.        , 0.06321361, ..., 0.        , 0.        ,
        0.        ]])

In [19]:
#now we are going to divide this dataset into 2 seperate datasets 1 we will use for traning our model and the other
#for testing the model performance
#so we will use 1600 for training the model and about 400 for testing using sklearn

from sklearn.model_selection import train_test_split
text_train,text_test,sent_train,sent_test = train_test_split(X,y,test_size = 0.2,random_state = 0) 
#y is list of different classes i.e., 2000 classes with 0's and 1's 0 is negative 1 is positive so for each of the 
#rows of X we have a y. Here test_size = 0.2 means 80 percent will be for training and 20 percent for testing
#its like seed variable for random in c++,jave,etc so anytime we use same seed variable we will get same result
#this is going to return 4 different values where text_train is the text or the list of documents that we are going
#to be using for training the whole document, text_test is for testing, sent_train are the different sentiment classes
#associated with the text_train and sent_test are the different sentiment classes associated with text_test

In [20]:
str(len(text_train)) + "x" +str(len(text_train[0])) #this is done to show dimentions of the matrix
#so here we can see it contains about 1600 different rows and 2000 features

'1600x2000'

In [21]:
str(len(text_test)) + "x" +str(len(text_test[0])) #this is done to show dimentions of the matrix
#so here we can see it contains about 400 different rows and 2000 features
#so no. of features are constant but we are varying the no. of rows

'400x2000'

In [22]:
#now we are going to use the Logistic Regression ML Algorithm for fitting this whole model and building the classifier
#now to use logistic regression we need to import logistic regression class from sklearn
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression() #making object 
#now we will train our model
classifier.fit(text_train,sent_train)

LogisticRegression()

In [23]:
#now we are going to test our model using test set
#to store all the predictions
sent_pred = classifier.predict(text_test)
sent_pred

array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,

In [24]:
#now note that the result we got is the prediction and we also have actual values in sent_test now we want to compare
#the two to find how much accurate our model is compare to humans
# to do that we have a very good function in sklearn
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(sent_test,sent_pred)
cm
#Here through the columns we have the actual values and for the rows we have predicted values
#Visualizing it:
#   |  0  |  1
#  0| 166 | 42
#  1| 27  | 165
#so what it means is for 166 predictions were really negative and our model has also predicted them correctly
#and 27 were actually negative but our model has predicted them to be positive. Now similarly for the positive classes
#42 results were actually poitive but our model predicted it wrong and 165 were negative which it predicted correctly.

array([[168,  40],
       [ 21, 171]])

In [25]:
#finding the accurary rate we just need to add them to do this
cm[0][0] + cm[1][1] #adding correctly predicted values
#so we have 331 correct predictions out of 400

339

In [27]:
331/4  #(331/400)*100 simple percentage calculation
#this is the accuracy we got. So, our model is 82.75 % accurate. Whis is pretty go more than 80 percent that too with
#only 2000 rows of documents. So if we had like 100k or 200k we will get much higher accuracy or if we change
#max_features value we will also get much higher accuracy

82.75

In [28]:
#now we are going to save our model and vectorizer so that we can use it later. So, we will save this model as a
#pickle file and we will import this classifier later in another project.

#Pickling the classifier
with open('classifier.pickle','wb') as f:
    pickle.dump(classifier,f) #note we did this earlier

In [30]:
#now we also need to dump another thing because later we can't just do classifier.predict("aa bb") because it expects
#that we pass it a vectorized input with 2k features because we have trained it like that so to do that we will also
#save the vectorizer but here we haven't use a TFIDF vectorize. Here we have just used a count vectorizer and then
#transformed it using a TFIDF transformer. So, if we want to use this stratergy we have to save both the transformer
#and vectorizer as pickle file. Just to make it simple we will vectorize this whole corpus again using the TFIDF
#vectorizer. So we are not going to train the model again we will just tfidf vectorize it.
#Note here we are just going to repeate few thing we did earlier.

#unpickling the dataset previously stored in pickle files
with open('X.pickle','rb') as f:
    X = pickle.load(f)
    
with open('Y.pickle','rb') as f:
    y = pickle.load(f)

#Creating the corpous
corpus = [] #this will be a list of documents
for i in range (0,len(X)):
    review = re.sub(r'\W',' ',str(X[i])) #substituting all non word characters with ' '
    review = review.lower() #lowering everything
    review = re.sub(r'\s+[a-z]\s+',' ',review) #removing all single characters beacuse they are not important
    review = re.sub(r'^[a-z]\s+',' ',review) #if a sentence starts with single character it will remove it
    review = re.sub(r'\s+',' ',review) #this is to remove any extra spaces we generated
    corpus.append(review)

#Feeding it to tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000, min_df=3, max_df=0.6, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
#note here we haven't retrained the model or retested it.

In [31]:
#now we will store this new vectorizer
#Pickling the vectorizer
with open('tfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer,f)

In [32]:
#now we will see 2 new files in the pwd
#classifier.pickle and tfidfmodel.pickle
#now we can use these files in our sentiment anlysis at a later point in time.
#now we will import our saved classifier and vectorizer and we will predict weather a simple sentence is positive or
#negative. 

#note usually we will do this in a seperate file where we haven't trained our model but this is just an example.
#Unpickling classifier
with open('classifier.pickle','rb') as f:
    clf = pickle.load(f)
clf

LogisticRegression()

In [33]:
#Unpickling vectorizer
with open('tfidfmodel.pickle','rb') as f:
    tfidf = pickle.load(f)
tfidf

TfidfVectorizer(max_df=0.6, max_features=2000, min_df=3,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [37]:
#now lets try using it. On a simple sentence.
sample = ["you are a good person"]
sample = tfidf.transform(sample).toarray()
#note here we are not fitting rather we are just transforming based on already fitted corpous
#what this will do is it will make sample seem like a document from corpous
print(clf.predict(sample))
#1 means positive so it has predicted it correctly

[1]


In [39]:
#doing some more predictions
sample = ["you are a bad person"]
sample = tfidf.transform(sample).toarray()
print(clf.predict(sample))
#0 means negative so it has predicted it correctly

[0]


In [42]:
sample = ["i hate you"]
sample = tfidf.transform(sample).toarray()
print(clf.predict(sample))
#wrong prediction

[1]
