In [1]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
import re
np.set_printoptions(precision = 2)

# In the following Notebook, We will make a Sentiment Analysis from scratch. 
###### we will be designing a word2vec and tfidf class that mimic countvectorizer and TfidfTransformer from sklearn.feature_extraction.text.They are not made to serve as a definitive replacement for the two original functions, but rather to help us deepen our understanding of how a countvectorizer and tfidftransformer work.
###### Important information about the Dataset (taken from the dataset creator) : 
    1-no more than 30 reviews are allowed for any given movie because reviews for the same movie tend to have correlated ratings.
    2-train and test sets contain a disjoint set of movies.
    3-In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10.

# The First Task is to turn the reviews from a text file to a dataframe

In [2]:
def review_extraction(directory):
    '''
    extracting reviews from a directory
    '''
    reviews = []
    filenames = os.listdir(directory)
    for filename in filenames:
        with open(directory + filename,'r') as f:
            reviews.append(f.read() )
    return reviews

positive_dir = './train/pos/'
positive_reviews = review_extraction(positive_dir)

negative_dir = './train/neg/'
negative_reviews = review_extraction(negative_dir)

In [3]:
from sklearn.utils import shuffle
reviews0 = pd.DataFrame(np.c_[negative_reviews,np.zeros(len(negative_reviews),dtype = 'uint8')],
                        columns = ['reviews','sentiment'])

reviews1 = pd.DataFrame(np.c_[positive_reviews,np.ones(len(negative_reviews),dtype = 'uint8')],
                        columns = ['reviews','sentiment'])

reviews_train = reviews1.append(reviews0,ignore_index = True)
reviews_train = shuffle(reviews_train) # shuffling them to not keep ones at first and then zeros

In [4]:
reviews_train.head(5)# taking a look at our data

Unnamed: 0,reviews,sentiment
17065,"I felt brain dead, I'll tell you. This is the ...",0
14262,It's a ghost story. It's a cannibalism story. ...,0
1369,"The movie held my interest, mainly because Dia...",1
13567,I own almost every Seagal movie (yes even ones...,0
10883,Touching and sad movie. Portrays the trials an...,1


# Now, we will start making a Word2Vec class from scratch that will serve the purpose of this Project

In [5]:
class Word2Vec:

    def __init__(self):
        self.doc = []
        self.dico = {}
        self.text = [] #mostly used for storage of a past self

    def vocabulary(self):
        '''
        arr : list of text
        return dictionary of vocabulary 
        '''
        return self.dico
    
    def purge_alph(self):
        '''
        remove non alpha characters from doc

        '''
        new_doc = []
        for word in self.doc:
            regex = re.compile('[^a-zA-Z]')
            new_doc += regex.sub(' ', word).split()
        return new_doc
    
    def update(self,new_doc,arr):
        '''
        updating values of doc and dico and text
        '''
        self.doc = sorted(list(set(new_doc)))
        self.dico = dict(np.c_[self.doc,np.arange(len(self.doc))])
        self.text = self.text + list(arr)
        
    def to_array(self,arr = None,regex= re.compile('[^a-zA-Z]')):
        '''
        turning array of text to array of integers from the vocabulary,
        if a word is not in the vocabulary, it will return an error (you can remove this assertion)
        arr : list of texts, if none we use self.text
        
        you may notice 3 for loops, but the third one is only used to separate words that are separated 
        by a special character, so most of the time it only has one iteration.
        '''
        if type(arr) == type(None):
            arr = self.text
        final_vector = []
        for text in arr:
            vector = np.zeros(len(self.doc),dtype = 'int')
            for words in text.split(' '):
                words = regex.sub(' ', words).split()
                for word in words: #special cases when a special character is between two words for example ok.hello 
                                   # will become ok and hello.
                    assert word in self.doc , f'{word} not in Vocabulary'  # feel free to remove this assertion
                    try:
                        index = int(self.dico[word])
                        vector[index] +=1
                    except:
                        pass
            final_vector.append(list(vector))
        return final_vector
                    
        
    def fit(self, arr):
        '''
        fits docs and add new vocabularies
        '''
        unique = self.doc
        for text in arr:
            txt = text.split()
            unique = unique + list(set( txt ))
            
        self.doc  = sorted(list(set(unique)))
        self.dico = dict(np.c_[self.doc,np.arange(len(self.doc))])
        new_doc = self.purge_alph()
        self.update(new_doc,arr)
        return self
        
    

In [6]:
docs = np.array(['the sun is shining',
                'the weather is sweet',
                'the sun is shining, the weather is sweet, and one and one is two'])
test= Word2Vec()
test.fit(docs)
test.vocabulary()
test.to_array(docs)

'''
or 

'''
test= Word2Vec()
bag = test.fit(docs)
#bag.to_array()

None

## Now let's define some norms before designing tf-idf :

##### to make a normalization, we choose one norm and divide the elements of the array by its resulting value. the examples given  belows are just one dimensional, but we usually face cases with multidimensional matrices, then you will need to choose what axis to choose for normalization

l1 : given an array, we are adding up the absolute value of its element, and the resulting number is the l1 norm.

---example : $$\text{L1} ([1,2,3,4]) = |1| + |2| + |3| + |4| = 10$$

l2 : given an array, we are adding up the square of its element, and the square root of the resulting number is        the l2 norm.

-- example : $$\text{L2} ([1,2,3,4])= \sqrt{1^2 + 2^2 + 3^2 + 4^2} = \sqrt{30}$$

l2 : given an array, we are taking the maximum value of the absolute values of its elements

-- example : $$\text{maxnorm} ([1,2,3,-4])= max(1|,|2|,|3|,|-4|) = 4$$ 

In [7]:
class Normalizations:
    ''' 
    l1/l2/maxnorm normalizations. 
    Be aware, it doesn't catch zero division. you can improve it to catch zero vectors in the re_shape method helper.
    ''' 
    def __init__(self,matrix):
        self.matrix = np.array(matrix)        
    def re_shape(self,norm,axis):
        '''
        reshapes the norm to fit the matrix
        '''
        if axis ==1:
            resultant = np.repeat(norm,self.matrix.shape[axis]).reshape(self.matrix.shape)
        elif axis==0:
            resultant = np.repeat(norm,self.matrix.shape[axis]).reshape(self.matrix.shape[::-1]).T
        return resultant
    def l1_normalization(self,axis=1):
        '''
        returns l1-normalized matrix
        '''
        l1_values = np.sum(abs(self.matrix),axis)
        resultant = self.re_shape(l1_values,axis)
        l1 = self.matrix / resultant
        return l1
    def l2_normalization(self,axis =1):
        '''
        returns l2-normalized matrix
        '''
        l2_values = np.sqrt(np.sum(self.matrix**2,axis))
        resultant = self.re_shape(l2_values,axis)
        l2 = self.matrix / resultant
        return l2
    def maxnorm_normalization(self,axis =1):
        '''
        returns maxnorm-normalized matrix
        '''
        maxnorm_values = abs(self.matrix).max(axis)
        resultant = self.re_shape(maxnorm_values,axis)
        maxnorm = self.matrix / resultant
        return maxnorm
        

# Term Frequency and Inverse Document Frequency :

$ \text{raw term frequency tf(t,d)---number of time a term t occurs in a document d} $
$ \text{Inverse document frequency idf(t,d)---number of documents d that contains the term t} $

$ \text{ tf(t,d) = } \frac{n_t}{n_w}  \text{ ,with :} \bigg\{_{n_w \text{the total number of terms in the document}}^{n_t \text{count of t appears in a document}} $  
$ \text{idf(t,d) = } log_e(\frac{n_d}{n_{dt}})+1 \text{ ,with :} \bigg\{^{n_d \text{the total number of documents}}_{n_{dt} \text{the number of documents with term t in it}}$
$ \text{We are using the logarithm to ensure that low document frequencies are not given too much weight.} $

$ \text{ tfidf(t,d) = tf(t,d) * idf(t,d) }$

In [8]:
class tf_idf:
    def __init__(self,matrix,axis = 1):
        self.matrix = np.array(matrix)
        self.axis = axis

    def _Norm(self,tf,norm):
        '''
        Normalize the matrix with the chosen norm
        '''
        if norm == 'l2':
            tf = Normalizations(tf).l2_normalization()
        elif norm == 'l1':
            tf = Normalizations(tf).l1_normalization()
        elif norm == 'maxnorm':
            tf = Normalizations(tf).maxnorm_normalization()
        return tf
    def tfidf(self,norm = 'l2',idf = True,smooth = True):
        '''
        return matrix adjusted with tfidf
        '''
        tf = self.tf(norm,idf,smooth)
        if idf:
            tfidf = tf * self.idf(smooth)
        else:
            tfidf = tf
        tfidf = self._Norm(tfidf,norm)
        return tfidf
    def tf(self,norm = 'l2',idf = True,smooth = True):
        '''
        returns tf
        '''
        nw = self.matrix.shape[self.axis] #total number of terms in doc
        resultant = np.ones(self.matrix.shape) * nw
        tf = self.matrix / nw
        tf = self._Norm(tf,norm)
        return tf 
    
    def idf(self,smooth):
        '''
        returns idf
        '''
        nd = self.matrix.shape[1-self.axis] #total number of documents
        ndt = nd - np.sum(self.matrix == 0 ,1-self.axis) # number of documents with term t
        idf = np.log((nd)/(ndt)) +1 
        if smooth: # prevents zero divisions
            idf = np.log((nd+1)/(ndt+1)) +1 # we add on            
        if self.axis == 0:
            idf= np.repeat(idf,self.matrix.shape[1]).reshape(self.matrix.shape)
        return idf

In [9]:
test = tf_idf(bag.to_array(),1)
test.tfidf(idf = True,norm = 'l2',smooth = True)

array([[0.  , 0.43, 0.  , 0.56, 0.56, 0.  , 0.43, 0.  , 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.  , 0.56, 0.43, 0.  , 0.56],
       [0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19]])

# Let's do some data cleaning : 
##### preprocessing the reviews to remove html tags,  and moving emoticons to the end.

In [10]:
def preprocessor(txt):
    '''
    light preprocessing of the text
    '''
    txt+= ' ' # adding space for the emogies not to stick to last word
    # removing html tags
    txt = re.sub('<[^>]*>',' ', txt) 

    # puting emoticons to the end and puting everything to lowercase
    em_reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
    emoticons = re.findall(em_reg,txt)
    txt = re.sub(em_reg ,' ', txt.lower() ) + ' '.join(emoticons)
    return txt
# let's test it
preprocessor('<p> hello :) :p :/ haha </p>')

'  hello       haha   :) :p :/'

In [11]:
porter = PorterStemmer()
# to improve the tokenizer, we will also be removing stopwords such as I,me...
nltk.download('stopwords') # downloading stopwords from nltk
from nltk.corpus import stopwords

def tokenizer_porter(text,stop = None):
    if type(stop) == type(None):
        return [porter.stem(word) for word in text.split()] 
    else :
        return [porter.stem(word) for word in text.split() if word not in stop] 
stop = stopwords.words('english')


[nltk_data] Downloading package stopwords to /home/elarbi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# let's test it
test = 'I know a swimmer who is swimming in a swimming pool full of swimmers'
print('Removing stop words : ' , tokenizer_porter(test,stop))
print('Not removing stop words : ' , tokenizer_porter(test))

Removing stop words :  ['I', 'know', 'swimmer', 'swim', 'swim', 'pool', 'full', 'swimmer']
Not removing stop words :  ['I', 'know', 'a', 'swimmer', 'who', 'is', 'swim', 'in', 'a', 'swim', 'pool', 'full', 'of', 'swimmer']


In [13]:
# applying the preprocessing on all of the reviews
def preprocessing(df,col = 'reviews'):
    df[col] = df[col].apply(preprocessor)
    df[col] = df[col].apply(lambda x : ' '.join(tokenizer_porter(x,stop)) ) #parsing reviews
preprocessing(reviews_train)    
reviews_train.head(3)

Unnamed: 0,reviews,sentiment
17065,"felt brain dead, i'll tell you. worst film eve...",0
14262,ghost story. cannib story. reveng story. poorl...,0
1369,"movi held interest, mainli diann keaton favori...",1


In [14]:
from IPython.display import clear_output

In [15]:
# let's put everything together
# the process below is my understanding of how tfidfVectorizer works
word2vec = Word2Vec() # defining the word2vec
rev_num = 0
rev_tot = reviews_train.shape[0]
X_mine = []

iteration = 2 # I only use few iterations to compare it with tfidfvectorizer because the computation is 
#heavy 
# first we have to preprocess then tokenize the reviews which has already been done above
#second we have to add the vocabulary to Word2vec
for review in reviews_train.reviews[:iteration]:
    rev_num +=1
    clear_output(wait = True)
    word2vec.fit([review]) # fitting word2vec with the parsed review
    print(f'{rev_num} out of {rev_tot}')
    print(f'{(rev_num/rev_tot) * 100}%')
#third we apply tfidf
rev_num = 0
for review in reviews_train.reviews[:iteration]:
    clear_output(wait = True)
    rev_num +=1
    tfidf_mine = tf_idf(word2vec.to_array(arr = [review]))
    X_mine.append(tfidf_mine.tfidf(idf = True,norm = 'l2',smooth = True) )
    print(f'{rev_num} out of {rev_tot}')
    print(f'{(rev_num/rev_tot) * 100}%')

2 out of 25000
0.008%


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents = False, lowercase = False, preprocessor = None,
                        tokenizer = tokenizer_porter, use_idf = True, norm = 'l2',
                       smooth_idf = True) 

y = reviews_train.sentiment.values.astype('int')
X_lib = tfidf.fit_transform(reviews_train.reviews[:iteration])

In [20]:
# let's compare my Word2vec and the tfidfVectorizer in terms of result (not performance and speed)
different = []
for voc in tfidf.get_feature_names():
    if voc not in word2vec.vocabulary().keys():
        different.append(voc)
print (f'vocabulary not in my word2vec : {different}') # those words are being removed from my word2vec because I
# defined a purge_alph method that gets rid all non-alpha characters in the words and removes the duplicates, 
#so technically those words are in my word2vec vocabulary, but without the punctuation
# Proof :
regex = re.compile('[^a-zA-Z]')
new_different = []
for word in different:
    new_words = regex.sub(' ', word).split()
    for new_word in new_words:
        if new_word not in word2vec.vocabulary().keys():
            new_different.append(new_word)
        
print (f'nbr of vocabulary not in my word2vec not taking into account punctuation: { new_different }') 
# you will usually find weird looking words that are basically that were tokenized differently by Stemmer (due to
# the fact that one had a special character and the other one didn't)

# the tf-idf resulting matrices will be a bit different because the vocabularies are not the same and the order is
# different too, but they both work well, but the speed 

vocabulary not in my word2vec : ['(in', 'acting.', 'blood;', 'bought.', 'by.', 'campers,', 'cheek,', 'confu', 'dead,', 'dire.', 'dull.', 'each,', 'evolution.', 'granted,', 'gratuitous,', 'here,', "i'll", "i'm", 'inten', 'it.', 'laughable.', 'minutes.', 'movie,', 'name).', 'place,', 'plague.', 'plot.', 'scenes.', 'seen.', 'story.', 'suppo', 'this.', 'time.', 'tribbiani,', 'violence.', 'wastebasket.', "what'", 'wife.', 'wilderness.', 'works.', 'you.']
nbr of vocabulary not in my word2vec not taking into account punctuation: ['confu', 'inten', 'suppo']


## I will obviously not be making it from scratch everytime I want to use it, but doing once helped me learn a great deal about how some of the features of CountVectorizer, TfidfTransformer, and TfidfVectorizer work.

# Let's use the TfidfVectorizer for the rest of this Sentimental Analysis

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents = False, lowercase = False, preprocessor = None,# already applied the prepro
                        tokenizer = tokenizer_porter, use_idf = True, norm = 'l2', # and lowered the characters
                       smooth_idf = True) 

y = reviews_train.sentiment.values.astype('int')
X = tfidf.fit_transform(reviews_train.reviews)

In [22]:
import pickle
openi = open('tfidf.sav','wb')
pickle.dump(tfidf, openi)
openi.close()

In [23]:
import json 
dico = tfidf.vocabulary_
for i in dico:
    dico[i] = int(dico[i]) # converting numpy int to int for json to recognize them
with open('vocabulary.json','w') as fp:
    json.dump(dico,fp)

In [24]:
import sklearn
sklearn.datasets.dump_svmlight_file(X,y,f = 'training_x_y.feat') # saving X and y as libsvm sparse matrixs

In [26]:
'''
X,y = sklearn.datasets.load_svmlight_file('training_x_y.feat') # loading them
'''

In [27]:
from sklearn.linear_model import LogisticRegressionCV

clf = LogisticRegressionCV(cv=5,
                          scoring = 'accuracy',
                          random_state = 7,
                          n_jobs = -1, # using all the processors 
                          verbose = 10,
                          max_iter = 300)
clf.fit(X,y)
filename = 'LR_model.sav'
check_ = open(filename,'wb')
pickle.dump(clf,check_)
check_.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 37.5min remaining: 56.2min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 40.3min remaining: 26.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 42.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 42.9min finished


In [51]:
'''
# loading the saved model
filename = 'LR_model.sav'
clf = pickle.load(open(filename,'rb'))
'''

In [28]:
# loading the test data
positive_dir_test = './test/pos/'
positive_reviews = review_extraction(positive_dir_test)

negative_dir_test = './test/neg/'
negative_reviews_test = review_extraction(negative_dir_test)

reviews0_test = pd.DataFrame(np.c_[negative_reviews,np.zeros(len(negative_reviews),dtype = 'uint8')],
                        columns = ['reviews','sentiment'])

reviews1_test = pd.DataFrame(np.c_[positive_reviews,np.ones(len(negative_reviews),dtype = 'uint8')],
                        columns = ['reviews','sentiment'])

reviews_test = reviews1_test.append(reviews0_test,ignore_index = True)
preprocessing(reviews_test)    

y_test = reviews_test.sentiment.values.astype('int')
X_test = tfidf.transform(reviews_test.reviews)

In [29]:
# evaluating the model
print(f'accuracy : {round(clf.score(X_test,y_test),2)} ')

accuracy : 0.94 


In [31]:
# write whatever comment you want and see whether the machine thinks it's a good or bad comment
# write your comment in text_test
text_test = "I really liked the film, especially when the hero saved the girl,\
            for the rest the scenario was banal."
text_test = preprocessor ( text_test )
text_test = tokenizer_porter(text_test,stop)

xx_test = tfidf.transform(text_test)
prediction = np.mean(clf.predict(xx_test))
if prediction <0.5:
    print('bad review')
elif prediction>.5:
    print('good review')
else:
    print('neutral')

good review
