In [10]:
import enum 
import tqdm 
from typing import Generator, Dict, List, Tuple 
import pickle 
import math 
import random 
from gensim.parsing.preprocessing import remove_stopword_tokens 
from gensim.models import Phrases 
from gensim.models.phrases import Phraser 
from gensim.utils import simple_preprocess 
from gensim.corpora import Dictionary 
import numpy as np 
from nltk.corpus import stopwords 
import spacy 
from spacy.lang. en import English 
from datetime import datetime 
from collections import defaultdict 
import pandas as pd

In [11]:
# Shared spaCy pipeline; `tokenize` and `train_tokenize` pass it to `lemma`,
# which reads each token's `pos_` and `lemma_`.
# NOTE: loading 'en_core_web_trf' raises "[E002] Can't find factory for
# 'transformer'" when the 'spacy-transformers' package is not installed in the
# kernel's environment (see the error output recorded below this cell).
nlp = spacy.load('en_core_web_trf')
class Sentiments(enum.Enum):
    """Sentiment labels assigned to reviews and returned by the classifier.

    The member values are the strings used as keys when reviews are grouped
    by class (``'POS'`` / ``'NEG'``).
    """

    POS = 'POS'
    NEG = 'NEG'


# Backward-compatible alias: the enum was originally declared in lowercase,
# while every use in this notebook refers to ``Sentiments``.
sentiments = Sentiments

def split_data(data: List, weights: Tuple[float, float, float] = (0.7, 0.15, 0.15)):
    """Randomly distribute the items of ``data`` into train/test/validation subsets.

    Each item is assigned independently with ``random.choices``, so subset
    sizes only approximate the requested proportions. The result depends on
    the state of the global ``random`` generator; seed it beforehand for a
    reproducible split.

    Args:
        data: Items to distribute (iterated once, order preserved per subset).
        weights: Relative weights for ``('train', 'test', 'validation')``,
            in that order.

    Returns:
        Dict with keys ``'train'``, ``'test'`` and ``'validation'``, each
        mapping to the list of items assigned to that subset.
    """
    subsets = ['train', 'test', 'validation']
    split = {name: [] for name in subsets}
    for item in data:
        subset = random.choices(subsets, weights=weights)[0]
        split[subset].append(item)

    return split
def sentences_to_words(sentences: List[str]) -> List[List[str]]:
    """Tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first, and ``deacc=True`` is passed
    so accent marks are stripped from the tokens.

    Args:
        sentences: Raw texts, one per document.

    Returns:
        One token list per input sentence, in the same order.
    """
    pbar = tqdm.tqdm(sentences)
    pbar.set_description('Sentences to words')
    return [simple_preprocess(str(sentence), deacc=True) for sentence in pbar]
def remove_stopwords(documents: List[List[str]]) -> List[List[str]]:
    """Drop NLTK English stop words from every tokenized document.

    Args:
        documents: Tokenized documents (one list of tokens per document).

    Returns:
        A new list of token lists with the stop words removed; the order of
        documents and of the remaining tokens is preserved.
    """
    # Build the stop-word collection once; a set also makes each membership
    # test constant-time instead of scanning a list per token.
    english_stopwords = frozenset(stopwords.words('english'))
    pbar = tqdm.tqdm(documents)
    pbar.set_description('Remove StopWords')
    return [remove_stopword_tokens(doc, stopwords=english_stopwords) for doc in pbar]


# Backward-compatible alias for the original, misspelled function name.
remoce_stopwords = remove_stopwords
def create_ngrams(ngram_model_lst, documents: List[List[str]]):
    """Run every phrase model over the documents, one model after another.

    The output of model ``k`` is the input of model ``k + 1``, so the models
    must be given in the order in which they were learned.

    Args:
        ngram_model_lst: Sequence of models supporting ``model[tokens]``.
        documents: Tokenized documents.

    Returns:
        The documents after all models have been applied.
    """
    progress = tqdm.tqdm(range(len(ngram_model_lst)))
    progress.set_description('Create N-grams')
    transformed = documents
    for level in progress:
        transformed = [ngram_model_lst[level][tokens] for tokens in transformed]
    return transformed
def learn_ngrams(n, min_c, th, documents: List[List[str]]) -> List:
    """Learn a chain of gensim phrase models for n-grams up to length ``n``.

    The first model is fitted on the raw tokenized documents. For every
    additional level (3 .. n) the documents are first transformed with the
    most recently learned model and a new model is fitted on the result.

    Args:
        n: Highest n-gram order requested. One model is always learned; each
            value above 2 adds one more model.
        min_c: ``min_count`` passed to ``Phrases``.
        th: ``threshold`` passed to ``Phrases``.
        documents: Tokenized training documents.

    Returns:
        List of ``Phraser`` models in the order they must be applied
        (this is the list ``create_ngrams`` expects).
    """
    print('Learning N-grams')
    ngram_model_lst = [Phraser(Phrases(documents, min_count=min_c, threshold=th))]
    # range(3, n + 1) is empty for n <= 2, so no extra guard is needed.
    for _ in range(3, n + 1):
        # Feed the next level with the output of the latest model.
        documents = [ngram_model_lst[-1][doc] for doc in documents]
        ngram_model_lst.append(Phraser(Phrases(documents, min_count=min_c, threshold=th)))
    return ngram_model_lst

def lemma(nlp: English, texts: List[List[str]], allowed_postags: List = None) -> List[List[str]]:
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Each document's tokens are joined with single spaces and run through
    ``nlp``; the lemma of every resulting token whose ``pos_`` is in
    ``allowed_postags`` is kept.

    Args:
        nlp: Callable spaCy pipeline applied to each joined document.
        texts: Tokenized documents.
        allowed_postags: POS tags to keep. Defaults to
            ``['NOUN', 'ADJ', 'VERB', 'ADV']`` when ``None``.

    Returns:
        One list of lemmas per input document, in the same order.
    """
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    # The processing below must run for caller-supplied tags as well, so it
    # lives outside the default-handling branch.
    texts_out = []
    pbar = tqdm.tqdm(texts)
    pbar.set_description('Lemmatization')
    for tokens in pbar:
        doc = nlp(" ".join(tokens))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def tokenize(stopwords, lemmatization, ngrams, documents: List[str], ngram_model_lst) -> List[List[str]]:
    """Turn raw documents into token lists using already-learned n-gram models.

    Stages run in a fixed order: word splitting, then (each only if its flag
    is truthy) lemmatization, stop-word removal and n-gram merging.

    Args:
        stopwords: Flag enabling stop-word removal. Inside this function the
            name shadows the module-level ``stopwords`` import.
        lemmatization: Flag enabling lemmatization with the global ``nlp``.
        ngrams: Flag enabling n-gram merging.
        documents: Raw texts.
        ngram_model_lst: Phrase models handed to ``create_ngrams``.

    Returns:
        One token list per input document.
    """
    tokens = sentences_to_words(documents)
    tokens = lemma(nlp, tokens) if lemmatization else tokens
    tokens = remove_stopwords(tokens) if stopwords else tokens
    tokens = create_ngrams(ngram_model_lst, tokens) if ngrams else tokens
    return tokens

def train_tokenize(stopwords, lemmatization, documents: List[str]) -> List[List[str]]:
    """Tokenize training documents without applying any n-gram model.

    Splits the documents into words, then optionally lemmatizes them and
    optionally removes stop words, in that order.

    Args:
        stopwords: Flag enabling stop-word removal.
        lemmatization: Flag enabling lemmatization with the global ``nlp``.
        documents: Raw training texts.

    Returns:
        One token list per input document.
    """
    prepared = sentences_to_words(documents)
    if lemmatization:
        prepared = lemma(nlp, prepared)
    if not stopwords:
        return prepared
    return remove_stopwords(prepared)

def create_dictionary(documents: List[List[str]]):
    """Build a gensim ``Dictionary`` from the given tokenized documents."""
    vocabulary = Dictionary(documents)
    return vocabulary

class NB_sentiment_analysis():
    """Naive Bayes classifier that labels a review as positive or negative.

    Training counts how often each token occurs in the positive and in the
    negative reviews and stores add-one (Laplace) smoothed log-probabilities
    per class. Prediction sums the log-probabilities of a document's tokens
    on top of each class's log-prior and picks the larger total.

    Keyword Args:
        stopwords: Remove stop words during tokenization (default ``True``).
        ngrams: Learn and apply n-gram phrase models (default ``True``).
        n: Highest n-gram order to learn (default ``2``).
        ngram_min_count: ``min_count`` for the phrase models (default ``5``).
        ngram_th: ``threshold`` for the phrase models (default ``10``).
        lemmatization: Lemmatize tokens (default ``True``).
    """

    def __init__(self, **kwargs):
        self.stopwords = kwargs.get('stopwords', True)
        self.ngrams = kwargs.get('ngrams', True)
        self.n = kwargs.get('n', 2)
        self.ngram_mincount = kwargs.get('ngram_min_count', 5)
        self.ngram_th = kwargs.get('ngram_th', 10)
        self.lemmatization = kwargs.get('lemmatization', True)
        self.ngram_model_lst = []
        self.is_trained = False

    def _require_trained(self):
        """Raise a clear error when the model is used before ``fit``."""
        if not getattr(self, 'is_trained', False):
            raise RuntimeError('The model must be trained with fit() before it is used.')

    def _word_logprobs(self, words, vocabulary_size):
        """Return the smoothed ``token -> {'id', 'logprob'}`` table for one class."""
        denominator = len(words) + vocabulary_size
        table = {}
        for token_id, count in self.dictionary.doc2bow(words):
            table[self.dictionary[token_id]] = {
                'id': token_id,
                'logprob': np.log((count + 1) / denominator),
            }
        # Tokens never seen in this class get the add-one floor and id -1.
        unseen_logprob = np.log(1 / denominator)
        return defaultdict(lambda: {'id': -1, 'logprob': unseen_logprob}, table)

    def _score(self, tokens):
        """Return ``(positive, negative)`` log-scores for one tokenized document."""
        pos_prob = self.positive_prob
        neg_prob = self.negative_prob
        for token in tokens:
            pos_prob += self.positive_word_probs[token]['logprob']
            neg_prob += self.negative_word_probs[token]['logprob']
        return pos_prob, neg_prob

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Divide, returning ``0.0`` when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def fit(self, train_POS, train_NEG, verbose=False):
        """Train the classifier on raw positive and negative reviews.

        Args:
            train_POS: Raw texts of the positive class.
            train_NEG: Raw texts of the negative class.
            verbose: Accepted for compatibility; currently unused.

        Raises:
            ValueError: If tokenization leaves no tokens in either class.
        """
        print(datetime.now())
        print('Negative class tokenization')
        tokenized_NEG = train_tokenize(self.stopwords, self.lemmatization, train_NEG)
        print('Positive class Tokenization')
        tokenized_POS = train_tokenize(self.stopwords, self.lemmatization, train_POS)

        if self.ngrams:
            self.ngram_model_lst = learn_ngrams(self.n, self.ngram_mincount, self.ngram_th, tokenized_POS + tokenized_NEG)
            print('Negative class N-grams')
            tokenized_NEG = create_ngrams(self.ngram_model_lst, tokenized_NEG)
            print('Positive class N-grams')
            tokenized_POS = create_ngrams(self.ngram_model_lst, tokenized_POS)

        # Flatten each class into a single bag of tokens.
        positive_words = [item for sublist in tokenized_POS for item in sublist]
        negative_words = [item for sublist in tokenized_NEG for item in sublist]
        total_words = len(negative_words) + len(positive_words)
        if total_words == 0:
            raise ValueError('Training data produced no tokens.')

        self.dictionary = create_dictionary([negative_words, positive_words])
        vocabulary_size = len(self.dictionary)
        self.negative_word_probs = self._word_logprobs(negative_words, vocabulary_size)
        self.positive_word_probs = self._word_logprobs(positive_words, vocabulary_size)

        # NOTE(review): the class priors are estimated from token counts, not
        # from the number of documents per class; kept as originally written.
        self.negative_prob = np.log(len(negative_words) / total_words)
        self.positive_prob = np.log(len(positive_words) / total_words)
        self.is_trained = True
        print(datetime.now())

    def predict(self, document):
        """Classify one raw document.

        Args:
            document: Raw text of a single review.

        Returns:
            ``Sentiments.POS`` when the positive score is strictly larger,
            otherwise ``Sentiments.NEG``.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        self._require_trained()
        tokenized_doc = tokenize(self.stopwords, self.lemmatization, self.ngrams, [document], self.ngram_model_lst)
        pos_prob, neg_prob = self._score(tokenized_doc[0])

        if pos_prob > neg_prob:
            sentiment = Sentiments.POS
        else:
            sentiment = Sentiments.NEG

        return sentiment

    def val_test(self, pos, neg):
        """Evaluate the model on labelled positive and negative reviews.

        Every document is scored independently. The derived metrics are
        stored in ``self.metrics``; a ratio whose denominator is zero is
        reported as ``0.0``.

        Args:
            pos: Raw texts known to be positive.
            neg: Raw texts known to be negative.

        Returns:
            Tuple ``(tp, tn, fp, fn)`` with the confusion-matrix counts,
            where "positive" is the POS class.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        self._require_trained()
        tokenized_POS = tokenize(self.stopwords, self.lemmatization, self.ngrams, pos, self.ngram_model_lst)
        tokenized_NEG = tokenize(self.stopwords, self.lemmatization, self.ngrams, neg, self.ngram_model_lst)
        tp = fp = tn = fn = 0

        pbar = tqdm.tqdm(tokenized_POS)
        pbar.set_description('Positive Validation/Test')
        for tokens in pbar:
            # Scores start from the priors for each document.
            pos_prob, neg_prob = self._score(tokens)
            if pos_prob > neg_prob:
                tp += 1
            else:
                fn += 1

        pbar = tqdm.tqdm(tokenized_NEG)
        pbar.set_description('Negative Validation/Test')
        for tokens in pbar:
            pos_prob, neg_prob = self._score(tokens)
            if pos_prob > neg_prob:
                fp += 1
            else:
                tn += 1

        acc = self._safe_ratio(tp + tn, tp + tn + fp + fn)
        precision = self._safe_ratio(tp, tp + fp)
        recall = self._safe_ratio(tp, tp + fn)
        f1 = self._safe_ratio(2 * precision * recall, precision + recall)
        fpr = self._safe_ratio(fp, fp + tn)
        fnr = self._safe_ratio(fn, tp + fn)
        self.metrics = {'accuracy': acc, 'precision': precision, 'recall': recall, 'F1': f1, 'False Positive Rate': fpr, 'False Negative Rate': fnr}
        return tp, tn, fp, fn

ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer

Carga de Datos

In [3]:
# Ratings at or above this value are labelled positive, the rest negative.
POSITIVE_RATING_THRESHOLD = 30
# Fixed seed so the random train/test/validation split is the same on every run.
SPLIT_SEED = 42

df = pd.read_csv('review.csv')
df['rating'] = df['rating'].astype(dtype='int64')
df['sentiment'] = df['rating'].apply(
    lambda x: Sentiments.POS if x >= POSITIVE_RATING_THRESHOLD else Sentiments.NEG
)

# Group the review texts by class, keyed by the enum value ('POS' / 'NEG').
review_classes = {
    sentiment.value: df[df['sentiment'] == sentiment]['review'].values.tolist()
    for sentiment in Sentiments
}
positive_reviews = review_classes['POS']
negative_reviews = review_classes['NEG']

# split_data draws from the global `random` generator; seed it first.
random.seed(SPLIT_SEED)
split_neg = split_data(negative_reviews)
split_pos = split_data(positive_reviews)

NameError: name 'pd' is not defined

Entrenamiento de modelo

In [None]:
# Instantiate the classifier with no keyword overrides and fit it on the
# 'train' subset of each class (positive reviews first, negative second).
model=NB_sentiment_analysis()
model.fit(split_pos['train'],split_neg['train'])

In [None]:
# Score the 'validation' subsets; val_test returns (tp, tn, fp, fn) and
# stores the derived metrics in model.metrics.
model.val_test(split_pos['validation'],split_neg['validation'])

In [None]:
# Display the metrics dict; it is only set by a val_test call.
model.metrics

In [None]:
# Classify a single free-text review; predict returns a Sentiments member.
sentence = 'Good location on the NW of town, especially if youre going to Antigua spanish school. 100q for good sized private'
model.predict(sentence)

In [None]:
# Display the negative review stored at index 100.
negative_reviews[100]