# Stemming y tokenización con NLTK

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cperales/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/cperales/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/cperales/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from pprint import pprint
from random import shuffle
from string import punctuation
import numpy as np

In [3]:
#import nltk.classify.util
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk import FreqDist
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

## Caracterización del corpus

In [4]:
print('Categorias del corpus =', movie_reviews.categories())
print('Total de reviews =', len(movie_reviews.fileids()))
print('Cantidad de reviews positivas =', len(movie_reviews.fileids('pos')))
print('Cantidad de reviews negativas =', len(movie_reviews.fileids('neg')))

Categorias del corpus = ['neg', 'pos']
Total de reviews = 2000
Cantidad de reviews positivas = 1000
Cantidad de reviews negativas = 1000


Necesitamos filtrar el corpus, aplicando stopwords y stemming

In [5]:
stopwords_english = stopwords.words('english')
english_stemmer = SnowballStemmer('english')

In [6]:
def feature_generator(cat, final_label = None):
    if final_label is None:
        final_label = cat
    words = []
    ids = movie_reviews.fileids(cat)
    for id_ in ids:
        for word in movie_reviews.words(id_):
            if word not in stopwords_english and word not in punctuation:
                words.append(({'word': english_stemmer.stem(word.lower())}, final_label))
    return words

Filtramos el corpus aplicando stopwords en inglés

In [7]:
pos_words = feature_generator('pos', 1)
neg_words = feature_generator('neg', 0)

Construimos el dataset, y lo dividimos en entrenamiento y test

In [8]:
whole_set = pos_words + neg_words
shuffle(whole_set)
print('El set completo tiene', len(whole_set), 'elementos')
prop = int(0.5 * len(whole_set))
train_set, test_set = whole_set[:prop], whole_set[prop:]
print('El set de test tiene', len(test_set), 'elementos')
print('El set de train tiene', len(train_set), 'elementos')

El set completo tiene 710578 elementos
El set de test tiene 355289 elementos
El set de train tiene 355289 elementos


Ya podemos clasificar y testear el dataset

In [25]:
classifier = NaiveBayesClassifier.train(train_set)
print('La precisión del método es', accuracy(classifier, test_set))
print('La precisión del método es', accuracy(classifier, train_set))
classifier = NaiveBayesClassifier.train(whole_set)

La precisión del método es 0.5705073897587598
La precisión del método es 0.6094278179172449


In [26]:
classifier.show_most_informative_features(5)

Most Informative Features
                    word = 'mulan'             1 : 0      =     57.4 : 1.0
                    word = 'flynt'             1 : 0      =     47.3 : 1.0
                    word = 'seagal'            0 : 1      =     33.4 : 1.0
                    word = 'lebowski'          1 : 0      =     33.0 : 1.0
                    word = 'webb'              0 : 1      =     22.8 : 1.0


In [27]:
def feature_extractor(text):
    words = [{'word': english_stemmer.stem(word.lower())} for word in word_tokenize(' '.join(text.split()))
             if word not in stopwords_english and word not in punctuation]
    return words

In [28]:
custom_review = "I hated the film. It was a disaster. It has poor direction and bad acting."
custom_review_tokens = feature_extractor(custom_review)
print(custom_review_tokens)

[{'word': 'i'}, {'word': 'hate'}, {'word': 'film'}, {'word': 'it'}, {'word': 'disast'}, {'word': 'it'}, {'word': 'poor'}, {'word': 'direct'}, {'word': 'bad'}, {'word': 'act'}]


In [29]:
import pymongo
import datetime

In [30]:
client = pymongo.MongoClient("localhost", 27017)  # Qué significa localhost
db = client.tweets
collection = db.movie

In [31]:
tweet_text = [tweet['text'] for tweet in collection.find()]

In [32]:
tweet = tweet_text[5]
print(tweet)

Mahogany Teakwood candle lit (my absolute favorite scent), Fantastic Beasts on, wine poured. After the past few cra… https://t.co/dHKac5WPw5


In [33]:
features_tweet = feature_extractor(tweet)
print(features_tweet)

[{'word': 'mahogani'}, {'word': 'teakwood'}, {'word': 'candl'}, {'word': 'lit'}, {'word': 'absolut'}, {'word': 'favorit'}, {'word': 'scent'}, {'word': 'fantast'}, {'word': 'beast'}, {'word': 'wine'}, {'word': 'pour'}, {'word': 'after'}, {'word': 'past'}, {'word': 'cra…'}, {'word': 'https'}, {'word': '//t.co/dhkac5wpw5'}]


In [34]:
np.mean(classifier.classify_many(features_tweet))

0.6875

In [35]:
tweets_reviewed = []
for tweet in collection.find():
    tweet_text = tweet['text']
    features_tweet = feature_extractor(tweet_text)
    value = np.mean(classifier.classify_many(features_tweet))
    tweet['sentiment'] = value
    tweets_reviewed.append(tweet)

In [21]:
collection_2 = db.movie_reviews

In [36]:
for tweet in tweets_reviewed:
    try:
        collection_2.save(tweet)
    except:
        id_conflict = tweet.pop('_id')
        collection.update_one({'_id': id_conflict},
                              {'$set': tweet},
                              upsert=False)

  This is separate from the ipykernel package so we can avoid doing imports until
