# Análisis de sentimientos en Twitter

Este programa entrena un modelo a partir de tres conjuntos de entrenamiento, de unos 230 casos cada uno, con tweets positivos, negativos y neutros. Posteriormente pide una query, consulta Twitter con ella, obtiene hasta 100 tweets, los analiza y entrega el resultado del análisis de sentimientos.

In [48]:
import nltk
import re
import string
import math
import numpy as np # array processing for numbers, strings, records, and objects.
import random # generate pseudo random numbers
import tweepy # twitter
import urllib3 # http client
import warnings
warnings.filterwarnings("ignore")

from sklearn import preprocessing # utility functions and transformer classes
from sklearn.preprocessing import StandardScaler # transforma raw data
from sklearn.linear_model import LogisticRegression # regresion logistica
from sklearn.neighbors import KNeighborsClassifier # kneighbors
from sklearn.svm import SVC # support vector machine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Paths of the labeled training corpora (about 230 examples per class)
TWEETS_NEG = '/home/dgomez/comp/machine_learning/proyectos/sentiment/TWEETS_NEGATIVO'
TWEETS_NEU = '/home/dgomez/comp/machine_learning/proyectos/sentiment/TWEETS_NEUTRO'
TWEETS_POS = '/home/dgomez/comp/machine_learning/proyectos/sentiment/TWEETS_POSITIVO'

# Load the Twitter credentials. The file is read as four consecutive lines, in
# the order: consumer key, consumer secret, access token, access token secret.
# From each line the second ':'-separated field is taken, minus its first
# character and its last two characters (presumably a leading space and a
# trailing quote/newline -- confirm against the key file format).
# The with-block guarantees the file is closed once the keys have been read.
with open("/home/dgomez/comp/machine_learning/proyectos/sentiment/TWITTER_KEYS", "r") as file_keys:
    CONSUMER_KEY        = file_keys.readline().split(':')[1][1:-2]
    CONSUMER_SECRET     = file_keys.readline().split(':')[1][1:-2]
    ACCESS_TOKEN        = file_keys.readline().split(':')[1][1:-2]
    ACCESS_TOKEN_SECRET = file_keys.readline().split(':')[1][1:-2]

# Make the project-local NLTK data directory visible to nltk
NLTK_DIR='/home/dgomez/comp/machine_learning/proyectos/sentiment/sentiment_web/nltk_data'
nltk.data.path.append(NLTK_DIR)


In [46]:
# Normalize the text of a tweet
def normalize(text):
    """Return `text` lower-cased and reduced to ASCII letters and spaces.

    The rules are applied in order: the text is lower-cased; accented vowels,
    'ü', 'ñ', 'ª' and '°' are replaced by plain letters; http/https URLs are
    deleted; the characters # ( ) - ` " are deleted; finally every character
    that is still not an ASCII letter or a space is replaced by a space.
    """
    # (pattern, replacement) pairs; the order is significant for the last three.
    rules = (
        ('á', 'a'), ('é', 'e'), ('í', 'i'), ('ó', 'o'), ('ú', 'u'),
        ('ü', 'u'),
        ('Á', 'a'), ('É', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ú', 'u'),
        ('ñ', 'n'), ('Ñ', 'n'),
        ('ª', 'a'), ('°', 'o'),
        (r'http[s]*://[^ ]+', ''),   # URLs run up to the next space
        (r'#|\(|\)|-|`|"', ''),      # dropped without leaving a gap
        (r"[^a-zA-Z ]", ' '),        # anything else becomes a space
    )
    cleaned = text.lower()
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Split a text into its tokens
def tokenizar(text):
    """Return the tokens that `nltk.word_tokenize` produces for `text`."""
    return nltk.word_tokenize(text)

# Spanish Snowball stemmer, created on first use and then shared by every
# stemming() call instead of being rebuilt for each token.
_SPANISH_STEMMER = None


# Apply stemming (reduction to the base form) to a token
def stemming(token):
    """Return the Spanish Snowball stem of `token`, encoded as UTF-8.

    Args:
        token: the word to stem.

    Returns:
        The result of `SnowballStemmer('spanish').stem(token)` after
        `.encode("utf-8")`.
    """
    global _SPANISH_STEMMER
    if _SPANISH_STEMMER is None:
        _SPANISH_STEMMER = nltk.stem.SnowballStemmer('spanish')
    return _SPANISH_STEMMER.stem(token).encode("utf-8")

# Produce the tokens of a text by applying, in order: normalization,
# tokenization, stopword removal and stemming
def words_doc(text):
    """Return the list of stemmed, non-stopword tokens of `text`.

    The text goes through normalize() and tokenizar(); tokens found in NLTK's
    Spanish stopword list are discarded and the rest are passed to stemming().
    """
    candidates = tokenizar(normalize(text))
    spanish_stopwords = set(stopwords.words('spanish'))
    # NOTE(review): the tokens have already had their accents stripped by
    # normalize(), while the stopword list is used exactly as loaded --
    # confirm that accented stopwords are filtered as intended.
    return [stemming(word) for word in candidates
            if word not in spanish_stopwords]

# Build the vector associated with a set of words (tokens_doc) in a collection
# (dictionary)
def vectorization(tokens_doc, dictionary, N):
    """Return the weight vector of one document over the collection vocabulary.

    Args:
        tokens_doc: list of tokens of the document.
        dictionary: mapping token -> count of that token in the collection.
        N: number of documents in the collection.

    Returns:
        A list with one entry per key of `dictionary`, in the dictionary's
        iteration order. The entry of a token present in the document is
        round(freq * log(N / count), 2), where `freq` is the number of times
        the token occurs in `tokens_doc`; every other entry is 0. Tokens that
        are not in `dictionary` are ignored.
    """
    # Position of each vocabulary token in the output vector, computed once
    # instead of scanning the whole dictionary for every token.
    positions = {token: k for k, token in enumerate(dictionary)}

    # Occurrences of each token inside this document, counted in one pass.
    frequencies = {}
    for token in tokens_doc:
        frequencies[token] = frequencies.get(token, 0) + 1

    vector_final = [0] * len(dictionary)
    for token, freq in frequencies.items():
        k = positions.get(token)
        if k is None:
            continue  # token unknown to the collection
        tf_idf = freq * math.log(N / (1.0 * dictionary[token]))
        vector_final[k] = round(tf_idf, 2)
    return vector_final

In [47]:
docs          = []
classes       = []
vectors       = []
vectors_train = []
vectors_test  = []
classes_train = []
classes_test  = []
urllib3.disable_warnings()

# lectura de los archivos de textos/documentos
f_neg = open(TWEETS_NEG, "r")
f_neu = open(TWEETS_NEU, "r")
f_pos = open(TWEETS_POS, "r")
k = 0
while True:
    line = f_neg.readline()
    if not line: break
    list = line.split('|')
    docs.append(list[0])
    classes.append(list[1][0:-1])
    k += 1
print "tweets negativos: " + str(k),
k = 0
while True:
    line = f_neu.readline()
    if not line: break
    list = line.split('|')
    docs.append(list[0])
    classes.append(list[1][0:-1])
    k += 1
print "| tweets neutros: " + str(k),
k = 0
while True:
    line = f_pos.readline()
    if not line: break
    list = line.split('|')
    docs.append(list[0])
    classes.append(list[1][0:-1])
    k += 1
print "| tweets positivos: " + str(k)

# Extract the tokens of the collection and count the occurrences of each one
dictionary = {}   # token -> number of occurrences over all documents
tokens_docs = []  # token list of every document, aligned with docs
for doc in docs:
    # words_doc() normalizes its input itself, so normalizing here as well
    # would only repeat the same work.
    tokens_doc = words_doc(doc)
    for token in tokens_doc:
        dictionary[token] = dictionary.get(token, 0) + 1
    tokens_docs.append(tokens_doc)

# Save the dictionary as "token,count" lines, most frequent token first.
# The with-block closes the file, so everything is flushed to disk.
with open('dictionary.csv', 'w') as dictionary_file:
    for token in sorted(dictionary, key=dictionary.get, reverse=True):
        dictionary_file.write(token + ',' + str(dictionary[token]) + '\n')

# conformacion de la dimensionalidad de la coleccion
N = len(docs)
kdoc = 0
for tokens_doc in tokens_docs: # cada documento [sus tokens]
    vector = vectorization(tokens_doc, dictionary, N)
    vectors.append(vector)
    kdoc += 1
print "|dictionary|: " + str(len(dictionary))

# separacion de conjuntos de train y test
percentage = 0.8
for k in range(len(vectors)):
    if random.random() < percentage:
        vectors_train.append(vectors[k])
        classes_train.append(classes[k])
    else:
        vectors_test.append(vectors[k])
        classes_test.append(classes[k])
print "|train|: "+ str(len(vectors_train))+ ", |test|: "+ str(len(vectors_test))

# preprocesar: convertir clases a numeros
label_e = preprocessing.LabelEncoder()
label_e.fit(classes)
classes_encoded = label_e.transform(classes)
classes_train_encoded = label_e.transform(classes_train)
classes_test_encoded  = label_e.transform(classes_test)

# preparacion modelo clasificador
#classifier = KNeighborsClassifier(3)
#classifier = SVC(kernel="linear", C=0.025)
classifier = LogisticRegression(
    solver='lbfgs', max_iter=1000, multi_class='multinomial')
classifier.fit(vectors_train, classes_train_encoded)

print "SCORE: " + str(round(classifier.score(vectors_test, classes_test_encoded), 2))
print label_e.inverse_transform([0, 1, 2])
print 'train ok.'

# iteracion: prediccion de textos
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

maxtweets = 100
#while True:
print "query: ",
query = raw_input()
if len(query) > 0:
    query += ' -RT'
    search = api.search(q=query, count=maxtweets)
    count_neg = 0
    count_neu = 0
    count_pos = 0
    for tweet in search:
        tokens = words_doc(tweet.text)
        vector = vectorization(tokens, dictionary, N)
        this_class = label_e.inverse_transform(
            classifier.predict(np.array(vector).reshape(1, -1)))
        #print this_class + " : " + tweet.text.encode("utf-8").replace('\n', '')
        print ".",
        if this_class == 'NEGATIVO':
            count_neg += 1
        if this_class == 'NEUTRO':
            count_neu += 1
        if this_class == 'POSITIVO':
            count_pos += 1

    print
    print "NEGATIVO: " + str(100 * count_neg / (maxtweets * 1.0)) + "%"
    print "NEUTRO: "   + str(100 * count_neu / (maxtweets * 1.0)) + "%"
    print "POSITIVO: " + str(100 * count_pos / (maxtweets * 1.0)) + "%"

tweets negativos: 230 | tweets neutros: 225 | tweets positivos: 229
|dictionary|: 2640
|train|: 548, |test|: 136
SCORE: 0.66
['NEGATIVO' 'NEUTRO' 'POSITIVO']
train ok.
query: felipe kast
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
NEGATIVO: 26.0%
NEUTRO: 8.0%
POSITIVO: 49.0%
