# Extraordinario (Parte 2)
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

**Instrucciones:**
   - Analizar las reseñas de móviles y obtener aspectos importantes
   - Realizar análisis de polaridad sobre 5 aspectos
   - Generar un resúmen de las opiniones

In [51]:
import gensim
import math
import nltk
import numpy as np
import spacy
import os
import xml.dom.minidom

from gensim import corpora
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
def cleanHtml(txt):
    from bs4 import BeautifulSoup
    return BeautifulSoup(txt,'lxml').get_text().lower()

In [19]:
def cleanText(txt):
    return txt.replace('/', ' ').replace('.', ' ').replace('-', ' ').replace('\n', ' ').replace('á', 'a').replace('é', 'e').replace('í', 'i').replace('ó', 'o').replace('ú', 'u')

In [20]:
def deleteTrash(txt):
    import re
    good = {'\n'}
    for i in "abcdefghijklmnopqrstuvwxyz áéíóúñü":
        good.add(i)
    ans = ""
    for c in txt:
        if c in good:
            ans += c
    return ans

In [21]:
def splitToSentences(txt):
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/spanish.pickle')
    return tokenizer.tokenize(txt)

In [22]:
def deleteStopwords(txt):
    from nltk.corpus import stopwords
    ans = []
    stp = stopwords.words('spanish')
    for w in txt:
        if w not in stp:
            ans.append(w)
    return ans

In [23]:
def getVocabulary(sentences):
    tokens = set()
    for sent in sentences:
        for token in sent:
            tokens.add(token)
    return sorted(tokens)

In [24]:
def getFrequencies(sentences, vocabulary):
    tokens = dict()
    
    for token in vocabulary:
        tokens[token] = 0
    
    for sent in sentences:
        for token in sent:
            tokens[token] += 1
            
    return tokens

In [25]:
def replaceWithLemmas(tokens):
    lemma = WordNetLemmatizer()
    lemmatized_tokens = []
    for word in tokens:
        lemmatized_tokens.append(lemma.lemmatize(word))
    return lemmatized_tokens

In [26]:
def getTagger():
    patterns = [
        (r'.*o$', 'n'),  # noun masculine singular
        (r'.*os$', 'n'), # noun masculine plural
        (r'.*a$', 'n'),  # noun femenine singular
        (r'.*as$', 'n')  # noun femenine plural
    ]
    regexTagger = nltk.RegexpTagger(patterns, nltk.DefaultTagger('s'))
    unigramTagger = nltk.UnigramTagger(nltk.corpus.cess_esp.tagged_sents(), None, regexTagger)
    return unigramTagger

In [27]:
def mixTags(tokens, tokenTags):
    taggedTokens = list()
    for i in range(0,len(tokens)):
        taggedTokens.append((tokens[i], tokenTags[i][1]))
    return taggedTokens

In [28]:
def normalizeSencences(sent):
    tagger = getTagger()
    sentences = []
    for s in sent:
        cleanSentence = deleteTrash(cleanText(s))
        normalizedTokens = deleteStopwords(nltk.word_tokenize(cleanSentence))
        tokenTags = tagger.tag(normalizedTokens)
        stematizedTokens = replaceWithLemmas(normalizedTokens)
        tokens = mixTags(stematizedTokens, tokenTags)
        sentences.append(tokens)
    return sentences

In [29]:
def normalizeSencencesSpacy(sent):
    tagger = getTagger()
    nlp = spacy.load("es_core_news_sm")
    sentences = []
    
    for s in sent:
        cleanSentence = deleteTrash(cleanText(s))
        normalizedTokens = nlp(cleanSentence)
        lemmatizedTokens = [token.lemma_ for token in normalizedTokens if not token.lemma_.isspace()]
        noStopwordsTokens = deleteStopwords(lemmatizedTokens)
        tokenTags = tagger.tag(noStopwordsTokens)
        tokens = mixTags(lemmatizedTokens, tokenTags)
        sentences.append(tokens)
    return sentences

In [30]:
def loadText():
    folder = "./extraordinario/movil"
    lines = list()
    for file in os.listdir(folder):
        with open(os.path.join(folder, file)) as f:
            lines.append(f.read())
    return "\n".join(lines)

In [31]:
org_txt = loadText()
sentences = splitToSentences(org_txt)
sentences = normalizeSencences(sentences)

In [32]:
vocabulary = getVocabulary(sentences)
frequencies = getFrequencies(sentences, vocabulary)

## Obtener Aspectos

In [33]:
nouns = []
for token, count in frequencies.items():
    if token[1][0] == 'n':
        nouns.append((count, token[0]))
nouns.sort(reverse=True)
aspectos = [noun[1] for noun in nouns[:20]]

In [34]:
for aspecto in aspectos:
    print(aspecto)

ma
telefono
bateria
dia
okia
pantalla
fotos
n
camara
calidad
memoria
tiempo
vez
habia
manos
radio
precio
tienda
color
tipo


In [36]:
aspectos = ['bateria', 'pantalla', 'fotos', 'calidad', 'memoria']

In [41]:
def getPolarities(file):
    ans = {}
    doc = xml.dom.minidom.parse(file)
    layers = doc.getElementsByTagName("senticon")[0].getElementsByTagName("layer")
    for layer in layers:
        positive = layer.getElementsByTagName("positive")[0].getElementsByTagName("lemma")
        negative = layer.getElementsByTagName("negative")[0].getElementsByTagName("lemma")
        for lemma in positive + negative:
            ans[lemma.firstChild.nodeValue.rstrip().lstrip().lower().replace('á', 'a').replace('é', 'e').replace('í', 'i').replace('ó', 'o').replace('ú', 'u').replace('ñ', 'n').replace('ü', 'u')] = float(lemma.getAttribute("pol"))
    return ans

In [43]:
polarities = getPolarities('./extraordinario/sent/senticon.es.xml')

In [47]:
def countAspect(aspect, sentence):
    count = 0
    for token in sentence:
        if token[0] == aspect:
            count += 1
    return count

In [48]:
context = [[] for i in range(len(aspectos))]
for i in range(len(aspectos)):
    aspecto = aspectos[i]
    for sentence in sentences:
        if countAspect(aspecto,sentence):
            context[i].append(sentence)

In [52]:
avg = np.zeros(len(aspectos))
cnt = np.zeros(len(aspectos))
for i in range(len(aspectos)):
    for sentence in context[i]:
        for word in sentence:
            if word[0] in polarities:
                avg[i] += polarities[word[0]]
                cnt[i] += 1
avg /= cnt

In [55]:
print(F"{'Aspecto':{11}}Probabildiad Promedio")
for i in range(len(aspectos)):
    print(F"{aspectos[i]:{11}}{avg[i]}")

Aspecto    Probabildiad Promedio
bateria    0.17302145922746787
pantalla   0.24455952380952384
fotos      0.3049433962264151
calidad    0.23509271523178804
memoria    0.25182000000000004


## Polaridad

In [56]:
def getSentiments(sentiments, file):
    with open(file) as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.rstrip('\n').split('\t')
            sentiment = line[0]
            tag = line[-1]
            sentiments[sentiment] = (1 if tag == 'pos' else 0)

In [57]:
sentiments = dict()
getSentiments(sentiments, './extraordinario/sent/fullStrengthLexicon.txt')
getSentiments(sentiments, './extraordinario/sent/mediumStrengthLexicon.txt')