# TCC Augusto e Ícaro
## Modelo de automatização das Heurísticas de Nielsen para comentários em reviews de Apps

* Versão 0.3.0
* Bibliotecas utilizadas: pandas, numpy, texthero, ntlk e corpus do ntlk em português
* Dataset utilizado: dataset_v9.csv
* Data: 04/07/2020

### Objetivos, incrementos e correções:

* Encapsulamento da pipeline de pre-processamento
* Utilizando dataset com classificação revisada para melhor acurácia
* Aumento dos pesos de termos classificados como usabilidade
* Foco em detalhar e evoluir o dicionário de radicais classificados

In [1]:
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install texthero 
!{sys.executable} -m pip install nltk

Collecting pip
  Downloading pip-20.2.1-py2.py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.0 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.2
    Uninstalling pip-20.2:
      Successfully uninstalled pip-20.2
Successfully installed pip-20.2.1




In [2]:
import pandas as pd
import texthero as hero
import nltk
from nltk.corpus import stopwords
import numpy as np
# nltk.download()

## Pré-processamento

In [433]:
DROPLIST_FILENAME='droplist.0.2.0.txt'
MODEL_FILENAME='model_v0.2.0.json'
DATASET_FILENAME='reviews_v9.csv'

In [434]:
def pre_process(df_path, df_cols):
    df = pd.read_csv(df_path, index_col=False, usecols=df_cols)
    df = df[df['is_classified']==True]
    df = df.rename(columns={'Text': 'text'})
    df['text'] = hero.preprocessing.clean(df['text'])
    pt_stopwords = stopwords.words('portuguese')
    df['text'] = hero.remove_stopwords(df['text'], stopwords=pt_stopwords)
    df['text'] = hero.stem(df['text'], language='portuguese')
    df['text'] = hero.tokenize(df['text'])
    df['class_name'] = df.apply(lambda row: 'usability' if row['is_usability'] else 'not_usability', axis=1)
    df = df.drop('is_usability', 1)
    df = df.to_dict('records')
    return df    

In [485]:
def split_train_test(df, train_size):
    
    df_usa = [item for item in df if item['class_name'] == 'usability']
    df_not_usa = [item for item in df if item not in df_usa]
    
    train_usa, test_usa = np.split(df_usa, [int(len(df_usa)*train_size)])
    train_not_usa, test_not_usa = np.split(df_not_usa, [int(len(df_not_usa)*train_size)])
    
    return np.concatenate([train_usa,train_not_usa]), np.concatenate([test_usa,test_not_usa])

In [486]:
df = pre_process(DATASET_FILENAME, ['ID', 'Text', 'is_usability', 'is_classified'])
train_dataset, test_dataset = split_train_test(df, train_size=0.7)

In [487]:
print(f'train: {len(train_dataset)} test: {len(test_dataset)}')

train: 650 test: 280


In [488]:
print(f'[info] total usability train data {sum(value["class_name"] == "usability" for value in train_dataset)}')
print(f'[info] total not_usability train data {sum(value["class_name"] == "not_usability" for value in train_dataset)}')
print(f'[info] total usability test data {sum(value["class_name"] == "usability" for value in test_dataset)}')
print(f'[info] total not_usability test data {sum(value["class_name"] == "not_usability" for value in test_dataset)}')

[info] total usability train data 497
[info] total not_usability train data 153
[info] total usability test data 214
[info] total not_usability test data 66


## Classificador

In [489]:
def train(training_data):
    corpus_words = {}
    for data in training_data: 
        class_name = data['class_name']
        frase = data['text']
        if class_name not in list(corpus_words.keys()):
            corpus_words[class_name] = {}
        for word in frase:
            if word not in list(corpus_words[class_name].keys()):
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1
    return corpus_words

In [490]:
def drop_low_score(corpus_words, min_value, by_key):
    fitered_dict = {}
    for k in corpus.keys():
        if by_key:
            if k == by_key:
                fitered_dict.update({k: { key:value for (key,value) in corpus[k].items() if value > min_value }})
            else:
                fitered_dict.update({k: { key:value for (key,value) in corpus[k].items() }})
        else:
            fitered_dict.update({k: { key:value for (key,value) in corpus[k].items() if value > min_value }})
    return fitered_dict

In [491]:
def classificate(corpus, sentence):
    def calculate_class_score(corpus_words, sentence, class_name):
        score = 0 
        for word in sentence:
            if word in corpus_words[class_name]:
                score += corpus_words[class_name][word]
        return score
    classifications = []
    for class_name in corpus.keys():
        classifications.append({'class_name': class_name, 'score': calculate_class_score(corpus, sentence, class_name)})    
    return classifications

In [492]:
def normalize_scores(classification):
    total_score = sum(score['score'] for score in classification['scores'])
    if total_score != 0:
        for score in classification['scores']:
            score['score'] = score['score']/total_score
    return classification

In [493]:
def boost_score_targets(corpus, targets, boost_val):
    boosted_dict = {}
    for k in corpus.keys():
        if k in targets:
            boosted_dict.update({k: { key:value*boost_val for (key,value) in corpus[k].items() }})
        else:
            boosted_dict.update({k: { key:value for (key,value) in corpus[k].items() }})
    return boosted_dict

In [646]:
def get_accuracy(classifications):
    accuracies = {
        'usability': 0,
        'not_usability': 0,
        'draws': 0
    }
    class_usa = []
    for classification in classifications:
        scores = [item['score'] for item in classification['scores']]
        is_draw = len(set(scores)) == 1
        if is_draw:
            accuracies['draws'] +=1
            continue
        highest_score = max(classification['scores'], key=lambda x:x['score'])
        is_draw = False
        if classification['test_data']['class_name'] == highest_score['class_name']:
            accuracies[highest_score['class_name']] += 1
            if(classification['test_data']['class_name']=='usability'):
                class_usa.append(classification)
        else:
            if(classification['test_data']['class_name']=='not_usability'):
                print(classification)
    return accuracies

In [635]:
def drop_unwanted_words(corpus, droplist):
    filtered_dict = {}
    for k in corpus.keys():
        filtered_dict.update({k: { key:value for (key,value) in corpus[k].items() if key not in droplist }})
    return filtered_dict

In [636]:
droplist = open(DROPLIST_FILENAME).read().splitlines()
corpus = train(train_dataset)
# corpus = drop_low_score(corpus, 10, 'usability') # magic number
# corpus = drop_low_score(corpus, 10, 'not_usability') # magic number
corpus = drop_unwanted_words(corpus, droplist)
corpus = boost_score_targets(corpus, ['usability'], 1) # magic number
corpus = boost_score_targets(corpus, ['not_usability'], 20) # magic number

In [637]:
for key in corpus.keys():
    print(f' total words for {key}: {len(corpus[key].keys())} ')

 total words for usability: 810 
 total words for not_usability: 86 


In [640]:
classifications = [{'test_data': test_data, 'scores': classificate(corpus, test_data['text'])} for test_data in test_dataset]
classifications = [normalize_scores(classification) for classification in classifications]

In [648]:
accuracy, cls_usa = get_accuracy(classifications)

{'test_data': {'ID': 'gp:AOqpTOGycTCJ9fSb3Flzi-E7KWoPeWpGhoH03kkRc5XNaa7YurpjLwVNyp-OvJPjM_QLkhaVe_nnK0DCaQR46w', 'text': ['funcion'], 'is_classified': True, 'class_name': 'not_usability'}, 'scores': [{'class_name': 'usability', 'score': 0.7802197802197802}, {'class_name': 'not_usability', 'score': 0.21978021978021978}]}
{'test_data': {'ID': 'gp:AOqpTOGyXTwzF9TQXXZPQOgXid8XoG2-tMi_SkEb2xyJutEd0JquYxnC6YJqxBtOPMo-1AFaQtpJYkb8yE055Q', 'text': ['nao', 'consegu', 'entrar'], 'is_classified': True, 'class_name': 'not_usability'}, 'scores': [{'class_name': 'usability', 'score': 0.6491228070175439}, {'class_name': 'not_usability', 'score': 0.3508771929824561}]}
{'test_data': {'ID': 'gp:AOqpTOH86v8Fg39h4yjKdfsm4F3RYg7PWGGpwBg9PUabbYpBR4V-6DkBQ_AS28Z-6UJ2LLkNomZ1kxebV2dcOA', 'text': ['aplic', 'pessim'], 'is_classified': True, 'class_name': 'not_usability'}, 'scores': [{'class_name': 'usability', 'score': 0.782608695652174}, {'class_name': 'not_usability', 'score': 0.21739130434782608}]}
{'test_d

TypeError: 'builtin_function_or_method' object is not subscriptable

In [622]:
print(f'Right answers Usability: {accuracy["usability"]} of {len([data for data in test_dataset if data["class_name"]=="usability"])} ({accuracy["usability"]/len([data for data in test_dataset if data["class_name"]=="usability"])})')
print(f'Right answers Not Usability: {accuracy["not_usability"]} of {len([data for data in test_dataset if data["class_name"]=="not_usability"])} ({accuracy["not_usability"]/len([data for data in test_dataset if data["class_name"]=="not_usability"])})')
print(f'Draws: {accuracy["draws"]}')

Right answers Usability: 205 of 214 (0.9579439252336449)
Right answers Not Usability: 29 of 66 (0.4393939393939394)
Draws: 3


In [647]:
# is this correct?
classifications = [{'test_data': test_data, 'scores': classificate(corpus, test_data['text'])} for test_data in df]
classifications = [normalize_scores(classification) for classification in classifications]
accuracy = get_accuracy(classifications)

{'test_data': {'ID': 'gp:AOqpTOGycTCJ9fSb3Flzi-E7KWoPeWpGhoH03kkRc5XNaa7YurpjLwVNyp-OvJPjM_QLkhaVe_nnK0DCaQR46w', 'text': ['funcion'], 'is_classified': True, 'class_name': 'not_usability'}, 'scores': [{'class_name': 'usability', 'score': 0.7802197802197802}, {'class_name': 'not_usability', 'score': 0.21978021978021978}]}
{'test_data': {'ID': 'gp:AOqpTOGyXTwzF9TQXXZPQOgXid8XoG2-tMi_SkEb2xyJutEd0JquYxnC6YJqxBtOPMo-1AFaQtpJYkb8yE055Q', 'text': ['nao', 'consegu', 'entrar'], 'is_classified': True, 'class_name': 'not_usability'}, 'scores': [{'class_name': 'usability', 'score': 0.6491228070175439}, {'class_name': 'not_usability', 'score': 0.3508771929824561}]}
{'test_data': {'ID': 'gp:AOqpTOH86v8Fg39h4yjKdfsm4F3RYg7PWGGpwBg9PUabbYpBR4V-6DkBQ_AS28Z-6UJ2LLkNomZ1kxebV2dcOA', 'text': ['aplic', 'pessim'], 'is_classified': True, 'class_name': 'not_usability'}, 'scores': [{'class_name': 'usability', 'score': 0.782608695652174}, {'class_name': 'not_usability', 'score': 0.21739130434782608}]}
{'test_d

TypeError: 'builtin_function_or_method' object is not subscriptable

In [610]:
print(f'Right answers Usability: {accuracy["usability"]} of {len([data for data in df if data["class_name"]=="usability"])} ({accuracy["usability"]/len([data for data in df if data["class_name"]=="usability"])})')
print(f'Right answers Not Usability: {accuracy["not_usability"]} of {len([data for data in df if data["class_name"]=="not_usability"])} ({accuracy["not_usability"]/len([data for data in df if data["class_name"]=="not_usability"])})')
print(f'Draws: {accuracy["draws"]}')

Right answers Usability: 693 of 711 (0.9746835443037974)
Right answers Not Usability: 140 of 219 (0.639269406392694)
Draws: 36


In [611]:
corpus

{'usability': {'consig': 137,
  'rost': 20,
  'acess': 90,
  'erro': 44,
  'cadastr': 156,
  'fot': 30,
  'porqu': 13,
  'ruim': 24,
  'dad': 32,
  'sempr': 12,
  'coloc': 17,
  'cnh': 28,
  'numer': 17,
  'cpf': 39,
  'ped': 27,
  'sei': 11,
  'fac': 47,
  'pod': 23,
  'ter': 26,
  'opca': 26,
  'faz': 101,
  'tod': 24,
  'fiz': 37,
  'q': 25,
  'mail': 17,
  'cont': 57,
  'aind': 11,
  'tent': 55,
  'mud': 13,
  'fal': 21,
  'recuper': 67,
  'senh': 176,
  'entrar': 36,
  'outr': 22,
  'form': 11,
  'reconhec': 63,
  'facial': 66,
  'dev': 13,
  'app': 74,
  'validaca': 19,
  'lix': 17,
  'temp': 21,
  'biometr': 28,
  'consegu': 38,
  'aplic': 72,
  'celul': 14,
  'prov': 21,
  'vid': 27,
  'diz': 54,
  'tir': 17,
  'dess': 13,
  'ser': 19,
  'voc': 17,
  'cri': 19,
  'serv': 14,
  'quer': 12,
  'fic': 29,
  'dificil': 13,
  'redefin': 14,
  'funcion': 71,
  'sit': 16,
  'jeit': 11,
  'cert': 13,
  'ta': 17,
  'nad': 37,
  'problem': 17,
  'email': 24,
  'hor': 16,
  'const': 12,
  

In [642]:
classifications_usa = []

[]