# TCC Augusto e Ícaro
## Modelo de automatização das Heurísticas de Nielsen para comentários em reviews de Apps

* Versão 0.2.0
* Bibliotecas utilizadas: pandas, numpy, texthero, nltk e corpus do nltk em português
* Dataset utilizado: reviews_v6.csv
* Data: 01/08/2020

### Objetivos, incrementos e correções:

* Encapsulamento da pipeline de pré-processamento
* Utilizando dataset com classificação revisada para melhor acurácia
* Aumento dos pesos de termos classificados como usabilidade
* Foco em detalhar e evoluir o dicionário de radicais classificados

In [1]:
# Install/upgrade the notebook's dependencies by invoking pip through the
# interpreter that is running this notebook (sys.executable).
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install texthero 
!{sys.executable} -m pip install nltk

Requirement already up-to-date: pip in /home/icaro/.pyenv/versions/3.7.3/lib/python3.7/site-packages (20.2)




In [2]:
import pandas as pd
import texthero as hero
import nltk
from nltk.corpus import stopwords
import numpy as np
# nltk.download()

## Pré-processamento

In [34]:
# File names used by the notebook cells.
# Text file with the words to remove from the trained corpus.
DROPLIST_FILENAME = 'droplist.txt'
# Name reserved for the serialized model (not referenced by the cells shown here).
MODEL_FILENAME = 'model_v0.2.0.json'
# CSV file with the reviews that is fed to the pre-processing step.
DATASET_FILENAME = 'reviews_v6.csv'

In [103]:
def pre_process(df_path, df_cols):
    """Load the reviews CSV and convert it into tokenized, labelled records.

    Steps: read the requested columns, keep only rows flagged as classified,
    clean the text with texthero, remove Portuguese stopwords, stem, tokenize
    and derive a ``class_name`` label from the ``is_usability`` flag.

    Args:
        df_path: Path of the CSV file to read.
        df_cols: Columns to load. Must include ``'Text'``, ``'is_usability'``
            and ``'is_classified'``, which this function reads.

    Returns:
        A list with one dict per kept row. Each dict holds the remaining
        columns plus ``'text'`` (the token list) and ``'class_name'``
        (``'usability'`` or ``'not_usability'``); ``'is_usability'`` is removed.
    """
    df = pd.read_csv(df_path, index_col=False, usecols=df_cols)
    # Element-wise comparison on purpose: keeps only rows whose flag equals True.
    df = df[df['is_classified'] == True]  # noqa: E712
    df = df.rename(columns={'Text': 'text'})
    df['text'] = hero.preprocessing.clean(df['text'])
    pt_stopwords = stopwords.words('portuguese')
    df['text'] = hero.remove_stopwords(df['text'], stopwords=pt_stopwords)
    df['text'] = hero.stem(df['text'], language='portuguese')
    df['text'] = hero.tokenize(df['text'])
    # Series.map keeps the same truthiness rule as the former row-wise apply
    # and still yields a Series when no rows survived the filter above.
    df['class_name'] = df['is_usability'].map(
        lambda flag: 'usability' if flag else 'not_usability'
    )
    # `axis` can no longer be passed positionally in pandas 2.x; name the
    # columns explicitly instead.
    df = df.drop(columns='is_usability')
    return df.to_dict('records')

In [104]:
def split_train_test(df, train_size, *, shuffle=False, seed=None):
    """Split the records into a train part and a test part.

    Args:
        df: Records to split. With ``shuffle=True`` it must be an
            integer-indexable sequence (e.g. the list of dicts returned by
            ``pre_process``).
        train_size: Fraction of the records placed in the train part; must be
            between 0 and 1 inclusive.
        shuffle: When True, the records are permuted before the cut. The
            default (False) keeps the original purely positional split, whose
            class balance depends entirely on the row order of the input.
        seed: Seed for the permutation, for reproducible shuffles. Ignored
            when ``shuffle`` is False.

    Returns:
        The two-element list produced by ``np.split``: ``[train, test]``.

    Raises:
        ValueError: If ``train_size`` is outside ``[0, 1]``.
    """
    if not 0 <= train_size <= 1:
        raise ValueError(f'train_size must be between 0 and 1, got {train_size!r}')
    cut = int(len(df) * train_size)
    if shuffle:
        order = np.random.default_rng(seed).permutation(len(df))
        # Build a new list so the caller's sequence is left untouched.
        df = [df[position] for position in order]
    return np.split(df, [cut])

In [105]:
# Build the pre-processed records from the dataset file, then put the first
# 70% of them in the train set and the remainder in the test set.
# NOTE(review): split_train_test cuts by position (no shuffling), so the class
# balance of each part depends on the row order of the CSV.
df = pre_process(DATASET_FILENAME, ['ID', 'Text', 'is_usability', 'is_classified'])
train_dataset, test_dataset = split_train_test(df, train_size=0.7)

In [106]:
# Report how many records ended up in each part of the split.
print(f'train: {len(train_dataset)} test: {len(test_dataset)}')

train: 356 test: 153


In [107]:
# Report the per-class record counts of the train and test sets; summing the
# boolean comparisons counts the records whose label matches.
print(f'[info] total usability train data {sum(value["class_name"] == "usability" for value in train_dataset)}')
print(f'[info] total not_usability train data {sum(value["class_name"] == "not_usability" for value in train_dataset)}')
print(f'[info] total usability test data {sum(value["class_name"] == "usability" for value in test_dataset)}')
print(f'[info] total not_usability test data {sum(value["class_name"] == "not_usability" for value in test_dataset)}')

[info] total usability train data 121
[info] total not_usability train data 235
[info] total usability test data 144
[info] total not_usability test data 9


## Classificador

In [108]:
def train(training_data):
    """Count how often each token occurs in every class of the training data.

    Args:
        training_data: Iterable of records; each record is a mapping with a
            ``'class_name'`` label and a ``'text'`` iterable of tokens.

    Returns:
        A dict mapping each class name to a dict of ``token -> occurrences``.
        A class whose records contain no tokens still gets an empty dict.
    """
    corpus_words = {}
    for data in training_data:
        # setdefault/get use direct dict lookups; the previous
        # `x not in list(d.keys())` rebuilt a list for every membership test.
        class_counts = corpus_words.setdefault(data['class_name'], {})
        for word in data['text']:
            class_counts[word] = class_counts.get(word, 0) + 1
    return corpus_words

In [109]:
def drop_low_score(corpus_words, min_value):
    """Return a copy of the corpus without the low-count words.

    Args:
        corpus_words: Dict mapping class name to a dict of ``word -> count``.
        min_value: Threshold; only words whose count is strictly greater than
            this value are kept.

    Returns:
        A new dict with the same class names and the filtered word counts.
        The input is not modified.
    """
    # Read from the `corpus_words` argument: the previous body referenced a
    # global named `corpus`, so the parameter was silently ignored.
    return {
        class_name: {
            word: count for word, count in words.items() if count > min_value
        }
        for class_name, words in corpus_words.items()
    }

In [110]:
def classificate(corpus, sentence):
    """Score a tokenized sentence against every class of the corpus.

    The score of a class is the sum of the corpus weights of the sentence
    tokens known to that class; repeated tokens count each time they occur
    and unknown tokens contribute nothing.

    Args:
        corpus: Dict mapping class name to a dict of ``word -> weight``.
        sentence: Iterable of tokens.

    Returns:
        A list with one ``{'class_name': ..., 'score': ...}`` dict per class,
        in the iteration order of ``corpus``.
    """
    return [
        {
            'class_name': label,
            'score': sum(weights[token] for token in sentence if token in weights),
        }
        for label, weights in corpus.items()
    ]

In [111]:
def normalize_scores(classification):
    """Rescale the scores of one classification so that they sum to 1.

    The score entries are updated in place. When the scores add up to zero
    they are left unchanged.

    Args:
        classification: Dict with a ``'scores'`` list of dicts, each holding
            a numeric ``'score'``.

    Returns:
        The same ``classification`` object that was passed in.
    """
    entries = classification['scores']
    total = sum(entry['score'] for entry in entries)
    if total == 0:
        # Nothing to scale by; dividing would fail.
        return classification
    for entry in entries:
        entry['score'] = entry['score'] / total
    return classification

In [112]:
def boost_score_targets(corpus, targets, boost_val):
    """Multiply the word weights of the selected classes by a factor.

    Args:
        corpus: Dict mapping class name to a dict of ``word -> weight``.
        targets: Container of class names whose weights are multiplied.
        boost_val: Multiplier applied to every weight of a target class.

    Returns:
        A new dict with the same classes. Target classes carry the multiplied
        weights; the other classes are copied unchanged. The input corpus is
        not modified.
    """
    boosted = {}
    for label, weights in corpus.items():
        if label in targets:
            boosted[label] = {
                word: weight * boost_val for word, weight in weights.items()
            }
        else:
            boosted[label] = dict(weights)
    return boosted

In [113]:
def get_accuracy(classifications):
    """Count the correctly classified records per class.

    A record counts as correct when the class with the highest score equals
    the record's own ``class_name``; on a tie, the first entry holding the
    maximum score wins. Despite the name, the values returned are counts of
    right answers, not ratios.

    Args:
        classifications: Iterable of dicts with ``'test_data'`` (a mapping
            holding the expected ``'class_name'``) and ``'scores'`` (a
            non-empty list of ``{'class_name', 'score'}`` dicts).

    Returns:
        A dict of ``class name -> number of correct predictions``. The keys
        ``'usability'`` and ``'not_usability'`` are always present; any other
        class is added the first time it is predicted correctly.
    """
    accuracies = {
        'usability': 0,
        'not_usability': 0
    }
    for classification in classifications:
        highest_score = max(classification['scores'], key=lambda x: x['score'])
        predicted = highest_score['class_name']
        if classification['test_data']['class_name'] == predicted:
            # .get() avoids a KeyError for classes beyond the two defaults.
            accuracies[predicted] = accuracies.get(predicted, 0) + 1
    return accuracies

In [114]:
def drop_unwanted_words(corpus, droplist):
    """Return a copy of the corpus without the words listed in ``droplist``.

    Args:
        corpus: Dict mapping class name to a dict of ``word -> weight``.
        droplist: Iterable of (hashable) words to remove from every class.

    Returns:
        A new dict with the same class names and the remaining words. The
        input corpus is not modified.
    """
    # Freeze the droplist once: constant-time membership tests, and a
    # one-shot iterable is no longer consumed by the first lookup.
    unwanted = frozenset(droplist)
    return {
        class_name: {
            word: weight for word, weight in words.items() if word not in unwanted
        }
        for class_name, words in corpus.items()
    }

In [120]:
# Read the droplist (one entry per line). The context manager closes the file
# handle, which the bare open(...).read() left open.
with open(DROPLIST_FILENAME) as droplist_file:
    droplist = droplist_file.read().splitlines()
# Build the per-class word counts from the train set.
corpus = train(train_dataset)
# corpus = drop_low_score(corpus, 3) # magic number
corpus = drop_unwanted_words(corpus, droplist)
# Weight the usability terms 1.4x; the factor of 1 leaves not_usability unchanged.
corpus = boost_score_targets(corpus, ['usability'], 1.4) # magic number
corpus = boost_score_targets(corpus, ['not_usability'], 1) # magic number

In [121]:
# Show how many distinct words each class kept after filtering.
for key in corpus:
    print(f' total words for {key}: {len(corpus[key].keys())} ')

 total words for not_usability: 277 
 total words for usability: 211 


In [122]:
# Score every test record against the corpus and normalize its scores in the
# same pass; each entry keeps the record next to its per-class scores.
classifications = [
    normalize_scores({
        'test_data': test_data,
        'scores': classificate(corpus, test_data['text']),
    })
    for test_data in test_dataset
]

In [123]:
accuracy = get_accuracy(classifications)

# Per class: right answers, number of test records of that class, and their
# ratio. Totals are computed once, and a class with no test records prints
# nan instead of raising ZeroDivisionError.
for class_key, class_label in (('usability', 'Usability'), ('not_usability', 'Not Usability')):
    class_total = sum(data['class_name'] == class_key for data in test_dataset)
    class_hits = accuracy[class_key]
    class_ratio = class_hits / class_total if class_total else float('nan')
    print(f'Right answers {class_label}: {class_hits} of {class_total} ({class_ratio})')

Right answers Usability: 139 of 144 (0.9652777777777778)
Right answers Not Usability: 5 of 9 (0.5555555555555556)


In [124]:
# is this correct?
classifications = [{'test_data': test_data, 'scores': classificate(corpus, test_data['text'])} for test_data in df]
classifications = [normalize_scores(classification) for classification in classifications]
accuracy = get_accuracy(classifications)

print(f'Right answers Usability: {accuracy["usability"]} of {len([data for data in df if data["class_name"]=="usability"])} ({accuracy["usability"]/len([data for data in df if data["class_name"]=="usability"])})')
print(f'Right answers Not Usability: {accuracy["not_usability"]} of {len([data for data in df if data["class_name"]=="not_usability"])} ({accuracy["not_usability"]/len([data for data in df if data["class_name"]=="not_usability"])})')

Right answers Usability: 255 of 265 (0.9622641509433962)
Right answers Not Usability: 194 of 244 (0.7950819672131147)
