In [1]:
import math
import numpy as np
import pandas as pd
from src.knn import KNN, WeightedKNN

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)
pd.options.display.max_rows = 999


reviews_sentiment = pd.read_csv('datasets/reviews_sentiment.csv', delimiter=";")

for i, row in reviews_sentiment.iterrows():
    wc = len(row['reviewText'].strip().split(' '))
    if wc != row['wordcount']:
#         print(f"El registro {i} tiene el campo wordcount incorrecto, corrigiendo...")
        reviews_sentiment.at[i,'wordcount'] = wc

# print(sentiments)

# cambiar positivo a 1, negativo a 0.
reviews_sentiment['titleSentiment'].replace('positive', 1, inplace=True)
reviews_sentiment['titleSentiment'].replace('negative', 0, inplace=True)

ones = reviews_sentiment[reviews_sentiment['titleSentiment'] == 1]['titleSentiment'].aggregate('sum')
ceroes = reviews_sentiment[reviews_sentiment['titleSentiment'] == 0]['titleSentiment'].aggregate('sum')

prop = ones/len(reviews_sentiment)
print(prop) 

reviews_sentiment['titleSentiment'].fillna(0.5, inplace=True)

attributes = ['wordcount', 'titleSentiment', 'sentimentValue']
target = ['starRating']

# quitar los NaNs y verificar que no sea una porcion importante del conjunto de datos
nan_title_sentiment = reviews_sentiment[reviews_sentiment['titleSentiment'].isna()]
nan_text_sentiment = reviews_sentiment[reviews_sentiment['textSentiment'].isna()]
print(f"Cantidad de NaNs en titleSentiment {len(nan_title_sentiment)}")
print(f"Cantidad de registros {len(reviews_sentiment)}")
reviews_sentiment = reviews_sentiment.dropna()
print(f"Cantidad de registros despues de sacar NaNs: {len(reviews_sentiment)}.")

reviews_sentiment = reviews_sentiment[target + attributes]

print(reviews_sentiment)


0.7587548638132295
Cantidad de NaNs en titleSentiment 0
Cantidad de registros 257
Cantidad de registros despues de sacar NaNs: 257.
     starRating  wordcount  titleSentiment  sentimentValue
0             1         20             0.0       -0.486389
1             1          6             0.0       -0.586187
2             1          4             0.5       -0.602240
3             1         17             0.5       -0.616271
4             1          6             0.0       -0.651784
5             1          8             1.0       -0.720443
6             1         11             1.0       -0.726825
7             1         16             1.0       -0.736769
8             1          3             1.0       -0.765284
9             1         13             0.0       -0.797961
10            1          4             1.0       -0.833488
11            1          9             0.0       -0.838467
12            1          2             1.0       -0.888559
13            1         23             0.0

In [2]:
# Pregunta a)
print('Los comentarios valorados con 1 estrella, ¿que cantidad promedio de palabras tienen?')

mean = reviews_sentiment[reviews_sentiment['starRating'] == 1]['wordcount'].aggregate('mean')
print(f"El promedio es {round(mean, 2)}")

Los comentarios valorados con 1 estrella, ¿que cantidad promedio de palabras tienen?
El promedio es 8.16


In [3]:
from src.aux_functions import normalize_df


register_class = np.array(reviews_sentiment.starRating)
classes = np.array(reviews_sentiment.starRating.unique())

# print(sentiments.star_rating.unique())

# Se normalizan los datos por cada columna
norm_data = np.array(normalize_df(reviews_sentiment[attributes])) # me queda un arreglo de registros con 3 atributos

crossed_validation = 10
batch_size = math.floor(len(norm_data)/crossed_validation)

# valores de precision para cada corrido de validacion cruzada
knn_precisions = np.zeros(crossed_validation)
weight_knn_precisions = np.zeros(crossed_validation)


In [4]:
from src.aux_functions import confusion_matrix
from src.aux_functions import plot_matrix
from src.aux_functions import plot_precision


for i in range(crossed_validation):

    # separo por lotes al conjunto de entrenamiento/testeo para la validacion cruzada
    test_batch = np.array(range(batch_size * i, batch_size * (i + 1), 1))

    # b) Dividir el conjunto de datos en un conjunto de entrenamiento y otro de prueba.
    X = np.delete(norm_data, test_batch, axis = 0)
    f_X = np.delete(register_class, test_batch, axis = 0)
    Y = norm_data[test_batch[0]:(test_batch[-1] + 1)]
    f_Y = register_class[test_batch[0]:(test_batch[-1] + 1)]

    knn = KNN(X, f_X, classes)
    weight_knn = WeightedKNN(X, f_X, classes)

    predictions = knn.batch_classify(Y)
    weight_predictions = weight_knn.batch_classify(Y)
    
    knn_confusion = confusion_matrix(predictions, f_Y, classes)
    weight_knn_confusion = confusion_matrix(weight_predictions, f_Y, classes)

    knn_precisions[i] = knn_confusion.trace()/knn_confusion.sum()
    weight_knn_precisions[i] = weight_knn_confusion.trace()/weight_knn_confusion.sum()
    plot_matrix(weight_knn_confusion, f'crossed_validation_{crossed_validation}_{i}.png')



In [5]:
plot_precision(knn_precisions, weight_knn_precisions, crossed_validation, f'crossed_validation_{crossed_validation}.png')

print()
# print(f'Para el KNN pesado, la precision promedio resulto: {w_knn_precisions.mean()} con un valor maximo de: {w_knn_precisions.max()}')




In [6]:
from src.aux_functions import plot_precision_k

knn_means = []
weight_knn_means = []

# neighbours = np.arange(start=3, stop=20, step=2)
# for k in neighbours:

# iterations = np.arange(start=1, stop=15, step=1)
# for index in iterations:
#     validation_range = neighbours = np.arange(start=3, stop=20, step=1)
#     knn_precisions_avg = np.zeros(len(validation_range))
#     weight_knn_precisions_avg = np.zeros(len(validation_range))

validation_range = neighbours = np.arange(start=3, stop=25, step=1)
for crossed_validation in validation_range:        

    iterations = np.arange(start=0, stop=15, step=1)
    knn_precisions_avg = np.zeros(len(iterations))
    weight_knn_precisions_avg = np.zeros(len(iterations))
    for index in iterations:

        reviews_sentiment = reviews_sentiment.sample(frac=1).reset_index(drop=True)
        register_class = np.array(reviews_sentiment.starRating)
        classes = np.array(reviews_sentiment.starRating.unique())
        # Se normalizan los datos por cada columna
        norm_data = np.array(normalize_df(reviews_sentiment[attributes])) # me queda un arreglo de registros con 3 atributos
        batch_size = math.floor(len(norm_data)/crossed_validation)

        # valores de precision para cada corrido de validacion cruzada
        knn_precisions = np.zeros(crossed_validation)
        weight_knn_precisions = np.zeros(crossed_validation)

        for i in range(crossed_validation):
            # separo por lotes al conjunto de entrenamiento/testeo para la validacion cruzada
            test_batch = np.array(range(batch_size * i, batch_size * (i + 1), 1))

            # b) Dividir el conjunto de datos en un conjunto de entrenamiento y otro de prueba.
            X = np.delete(norm_data, test_batch, axis = 0)
            f_X = np.delete(register_class, test_batch, axis = 0)
            Y = norm_data[test_batch[0]:(test_batch[-1] + 1)]
            f_Y = register_class[test_batch[0]:(test_batch[-1] + 1)]

            knn = KNN(X, f_X, classes)
            weight_knn = WeightedKNN(X, f_X, classes)

            predictions = knn.batch_classify(Y)
            weight_predictions = weight_knn.batch_classify(Y)

            knn_confusion = confusion_matrix(predictions, f_Y, classes)
            weight_knn_confusion = confusion_matrix(weight_predictions, f_Y, classes)

            knn_precisions[i] = knn_confusion.trace()/knn_confusion.sum()
            weight_knn_precisions[i] = weight_knn_confusion.trace()/weight_knn_confusion.sum()
        #     plot_matrix(weight_knn_confusion, f'crossed_validation_{crossed_validation}_{i}.png')


        knn_precisions_avg[index] = knn_precisions.mean()
        weight_knn_precisions_avg[index] = weight_knn_precisions.mean()

#     print(knn_precisions_avg.mean())
    knn_means.append(knn_precisions_avg.mean())
    #knn_stds.append(knn_precisions_avg.std())
    weight_knn_means.append(weight_knn_precisions_avg.mean())
    #w_knn_stds.append(weight_knn_precisions_avg.std())
print('knn[6]: ' + str(knn_means[6]))
print('knn_w[6]: ' + str(weight_knn_means[6]))
plot_precision_k(knn_means, weight_knn_means, validation_range, f'crossed_validation_knn_means.png')
#     print(f'Finished neighbour {neigh_k}.')
        

knn[6]: 0.6939153439153439
knn_w[6]: 0.6687830687830686


In [7]:
print('knn[6]: ' + str(knn_means[12]))
print('knn_w[6]: ' + str(weight_knn_means[10]))

knn[6]: 0.6901960784313724
knn_w[6]: 0.652901484480432
