In [28]:
%pip install textstat

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import nltk
from typing import Literal

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Error loading vader_lexicon: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [3]:
from nltk.sentiment import SentimentIntensityAnalyzer

def anilyze_sentiment(text: str) -> Literal[-1, 0, 1]:
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(text)
    
    if sentiment_score['compound'] >= 0.05:
        return 1
    elif sentiment_score['compound'] <= -0.05:
        return -1
    else:
        return 0

In [4]:
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiments
0,0,0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,0.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012",-1
1,1,1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,3.0,Good rally game,1372550400,"06 30, 2013",-1
2,2,2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,0.0,Wrong key,1403913600,"06 28, 2014",-1
3,3,4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,3.0,DIRT 3,1308009600,"06 14, 2011",1
4,4,5,A2UTRVO4FDCBH6,700099867,A.R.G.,"[0, 0]","Overall this is a well done racing game, with ...",3.0,"Good racing game, terrible Windows Live Requir...",1368230400,"05 11, 2013",1


In [5]:
# scores = {
#     0: 'terrible',
#     1: 'pretty bad',
#     2: 'not that bad',
#     3: 'regular',
#     4: 'good',
#     5: 'very good'
# }

# df['sentiments'] = [anilyze_sentiment(f'{df.reviewText[i]}. Because of this I give a score of {df.overall[i]} to this game and think it\'s {scores[df.overall[i]]}  ') for i in range(len(df))]

In [5]:
import numpy as np
import networkx as nx

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def create_graph(df):
    G = nx.Graph()
    
    # Crear nodos para usuarios y juegos
    for _, row in df.iterrows():
        G.add_node(row['reviewerID'], bipartite=0)
        G.add_node(row['asin'], bipartite=1)
        
        # Añadir arista con peso basado en sentiment y overall
        weight = (row['sentiments'] + 1) / 2 * (row['overall'] / 5)  # Normalizar a [0, 1]
        G.add_edge(row['reviewerID'], row['asin'], weight=weight)
    
    return G

def get_similar_users(G, user_id, game_id):
    # Obtener usuarios que han revisado el mismo juego
    similar_users = set(G.neighbors(game_id)) - {user_id}
    return similar_users

def personalized_pagerank(G, user_id, game_id):
    similar_users = get_similar_users(G, user_id, game_id)
    
    # Crear vector personalizado
    personalization = defaultdict(float)
    for user in similar_users:
        personalization[user] = 1.0
    
    # Ejecutar PageRank personalizado
    pagerank = nx.pagerank(G, personalization=personalization)
    return pagerank

def rank_reviews(df, user_id, game_id, top_n = 10):
    G = create_graph(df)
    pagerank = personalized_pagerank(G, user_id, game_id)
    
    print(pagerank)
    
    # Filtrar reviews del juego específico
    game_reviews = df[df['asin'] == game_id]
    
    # Calcular score para cada review
    scored_reviews = []
    for _, review in game_reviews.iterrows():
        reviewer_rank = pagerank.get(review['reviewerID'], 0)
        sentiment_score = (review['sentiments'] + 1) / 2  # Normalizar a [0, 1]
        helpful = [int(x) for x in review['helpful'][1:-1].split(', ')]
        helpfulness = helpful[0] / max(helpful[1], 1)
        
        score = (reviewer_rank + sentiment_score + helpfulness) / 3
        scored_reviews.append((score, review))
    
    # Ordenar reviews por score
    scored_reviews.sort(reverse=True, key=lambda x: x[0])``
    
    return scored_reviews[:top_n]

In [14]:
rev = rank_reviews(df,'A2HD75EMZR8QLN','B000006OVE')

{'A2HD75EMZR8QLN': 8.458003839168598e-06, '0700099867': 1.7336908349442736e-05, 'A3UR8NLLY1ZHCX': 1.916305042689347e-06, 'A1INA0F5CWW3J4': 0.0, 'A361M14PU2GUEG': 1.0383814647589728e-06, 'A2UTRVO4FDCBH6': 1.0383814647589728e-06, 'AN3YYDZAS3O1Y': 1.3845086196786306e-06, 'AQTC623NCESZW': 1.9947206147807397e-06, 'A1QJJU33VNC4S7': 3.382533506514288e-05, 'A2JLT2WY0F2HVI': 0.0, 'A38NXTZUFB1O2K': 1.3845086196786306e-06, 'ANW6EGY12V5XS': 1.3845086196786306e-06, 'AHT34BRYFBFT1': 4.709984274129588e-06, 'A248LSBZT4P38V': 1.3696853439163692e-05, 'AFS6WERAP409A': 2.8898376375857363e-06, 'A14L115LBOB0A5': 0.0, 'A2VYL51WQXC5KK': 1.0879216830392226e-05, 'A10AYT89XSCE46': 0.0, 'A29KT7UP7DLM1J': 0.0, 'A15PIAQT55GNCA': 1.3845086196786306e-06, 'A18YFCLL3GBD0T': 0.0, 'A1BHRNLW2L8KLD': 0.0, 'A1CB8HH9YJ2YZE': 6.922543098393153e-07, 'A2LQCBLLJVVR5T': 1.3845086196786306e-06, 'A37M0B3NHDHN9V': 1.5718538220295883e-07, '6050036071': 7.230527581336102e-06, 'A2QQ9KSQ44QZSE': 2.0958050960394513e-07, 'A2PJOSU5ND84S1':

In [12]:
rev[0][1]

Unnamed: 0.1                                                    131
Unnamed: 0                                                      140
reviewerID                                           A38AXXL7DMVFMQ
asin                                                     B000006OVE
reviewerName                        Vicente Drago "Shaun O'Donnell"
helpful                                                      [2, 2]
reviewText        Many may disregard the Breath of Fire series a...
overall                                                         4.0
summary                                     Square Soft third Rival
unixReviewTime                                           1008633600
reviewTime                                              12 18, 2001
sentiments                                                        1
Name: 131, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def content_diversity(reviews):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([r['reviewText'] for r in reviews])
    sim_matrix = cosine_similarity(tfidf_matrix)
    return 1 - sim_matrix.mean()

In [10]:
def perceived_helpfulness(reviews):
    helpful = []
    for review in reviews:
        h = [int(x) for x in review['helpful'][1:-1].split(', ')]
        helpful.append(h)

    return sum(r[0] / (r[1] + 1) for r in helpful) / len(reviews)

In [11]:
from datetime import datetime

def temporal_coverage(reviews):
    dates = [datetime.strptime(r['reviewTime'], '%m %d, %Y') for r in reviews]
    time_span = max(dates) - min(dates)
    return time_span.days / 365  # Normalizado a años

In [12]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Descargar recursos necesarios
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Definir los aspectos y sus palabras clave asociadas
aspects = {
    'facilidad para instalar': ['install', 'installation', 'setup', 'download', 'easy to install', 'quick install'],
    'jugabilidad': ['gameplay', 'controls', 'mechanics', 'playability', 'difficulty', 'challenging'],
    'graficos': ['graphics', 'visuals', 'animation', 'art style', 'design', 'beautiful', 'stunning', 'art', 'images'],
    'musica': ['music', 'soundtrack', 'audio', 'sound effects', 'voice acting', 'atmospheric'],
    'performance': ['performance', 'fps', 'frame rate', 'lag', 'glitches', 'bugs', 'optimization', 'frames', 'bug', 'bugged', 'glitch'],
    'experiencia general': ['fun', 'boring', 'repetitive', 'great', 'amazing', 'awesome']
}

def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar caracteres especiales
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenizar
    tokens = word_tokenize(text)
    # Eliminar stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lematizar
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

def check_aspect(review_text, aspect_keywords):
    tokens = preprocess_text(review_text)
    
    for keyword in aspect_keywords:
        if len(keyword.split()) > 1:  # Si es una frase
            if keyword in review_text.lower():
                return True
        else:  # Si es una palabra individual
            lemma_keyword = lemmatizer.lemmatize(keyword)
            if lemma_keyword in tokens:
                return True
    
    return False

def aspect_coverage(reviews, aspects):
    covered_aspects = {aspect: 0 for aspect in aspects}
    total_reviews = len(reviews)
    
    for review in reviews:
        review_text = review['reviewText']
        for aspect, keywords in aspects.items():
            if check_aspect(review_text, keywords):
                covered_aspects[aspect] += 1
    
    # Calcular el porcentaje de cobertura para cada aspecto
    coverage = {aspect: count / total_reviews for aspect, count in covered_aspects.items()}
    
    # Calcular la cobertura promedio
    average_coverage = sum(coverage.values()) / len(aspects)
    
    return coverage, average_coverage

[nltk_data] Downloading package punkt to /home/dukagin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/dukagin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dukagin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dukagin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dukagin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
def sentiment_diversity(reviews):
    sentiments = [review['sentiments'] for review in reviews]
    return len(set(sentiments)) / len(sentiments)

In [14]:
def rating_coverage(reviews):
    ratings = [review['overall'] for review in reviews]
    return (max(ratings) - min(ratings)) / 5  # Asumiendo que las calificaciones van de 0 a 5

In [17]:
import textstat

def readability_index(reviews):
    scores = [textstat.flesch_reading_ease(review['reviewText']) for review in reviews]
    return sum(scores) / len(scores)

In [18]:
reviews = [e[1] for e in rev]
rank = [e[0] for e in rev]

In [19]:
content_diversity(reviews)

0.7035248077517

In [20]:
perceived_helpfulness(reviews)

0.2833333333333333

In [21]:
temporal_coverage(reviews)

13.805479452054794

In [22]:
aspect_coverage_results, avg_coverage = aspect_coverage(reviews, aspects)

In [23]:
aspect_coverage_results

{'facilidad para instalar': 0.0,
 'jugabilidad': 0.3,
 'graficos': 0.4,
 'musica': 0.2,
 'performance': 0.0,
 'experiencia general': 0.7}

In [24]:
avg_coverage

0.26666666666666666

In [25]:
sentiment_diversity(reviews)

0.1

In [26]:
rating_coverage(reviews)

0.2

In [27]:
readability_index(reviews)

77.92