In [1]:
import pandas as pd
import gzip
import ast

In [2]:
review = "../Dataset/user_reviews.json.gz"
data = []

with gzip.open(review, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
            data.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

reviews = pd.DataFrame(data)

In [3]:
reviews.head(10)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
5,Wackky,http://steamcommunity.com/id/Wackky,"[{'funny': '', 'posted': 'Posted May 5, 2014.'..."
6,76561198079601835,http://steamcommunity.com/profiles/76561198079...,"[{'funny': '1 person found this review funny',..."
7,MeaTCompany,http://steamcommunity.com/id/MeaTCompany,"[{'funny': '', 'posted': 'Posted July 24.', 'l..."
8,76561198089393905,http://steamcommunity.com/profiles/76561198089...,"[{'funny': '5 people found this review funny',..."
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,"[{'funny': '', 'posted': 'Posted June 16.', 'l..."


Procedo a eliminar la columna que no voy a utilizar y revisar la cantidad de vacios 

In [4]:
reviews.drop('user_url', axis=1, inplace=True)

In [5]:
for column in reviews.columns:
    print(f'Hay {reviews[column].isna().sum()} valores nulos en {column}')

Hay 0 valores nulos en user_id
Hay 0 valores nulos en reviews


In [6]:
review = reviews.explode('reviews').reset_index(drop=True)

In [7]:
review.head(5)

Unnamed: 0,user_id,reviews
0,76561197970982479,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,76561197970982479,"{'funny': '', 'posted': 'Posted July 15, 2011...."
2,76561197970982479,"{'funny': '', 'posted': 'Posted April 21, 2011..."
3,js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
4,js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."


Normalizo los datos que existen en reviews

In [8]:
review = pd.json_normalize(review['reviews'])

In [9]:
for column in review.columns:
    print(f'Hay {review[column].isna().sum()} valores nulos en {column}')

Hay 28 valores nulos en funny
Hay 28 valores nulos en posted
Hay 28 valores nulos en last_edited
Hay 28 valores nulos en item_id
Hay 28 valores nulos en helpful
Hay 28 valores nulos en recommend
Hay 28 valores nulos en review


In [10]:
review = review[['item_id', 'review']]
review.head(3)

Unnamed: 0,item_id,review
0,1250,Simple yet with great replayability. In my opi...
1,22200,It's unique and worth a playthrough.
2,43110,Great atmosphere. The gunplay can be a bit chu...


In [11]:
review['review'] = review['review'].astype('str')

In [12]:
# Para realizar el analisis de sentimiento importo la libreria textblob
from textblob import TextBlob
def analisis_sentimiento(review: str) -> int:
    analisis = TextBlob(review)
    if analisis.sentiment.polarity < 0:
        return 0  # Sentimiento negativo
    elif analisis.sentiment.polarity == 0:
        return 1  # Sentimiento neutro
    else:
        return 2  # Sentimiento positivo

In [13]:
review['Analisis_Sentimientos'] = review['review'].apply(analisis_sentimiento)

In [14]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   item_id                59305 non-null  object
 1   review                 59333 non-null  object
 2   Analisis_Sentimientos  59333 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [16]:
review

Unnamed: 0,item_id,review,Analisis_Sentimientos
0,1250,Simple yet with great replayability. In my opi...,2
1,22200,It's unique and worth a playthrough.,2
2,43110,Great atmosphere. The gunplay can be a bit chu...,2
3,251610,I know what you think when you see this title ...,2
4,227300,For a simple (it's actually not all that simpl...,0
...,...,...,...
59328,70,a must have classic from steam definitely wort...,2
59329,362890,this game is a perfect remake of the original ...,2
59330,273110,had so much fun plaing this and collecting res...,2
59331,730,:D,2


In [17]:
bins = [-float('inf'), 0, 0.3, float('inf')]
labels = [0, 1, 2]

# Crear la columna 'sentimiento' usando pd.cut
review['Analisis_Sentimientos'] = pd.cut(review['Analisis_Sentimientos'], bins=bins, labels=labels)

# Crear columnas binarias
review['negativo'] = (review['Analisis_Sentimientos'] == 0).astype(int)
review['neutral'] = (review['Analisis_Sentimientos'] == 1).astype(int)
review['positivo'] = (review['Analisis_Sentimientos'] == 2).astype(int)

In [18]:
review

Unnamed: 0,item_id,review,Analisis_Sentimientos,negativo,neutral,positivo
0,1250,Simple yet with great replayability. In my opi...,2,0,0,1
1,22200,It's unique and worth a playthrough.,2,0,0,1
2,43110,Great atmosphere. The gunplay can be a bit chu...,2,0,0,1
3,251610,I know what you think when you see this title ...,2,0,0,1
4,227300,For a simple (it's actually not all that simpl...,0,1,0,0
...,...,...,...,...,...,...
59328,70,a must have classic from steam definitely wort...,2,0,0,1
59329,362890,this game is a perfect remake of the original ...,2,0,0,1
59330,273110,had so much fun plaing this and collecting res...,2,0,0,1
59331,730,:D,2,0,0,1


In [19]:
reviews = review[['item_id', 'negativo', 'neutral', 'positivo']]

In [20]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   item_id   59305 non-null  object
 1   negativo  59333 non-null  int32 
 2   neutral   59333 non-null  int32 
 3   positivo  59333 non-null  int32 
dtypes: int32(3), object(1)
memory usage: 1.1+ MB


In [21]:
reviews = reviews.groupby('item_id').agg({
    'negativo': 'sum',
    'neutral': 'sum',
    'positivo': 'sum'
}).reset_index()

In [22]:
reviews

Unnamed: 0,item_id,negativo,neutral,positivo
0,10,6,0,51
1,10090,5,0,47
2,10130,0,0,2
3,10140,1,0,0
4,10150,0,0,9
...,...,...,...,...
3677,99400,1,0,0
3678,99700,0,0,3
3679,99810,2,0,6
3680,99900,30,0,124


In [39]:
reviews.to_parquet('../Dataset/reviews_clean.parquet')