In [1]:
import pandas as pd
import gzip
import ast

In [2]:
review = "../Dataset/user_reviews.json.gz"
data = []

with gzip.open(review, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
            data.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

reviews = pd.DataFrame(data)

In [3]:
reviews.head(10)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
5,Wackky,http://steamcommunity.com/id/Wackky,"[{'funny': '', 'posted': 'Posted May 5, 2014.'..."
6,76561198079601835,http://steamcommunity.com/profiles/76561198079...,"[{'funny': '1 person found this review funny',..."
7,MeaTCompany,http://steamcommunity.com/id/MeaTCompany,"[{'funny': '', 'posted': 'Posted July 24.', 'l..."
8,76561198089393905,http://steamcommunity.com/profiles/76561198089...,"[{'funny': '5 people found this review funny',..."
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,"[{'funny': '', 'posted': 'Posted June 16.', 'l..."


Desanidamos la columna reviews

In [4]:
desanidar = [pd.json_normalize(user, 'reviews') for user in data]
#reviews = pd.concat(desanidar, ignore_index=True)

In [5]:
reviews.head(2)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."


In [6]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


Creo la columna year

In [7]:
# Extraigo el año de la columna 'posted' y lo guardo en una nueva columna 'posted year'
# extract(r'(\d{4})') es una expresión regular para extraer una secuencia de 4 dígitos que en este caso seria el año
reviews['posted year'] = reviews['posted'].str.extract(r'(\d{4})') 

KeyError: 'posted'

Elimino la columna posted

In [None]:
reviews.drop('posted', axis=1, inplace=True)

In [None]:
reviews.head(2)

Unnamed: 0,funny,last_edited,item_id,helpful,recommend,review,posted year
0,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011
1,,,22200,No ratings yet,True,It's unique and worth a playthrough.,2011


In [None]:
#Revisamos cuantos valores nulos hay en item_id
reviews['item_id'].isna().value_counts()

item_id
False    59305
Name: count, dtype: int64

Deberemos hacer el analisis de sentimientos 

In [None]:
# Para realizar el analisis de sentimiento importo la libreria textblob
from textblob import TextBlob

In [None]:
def analisis_sentimiento(review: str) -> int:
    analisis = TextBlob(review)
    if analisis.sentiment.polarity < 0:
        return 0  # Sentimiento negativo
    elif analisis.sentiment.polarity == 0:
        return 1  # Sentimiento neutro
    else:
        return 2  # Sentimiento positivo

# Asumiendo que 'reviews' es tu DataFrame original o una copia explícita
reviews.loc[:, 'analisis_sentimiento'] = reviews['review'].apply(analisis_sentimiento)

In [None]:
reviews.head(2)

Unnamed: 0,funny,last_edited,item_id,helpful,recommend,review,posted year,analisis_sentimiento
0,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011,2
1,,,22200,No ratings yet,True,It's unique and worth a playthrough.,2011,2


In [14]:
df_reviews = reviews.drop(columns=['funny','last_edited','helpful'])

In [24]:
df_reviews.head(2)

Unnamed: 0,item_id,recommend,review,posted year,analisis_sentimiento
0,1250,True,Simple yet with great replayability. In my opi...,2011,2
1,22200,True,It's unique and worth a playthrough.,2011,2


In [25]:
df_reviews.to_parquet('../Dataset/reviews_clean.parquet')