In [61]:
import pandas as pd
import gzip
import ast

In [62]:
review = "../Dataset/user_reviews.json.gz"
data = []

with gzip.open(review, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
            data.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

reviews = pd.DataFrame(data)

In [63]:
reviews.head(10)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
5,Wackky,http://steamcommunity.com/id/Wackky,"[{'funny': '', 'posted': 'Posted May 5, 2014.'..."
6,76561198079601835,http://steamcommunity.com/profiles/76561198079...,"[{'funny': '1 person found this review funny',..."
7,MeaTCompany,http://steamcommunity.com/id/MeaTCompany,"[{'funny': '', 'posted': 'Posted July 24.', 'l..."
8,76561198089393905,http://steamcommunity.com/profiles/76561198089...,"[{'funny': '5 people found this review funny',..."
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,"[{'funny': '', 'posted': 'Posted June 16.', 'l..."


Desanidamos la columna reviews

In [64]:
reviews = reviews.explode('reviews') 
reviews = pd.concat([reviews.drop(['reviews'],axis=1), reviews['reviews'].apply(pd.Series)], axis=1)
reviews.rename(columns={0: 'analisis_sentimiento'}, inplace=True)
reviews['analisis_sentimiento'] = 0

In [65]:
reviews.head(2)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,analisis_sentimiento
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,0


In [66]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59333 entries, 0 to 25798
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   user_id               59333 non-null  object
 1   user_url              59333 non-null  object
 2   funny                 59305 non-null  object
 3   posted                59305 non-null  object
 4   last_edited           59305 non-null  object
 5   item_id               59305 non-null  object
 6   helpful               59305 non-null  object
 7   recommend             59305 non-null  object
 8   review                59305 non-null  object
 9   analisis_sentimiento  59333 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 5.0+ MB


Creo la columna year

In [67]:
# Extraigo el año de la columna 'posted' y lo guardo en una nueva columna 'posted year'
# extract(r'(\d{4})') es una expresión regular para extraer una secuencia de 4 dígitos que en este caso seria el año
reviews['posted year'] = reviews['posted'].str.extract(r'(\d{4})') 

Elimino la columna posted

In [68]:
reviews.drop('posted', axis=1, inplace=True)

In [69]:
reviews.head(2)

Unnamed: 0,user_id,user_url,funny,last_edited,item_id,helpful,recommend,review,analisis_sentimiento,posted year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,0,2011
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,22200,No ratings yet,True,It's unique and worth a playthrough.,0,2011


In [70]:
#Revisamos cuantos valores nulos hay en item_id
reviews['item_id'].isna().value_counts()

item_id
False    59305
True        28
Name: count, dtype: int64

In [71]:
# Eliminamos los valores nulos
df_reviews = reviews.dropna(subset=['item_id'])
df_reviews.shape

(59305, 10)

In [72]:
df_reviews.head()

Unnamed: 0,user_id,user_url,funny,last_edited,item_id,helpful,recommend,review,analisis_sentimiento,posted year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,0,2011
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,22200,No ratings yet,True,It's unique and worth a playthrough.,0,2011
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,0,2011
1,js41637,http://steamcommunity.com/id/js41637,,,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,0,2014
1,js41637,http://steamcommunity.com/id/js41637,,,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,0,2013


Deberemos hacer el analisis de sentimientos 

In [73]:
df_reviews = reviews.drop(columns=['user_url','funny','last_edited','helpful'])

In [74]:
df_reviews.head(2)

Unnamed: 0,user_id,item_id,recommend,review,analisis_sentimiento,posted year
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,0,2011
0,76561197970982479,22200,True,It's unique and worth a playthrough.,0,2011


In [75]:
# Para realizar el analisis de sentimiento importo la libreria textblob
from textblob import TextBlob

In [76]:
def analisis_sentimiento(review: str) -> int:
    analisis = TextBlob(review)
    if analisis.sentiment.polarity < 0:
        return 0  # Sentimiento negativo
    elif analisis.sentiment.polarity == 0:
        return 1  # Sentimiento neutro
    else:
        return 2  # Sentimiento positivo

# Asumiendo que 'reviews' es tu DataFrame original o una copia explícita
df_reviews.loc[:, 'analisis_sentimiento'] = df_reviews['review'].apply(analisis_sentimiento)

TypeError: The `text` argument passed to `__init__(text)` must be a string, not <class 'float'>

In [60]:
df_reviews.head(2)

Unnamed: 0,user_id,user_url,funny,last_edited,item_id,helpful,recommend,review,analisis_sentimiento,posted year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2,2011
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,22200,No ratings yet,True,It's unique and worth a playthrough.,2,2011


In [44]:
df_reviews = reviews.drop(columns=['user_url','funny','last_edited','helpful'])

In [45]:
df_reviews.head(2)

Unnamed: 0,user_id,item_id,recommend,review,analisis_sentimiento,posted year
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,0,2011
0,76561197970982479,22200,True,It's unique and worth a playthrough.,0,2011
