In [15]:
import pandas as pd
import pyarrow.parquet as pq

In [16]:
# Read the raw reviews parquet file with pyarrow and convert it to a pandas DataFrame.
df_reviews = pq.read_table('Datos/dfreviewsOpen_compr.parquet').to_pandas()

Para empezar la limpieza, voy a remover todas las filas que tengan datos nulos en todas las columnas. 

In [19]:
# Discard only the rows in which every single column is missing.
df_reviews = df_reviews.dropna(axis=0, how='all')

In [17]:
df_reviews.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250.0,No ratings yet,True,Simple yet with great replayability. In my opi...,
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200.0,No ratings yet,True,It's unique and worth a playthrough.,
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110.0,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,


In [18]:
# Remove the stray column labelled '0'.
df_reviews = df_reviews.drop(columns=['0'])

In [10]:
df_reviews.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250.0,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200.0,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110.0,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


Explorando la columna 'item_id'.

In [26]:
df_reviews['item_id'].isnull().sum()

0

In [25]:
# Keep only the rows that have an 'item_id'.
df_reviews = df_reviews.dropna(subset=['item_id'])

Quiero reemplazar el año de 'posted' por el año en que se hizo la última modificación del review.

In [21]:
# First run of four consecutive digits in each date string, i.e. the year.
# str.extract returns a one-column DataFrame (column 0) for each source column.
year_pattern = r'(\d{4})'
lista1 = df_reviews['posted'].str.extract(year_pattern)
lista2 = df_reviews['last_edited'].str.extract(year_pattern)

In [22]:
from numpy import nan

# Prefer the year of the last edit; where 'last_edited' yielded no year, keep
# the year extracted from 'posted'.
# This is done with an index-aligned, vectorized fillna instead of a positional
# loop because:
#   * df_reviews had rows dropped earlier, so its index labels are not
#     guaranteed to be 0..n-1 and `lista2[0][i]` with i from range(len(...))
#     can raise KeyError or pick the wrong row;
#   * `value is not nan` is an identity test and misses missing values that
#     are None or a different NaN object;
#   * `lista1[0][i] = ...` is chained assignment, which pandas does not
#     guarantee to write back into lista1.
lista1[0] = lista2[0].fillna(lista1[0])

In [23]:
# Overwrite 'posted' with the extracted year (the only column, 0, of lista1).
df_reviews['posted'] = lista1[0]

In [24]:
df_reviews.head(1)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,2011,,1250.0,No ratings yet,True,Simple yet with great replayability. In my opi...


Quitando la columna 'funny', ya que es irrelevante para este caso. 

In [27]:
# Take the 'funny' column out of the frame, keeping it in its own variable.
funny = df_reviews.pop('funny')

Borrando las columnas 'user_url', 'helpful' y 'last_edited'.

In [28]:
# Bind the 'user_url' column to its own variable; the column is dropped from df_reviews in a later cell.
user_url = df_reviews['user_url']

In [29]:
# Bind the 'helpful' column to its own variable; the column is dropped from df_reviews in a later cell.
helpful = df_reviews['helpful']

In [30]:
# Bind the 'last_edited' column to its own variable; the column is dropped from df_reviews in a later cell.
last_edited = df_reviews['last_edited']

In [31]:
# Remove the columns that are not needed for the rest of the notebook.
unused_columns = ['user_url', 'helpful', 'last_edited']
df_reviews = df_reviews.drop(columns=unused_columns)

In [32]:
df_reviews.head(1)

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011,1250.0,True,Simple yet with great replayability. In my opi...


Limpieza de símbolos y caracteres innecesarios:

In [33]:
# Characters deleted outright from every text column; '-' is turned into a space instead.
_REMOVED_CHARS = '!¡?¿"\',&_'
_CHAR_TABLE = str.maketrans('-', ' ', _REMOVED_CHARS)


def _normalize_text_column(column):
    """Lower-case an object-dtype column and strip the unwanted symbols.

    Columns whose dtype is not ``object`` are returned untouched. Object
    columns are cast to ``str`` first, so missing values become their string
    representation, then lower-cased and passed through a single
    character-translation table. Using ``str.translate`` instead of a chain of
    ``str.replace`` calls keeps every character literal (no dependence on the
    pandas version's ``regex`` default, which matters for '?') and does the
    work in one pass per column instead of twelve.
    """
    if column.dtype != "object":
        return column
    return column.astype(str).str.lower().str.translate(_CHAR_TABLE)


# NOTE(review): this touches every object column, not only 'review' — identifier
# columns such as 'user_id' are lower-cased and lose '-'/'_' too. Confirm that is intended.
df_reviews = df_reviews.apply(_normalize_text_column)

df_reviews.head(1)

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011,1250.0,True,simple yet with great replayability. in my opi...


Convirtiendo 'item_id' a integer.

In [34]:
# Cast 'item_id' to integer, leaving every other column as it is.
df_reviews = df_reviews.astype({'item_id': int})

In [None]:
import pyarrow.parquet as pq
# Persist the cleaned reviews as a parquet file.
# NOTE(review): this path is under 'data/' while the other read/write calls in
# this notebook use 'Datos/' — confirm the target directory is the intended one.
df_reviews.to_parquet('data/df_reviews_limpio_compr.parquet')

Análisis de sentimiento

In [35]:
import pandas as pd
from textblob import TextBlob
import re
import nltk
from nltk.corpus import stopwords
from textblob import Word

Antes de hacer el análisis de sentimiento tengo que limpiar y lematizar el texto de la columna de reviews.

In [36]:
_ENGLISH_STOPWORDS = None  # filled on first use, so the corpus only has to exist when clean_text is called


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalize a review before sentiment analysis.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, drops English stop words and lemmatizes the
    remaining tokens with ``Word.lemmatize``.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or NaN) are
            treated as an empty review instead of raising.

    Returns:
        str: The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'[^\w\s]', '', text.lower())
    # The stop-word list is loaded once and kept as a set: the original asked
    # NLTK for the full list again for every single token.
    stop_words = _english_stopwords()
    # Filter and lemmatize in one pass over the tokens.
    return ' '.join(Word(word).lemmatize() for word in text.split() if word not in stop_words)

Aplico el resultado a una columna nueva llamada 'clean_reviews'.

In [38]:
# Download the NLTK resources used for text cleaning: the stop-word corpus read by
# clean_text and WordNet (presumably what Word.lemmatize relies on — confirm).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...


True

In [39]:
# Apply the text cleaning to the 'review' column and store the result in 'clean_reviews'
df_reviews['clean_reviews'] = df_reviews['review'].apply(clean_text)

Creo la función para el análisis de sentimiento y lo aplico a la columna 'clean_reviews', para crear la columna 'sentiment_analysis'

In [40]:
def get_sentiment(text):
    """Classify a text by the sign of its TextBlob polarity.

    Returns:
        int: 0 when the polarity is negative, 1 when it is exactly zero
        and 2 in every other case.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity < 0:
        return 0  # negative
    if polarity == 0:
        return 1  # neutral
    return 2  # positive


In [41]:
# Score every cleaned review and store the class (0, 1 or 2) in 'sentiment_analysis'.
cleaned_reviews = df_reviews['clean_reviews']
df_reviews['sentiment_analysis'] = cleaned_reviews.map(get_sentiment)

In [42]:
df_reviews.head()

Unnamed: 0,user_id,posted,item_id,recommend,review,clean_reviews,sentiment_analysis
0,76561197970982479,2011,1250,True,simple yet with great replayability. in my opi...,simple yet great replayability opinion zombie ...,2
1,76561197970982479,2011,22200,True,its unique and worth a playthrough.,unique worth playthrough,2
2,76561197970982479,2011,43110,True,great atmosphere. the gunplay can be a bit chu...,great atmosphere gunplay bit chunky time end d...,2
3,js41637,2014,251610,True,i know what you think when you see this title ...,know think see title barbie dreamhouse party i...,2
4,js41637,2013,227300,True,for a simple (its actually not all that simple...,simple actually simple truck driving simulator...,0


Los nulos son categorizados como neutrales.

In [None]:
import pyarrow.parquet as pq
# Write the reviews, now including 'clean_reviews' and 'sentiment_analysis', to parquet.
df_reviews.to_parquet('Datos/df_reviews_sentimiento.parquet')