#2 Preprocesamiento


###2.0 Conectar con Drive y copiar archivos

In [1]:
from google.colab import drive

# Monto Google Drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Copiar el archivo de Drive a Colab

!cp "/content/drive/My Drive/Video_Games_processed.csv" "/content/Video_Games_processed.csv"

###2.1 Leer archivo y cargas en dataframe

In [3]:
# Instalar librerias necesarias

! pip install num2words

Collecting num2words
  Downloading num2words-0.5.13-py3-none-any.whl (143 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m133.1/143.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=9d00060a2423b84c62a0acf2cc5d9f1816eb951aa090bbf9c5d0153d2ea847e7
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installi

In [4]:
# Importar librerias necesarias

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from num2words import num2words

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Inicializa el lematizador
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Verifico que el archivo se haya descargado correctamente

!ls -l "/content/"

total 10444
drwx------ 5 root root     4096 Mar 28 15:37 drive
drwxr-xr-x 1 root root     4096 Mar 26 13:28 sample_data
-rw------- 1 root root 10683869 Mar 28 15:37 Video_Games_processed.csv


In [6]:
# Leo el archivo generador en el Ejercicio 1

# Ruta completa al archivo CSV en google drive
ruta_archivo_csv = f"/content/Video_Games_processed.csv"

# Cargar el archivo CSV en un DataFrame
df = pd.read_csv(ruta_archivo_csv)

# Mostrar las primeras filas del DataFrame para verificar
print(df.head())

   overall                                         reviewText  sentiment
0      4.0  I had to learn the hard way after ordering thi...          1
1      4.0  I would recommend this learning game for anyon...          1
2      5.0  Choose your career which sets your money for t...          1
3      5.0  It took a few hours to get this up and running...          1
4      5.0  I oredered this for a daughter who is now 33 a...          1


### 2.1 Pipeline de procesamiento

In [7]:
# Funcion pipeline de preprocesamiento

def preprocess_text(text):

    # Me aseguro de que no haya problemas de enconding
    if not isinstance(text, str):
      text = str(text, encoding='utf-8', errors='replace')

    # Convertir el texto a minusculas para normalizar
    text = text.lower()

    # convertir digitos a palabras
    if text.isdigit():
      text = num2words(text, lang='en')

    # Eliminar puntuaciones y simbolos
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenizar
    tokens = word_tokenize(text)

    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lematización
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

### 2.2 Aplicar el preprocesamiento

In [8]:
# Aplicar preprocesamiento

df['reviewText'] = df['reviewText'].astype(str)  # Asegurar que el texto sea string

df['tokens'] = df['reviewText'].apply(preprocess_text)

In [9]:
# Mostrar las primeras filas del DataFrame para verificar

print(df.head())

   overall                                         reviewText  sentiment  \
0      4.0  I had to learn the hard way after ordering thi...          1   
1      4.0  I would recommend this learning game for anyon...          1   
2      5.0  Choose your career which sets your money for t...          1   
3      5.0  It took a few hours to get this up and running...          1   
4      5.0  I oredered this for a daughter who is now 33 a...          1   

                                              tokens  
0  [learn, hard, way, ordering, macbook, pro, doe...  
1  [would, recommend, learning, game, anyone, lik...  
2  [choose, career, set, money, trip, name, many,...  
3  [took, hour, get, running, window, computer, w...  
4  [oredered, daughter, wanted, play, oregon, tra...  


###2.3 Guardar el archivo prepocesado para posterior uso

In [10]:
# Guardo el archivo preprocesado para su posterior uso

ruta_destino_csv = f"/content/Video_Games_Preprocessed.csv"

df.to_csv(ruta_destino_csv, index=False)

In [11]:
!ls -l "/content/"

total 30032
drwx------ 5 root root     4096 Mar 28 15:37 drive
drwxr-xr-x 1 root root     4096 Mar 26 13:28 sample_data
-rw-r--r-- 1 root root 20054410 Mar 28 15:38 Video_Games_Preprocessed.csv
-rw------- 1 root root 10683869 Mar 28 15:37 Video_Games_processed.csv
