In [1]:
# ETL DEL DATASET AUSTRALIAN_USER_REVIEWS

#Se desarrolla la extracción, transformación y carga de australian_user_reviews.json

In [2]:
import pandas as pd 
import json
import re 
import ast 

In [3]:
# FUNCIONES IMPORTANTES

# Para verificar tipos de datos y porcentajes
def verificar_tipo_dato(dataframe):
    diccionario = {'nombre_campo': [], 'tipo_dato': [], '%_no_nulos': [], '%_nulos': [], 'nulos': []}
    for columna in dataframe.columns:
        porcentaje_no_nulos = (dataframe[columna].count()/len(dataframe))*100
        diccionario['nombre_campo'].append(columna)
        diccionario['tipo_dato'].append(dataframe[columna].apply(type).unique())
        diccionario['%_no_nulos'].append(round(porcentaje_no_nulos,2))
        diccionario['%_nulos'].append(round(100-porcentaje_no_nulos,2))
        diccionario['nulos'].append(dataframe[columna].isnull().sum())
    
    df_info = pd.DataFrame(diccionario)
    return df_info




# Para ver la existencia de duplicados
def verificar_duplicados_columna(df, columna):
    lista_vacia = []
    filas_duplicadas = df[df.duplicated(subset=columna, keep=False)]
    for elemento in df[columna]:
        if len(filas_duplicadas) == len(lista_vacia):
            return 'No hay elementos duplicados'
        else:
            return filas_duplicadas.sort_values(by=columna, ascending=True)
        



def convertir_fecha(fecha):
    match = re.search(r'(\w+\s\d{1,2},\s\d{4})', fecha)
    if match:
        fecha_str = match.group(1)
        try:
            fecha_dt = pd.to_datetime(fecha_str)
            return fecha_dt.strftime('%Y-%m-%d')
        except:
            return 'Fecha inválida'
    else:
        return 'Formato inválido'

In [4]:
# EXTRACCIÓN DE DATOS

# Se extraen los datos de los JSON y se les convierte en dataframe
rows=[]
with open("C:\\Users\\cquir\\OneDrive\\Escritorio\\Data Science SH\\Proyecto Individual 1\\bases de datos\\australian_user_reviews.json", "r", encoding="UTF-8") as f:
    for line in f.readlines():
        rows.append(ast.literal_eval(line))
dfreviews = pd.DataFrame(rows)
dfreviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [5]:
# Se verifican tipos de datos y cantidad de nulos
verificar_tipo_dato(dfreviews)

Unnamed: 0,nombre_campo,tipo_dato,%_no_nulos,%_nulos,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,user_url,[<class 'str'>],100.0,0.0,0
2,reviews,[<class 'list'>],100.0,0.0,0


In [6]:
# Se verifica si existen duplicados 
verificar_duplicados_columna(dfreviews, 'user_id')

Unnamed: 0,user_id,user_url,reviews
12888,05041129,http://steamcommunity.com/id/05041129,"[{'funny': '', 'posted': 'Posted May 18, 2015...."
5250,05041129,http://steamcommunity.com/id/05041129,"[{'funny': '', 'posted': 'Posted May 18, 2015...."
3133,111222333444555666888,http://steamcommunity.com/id/11122233344455566...,"[{'funny': '', 'posted': 'Posted December 22, ..."
3134,111222333444555666888,http://steamcommunity.com/id/11122233344455566...,"[{'funny': '', 'posted': 'Posted December 22, ..."
4139,29123,http://steamcommunity.com/id/29123,"[{'funny': '', 'posted': 'Posted March 26.', '..."
...,...,...,...
2721,xXAussieRockXx,http://steamcommunity.com/id/xXAussieRockXx,"[{'funny': '', 'posted': 'Posted July 17, 2015..."
2680,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
17916,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
5855,zeroblade,http://steamcommunity.com/id/zeroblade,"[{'funny': '', 'posted': 'Posted November 30, ..."


In [7]:
# Se observan 623 filas duplicadas en 'user_id', pero hay que ver si para cada duplicado de user_id sus reviews son idénticas o difieren por ser más de un comentario hecho por el usuario. 
user_id = '05041129'
user_reviews = dfreviews[dfreviews['user_id'] == user_id]['reviews']
for review_list in user_reviews:
    for review in review_list:
        print(review['review'])
    print('-' * 40)

This game to me it is so good that it is better than any of the games out their and $15 worth it
this is the best third person game ever that i have played
this will be the  number one game if it have more competitive things
----------------------------------------
This game to me it is so good that it is better than any of the games out their and $15 worth it
this is the best third person game ever that i have played
this will be the  number one game if it have more competitive things
----------------------------------------


In [8]:
# En este caso los reviews son idénticos, por lo que se elimina la primera ocurrencia para cada uno de los user_id duplicados 
dfreviews = dfreviews.drop_duplicates(subset='user_id', keep='first')

verificar_duplicados_columna(dfreviews,'user_id')

'No hay elementos duplicados'

In [9]:
# Se revisa la estructura de los datos presentes en la columna reviews 
dfreviews['reviews'][0]

# Se puede observar que es una lista de diccionarios hechos por cada videojuego diferente

[{'funny': '',
  'posted': 'Posted November 5, 2011.',
  'last_edited': '',
  'item_id': '1250',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'},
 {'funny': '',
  'posted': 'Posted July 15, 2011.',
  'last_edited': '',
  'item_id': '22200',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': "It's unique and worth a playthrough."},
 {'funny': '',
  'posted': 'Posted April 21, 2011.',
  'last_edited': '',
  'item_id': '43110',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]

In [10]:
# En este caso, debemos realizar 2 procedimientos a las diferentes variables de estos diccionarios:

# a. En primera instancia, transformar la columna general 'reviews', ya que está anidada y se busca generar una columna por cada diccionario para posteriormente crear un registro por cada diccionario 
# b. Transformar la columna 'reviews_posted', procesando la fecha y convirtiendola a un formato homogeneo y específico para todas 

In [11]:
# TRANSFORMACION 'reviews'
dfreviews2 = pd.json_normalize(dfreviews['reviews'])
dfreviews2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,


In [12]:
# ya que transformamos solo la columna reviews, perdemos el 'user_id' y 'user_url'. Sin embargo, estos se encuentran en la misma posición original, por lo que se pueden concatenar con el dataframe anterior.

# Se añade 'user_id' y 'user_url' a las columnas previas 
dfreviews2 = pd.concat([dfreviews[['user_id', 'user_url']], dfreviews2], axis=1)
dfreviews2.head()

Unnamed: 0,user_id,user_url,0,1,2,3,4,5,6,7,8,9
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,evcentric,http://steamcommunity.com/id/evcentric,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,doctr,http://steamcommunity.com/id/doctr,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,maplemage,http://steamcommunity.com/id/maplemage,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,


In [13]:
# dfreviews2.shape

In [14]:
#Tenemos los diccionarios con cada review ubicadas en columnas. Sin embargo, debemos generar un registro por cada diccionario, pasando de un formato ancho a uno largo mediante la función melt, manteniendo en cada caso el usuario que lo genesra.

dfreviews2 = pd.melt(dfreviews2, id_vars=['user_id', 'user_url'], value_vars=list(range(9)), value_name='reviews')
dfreviews2.head()

Unnamed: 0,user_id,user_url,variable,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,js41637,http://steamcommunity.com/id/js41637,0,"{'funny': '', 'posted': 'Posted June 24, 2014...."
2,evcentric,http://steamcommunity.com/id/evcentric,0,"{'funny': '', 'posted': 'Posted February 3.', ..."
3,doctr,http://steamcommunity.com/id/doctr,0,"{'funny': '', 'posted': 'Posted October 14, 20..."
4,maplemage,http://steamcommunity.com/id/maplemage,0,"{'funny': '3 people found this review funny', ..."


In [15]:
# se puede observar que con la funcion anterior, las columnas 'reviews' pasan a ser listas, pasando de un formato ancho a largo en el cual se tienen los registros de opiniones unicas asociadas a cada usuario. 
dfreviews2[dfreviews2['user_id']=='76561197970982479']

Unnamed: 0,user_id,user_url,variable,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,"{'funny': '', 'posted': 'Posted November 5, 20..."
25799,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1,"{'funny': '', 'posted': 'Posted July 15, 2011...."
51598,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,"{'funny': '', 'posted': 'Posted April 21, 2011..."
77397,76561197970982479,http://steamcommunity.com/profiles/76561197970...,3,
103196,76561197970982479,http://steamcommunity.com/profiles/76561197970...,4,
128995,76561197970982479,http://steamcommunity.com/profiles/76561197970...,5,
154794,76561197970982479,http://steamcommunity.com/profiles/76561197970...,6,
180593,76561197970982479,http://steamcommunity.com/profiles/76561197970...,7,
206392,76561197970982479,http://steamcommunity.com/profiles/76561197970...,8,


In [16]:
# Lo anterior genera valores None en las columnas 'reviews', que se proceden a eliminar
dfreviews2 = dfreviews2.dropna()

# Se verifica que cada 'user_id' tenga asociado solamente sus diccionarios correspondientes
dfreviews2[dfreviews2['user_id']=='76561197970982479']

Unnamed: 0,user_id,user_url,variable,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,"{'funny': '', 'posted': 'Posted November 5, 20..."
25799,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1,"{'funny': '', 'posted': 'Posted July 15, 2011...."
51598,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,"{'funny': '', 'posted': 'Posted April 21, 2011..."


In [17]:
# Ahora podemos convertir cada diccionario de la columna 'reviews' en columnas independientes con las claves como nombres de las columnas. Al desanidar los diccionarios de la columna 'reviews', cada clave o key se convierte en columna y cada value en fila. Esto se realiza mediante la función apply(pd.series) 
dfreviews = dfreviews2['reviews'].apply(pd.Series, dtype='object')
dfreviews = dfreviews.add_prefix('reviews_')
dfreviews

Unnamed: 0,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud
...,...,...,...,...,...,...,...
231291,,"Posted August 15, 2014.","Last edited November 3, 2014.",440,No ratings yet,True,TF2 is alot of fun and its really good but the...
231293,,"Posted August 2, 2014.",,304930,No ratings yet,True,Fun game with friends
231419,,"Posted July 31, 2015.",,265630,No ratings yet,True,So Fun!! :D
231499,,"Posted December 20, 2015.",,304050,No ratings yet,True,"This game is great. The only thing is,Why cant..."


In [18]:
#Por lo anterior, nuevamente se pierde 'user_id' y 'user_url' por lo que se concatena de nuevo.
dfreviews = pd.concat([dfreviews2[['user_id', 'user_url']], dfreviews], axis=1)
dfreviews.head()

Unnamed: 0,user_id,user_url,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [19]:
#Se observan valores faltantes en algunas columnas, probablemente deben tener un espacio ya que no son nulos como tal. 
dfreviews['reviews_last_edited'][0]

''

In [20]:
#Se reemplazan esos espacios por nulos.
dfreviews.replace('', None, inplace=True)
dfreviews.head()

Unnamed: 0,user_id,user_url,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [21]:
# Verificamos como quedan nuestros datos luego de desanidar las columnas
verificar_tipo_dato(dfreviews)

# Se observa un gran % de valores nulos tanto en la columna 'reviews_funny' y 'reviews_last_edited', por lo que se decide eliminarlas. Por otro lado, la columna 'reviews_review' tiene un 5% de valores nulos, pero no se eliminan ya que se tomarán como un comentario neutral

Unnamed: 0,nombre_campo,tipo_dato,%_no_nulos,%_nulos,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,user_url,[<class 'str'>],100.0,0.0,0
2,reviews_funny,"[<class 'NoneType'>, <class 'str'>]",13.76,86.24,49498
3,reviews_posted,[<class 'str'>],100.0,0.0,0
4,reviews_last_edited,"[<class 'NoneType'>, <class 'str'>]",10.28,89.72,51499
5,reviews_item_id,[<class 'str'>],100.0,0.0,0
6,reviews_helpful,[<class 'str'>],100.0,0.0,0
7,reviews_recommend,[<class 'bool'>],100.0,0.0,0
8,reviews_review,"[<class 'str'>, <class 'NoneType'>]",99.95,0.05,30


In [22]:
dfreviews = dfreviews.drop(columns=['reviews_funny', 'reviews_last_edited'])
dfreviews.columns

Index(['user_id', 'user_url', 'reviews_posted', 'reviews_item_id',
       'reviews_helpful', 'reviews_recommend', 'reviews_review'],
      dtype='object')

In [23]:
# TRANSFORMACIÓN 'reviews_posted'

# necesitamos que la fecha sea en formato YYYY-MM-DD, sin embargo se encuentra como una cadena con otro formato. Procesaremos las fechas mediante expresiones regulares para extraer el año, mes y día
dfreviews['reviews_date'] = dfreviews['reviews_posted'].apply(convertir_fecha)
dfreviews['reviews_date']

0               2011-11-05
1               2014-06-24
2         Formato inválido
3               2013-10-14
4               2014-04-15
                ...       
231291          2014-08-15
231293          2014-08-02
231419          2015-07-31
231499          2015-12-20
231501    Formato inválido
Name: reviews_date, Length: 57397, dtype: object

In [24]:
dfreviews[dfreviews['reviews_date'] == 'Formato inválido']

#Hay 9771 registros que presentan un formato diferente al establecido ya que no presentan el año. Los registros no se podrán consultar desde la API, pero las otras columnas pueden aportar información relevante

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review,reviews_date
2,evcentric,http://steamcommunity.com/id/evcentric,Posted February 3.,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...,Formato inválido
6,76561198079601835,http://steamcommunity.com/profiles/76561198079...,Posted May 20.,730,0 of 1 people (0%) found this review helpful,True,ZIKA DO BAILE,Formato inválido
7,MeaTCompany,http://steamcommunity.com/id/MeaTCompany,Posted July 24.,730,No ratings yet,True,BEST GAME IN THE BLOODY WORLD,Formato inválido
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,Posted June 16.,252950,0 of 1 people (0%) found this review helpful,True,love it,Formato inválido
10,76561198077246154,http://steamcommunity.com/profiles/76561198077...,Posted June 11.,440,No ratings yet,True,mt bom,Formato inválido
...,...,...,...,...,...,...,...,...
223569,76561198040184950,http://steamcommunity.com/profiles/76561198040...,Posted April 12.,394690,No ratings yet,True,I cannot say much right now due to the game no...,Formato inválido
226105,76561198046474248,http://steamcommunity.com/profiles/76561198046...,Posted March 28.,234140,No ratings yet,True,"Oh what a day .., What a lovely day to play th...",Formato inválido
228109,dmitry_who,http://steamcommunity.com/id/dmitry_who,Posted May 17.,376210,10 of 28 people (36%) found this review helpful,True,░░░░░░░░░░░█▀▀░░█░░░░░░░░░░░▄▀▀▀▀░░░░░█▄▄░░░░░...,Formato inválido
229231,76561198079507136,http://steamcommunity.com/profiles/76561198079...,Posted January 3.,730,No ratings yet,False,got VACed,Formato inválido


In [25]:
# De esta forma se elimina la columna 'reviews_posted' ya que ya fue transformada y sustituida por la columna 'reviews_date'
dfreviews = dfreviews.drop(labels='reviews_posted', axis=1)

In [26]:
dfreviews.head()

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review,reviews_date
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05
1,js41637,http://steamcommunity.com/id/js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24
2,evcentric,http://steamcommunity.com/id/evcentric,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...,Formato inválido
3,doctr,http://steamcommunity.com/id/doctr,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013-10-14
4,maplemage,http://steamcommunity.com/id/maplemage,211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014-04-15


In [27]:
verificar_tipo_dato(dfreviews)

Unnamed: 0,nombre_campo,tipo_dato,%_no_nulos,%_nulos,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,user_url,[<class 'str'>],100.0,0.0,0
2,reviews_item_id,[<class 'str'>],100.0,0.0,0
3,reviews_helpful,[<class 'str'>],100.0,0.0,0
4,reviews_recommend,[<class 'bool'>],100.0,0.0,0
5,reviews_review,"[<class 'str'>, <class 'NoneType'>]",99.95,0.05,30
6,reviews_date,[<class 'str'>],100.0,0.0,0


In [29]:
# COLUMNA 'reviews_review'
# contiene 5% de valores nulos, por lo que se eliminan
dfreviews = dfreviews.dropna(subset=['reviews_review'])

#se verifican los nulos
verificar_tipo_dato(dfreviews)

Unnamed: 0,nombre_campo,tipo_dato,%_no_nulos,%_nulos,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,user_url,[<class 'str'>],100.0,0.0,0
2,reviews_item_id,[<class 'str'>],100.0,0.0,0
3,reviews_helpful,[<class 'str'>],100.0,0.0,0
4,reviews_recommend,[<class 'bool'>],100.0,0.0,0
5,reviews_review,[<class 'str'>],100.0,0.0,0
6,reviews_date,[<class 'str'>],100.0,0.0,0


In [30]:
#GUARDAR EL DATASET

dfreviews.to_csv (r"C:\\Users\\cquir\\OneDrive\\Escritorio\\Data Science SH\\Proyecto Individual 1\\bases de datos\\dfreviews_limpio.csv", index = None)