# Análisis y limpieza de los datos

Importamos el train.csv

In [251]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [252]:
train_csv=pd.read_csv("../data/train.csv")

Primero vamos a analizar las columnas no numéricas a ver cuales podemos eliminar

In [253]:
non_numeric = train_csv.select_dtypes(include=object)
non_numeric.shape

(4167, 33)

Las primeras 7 columnas no nos dan información relevante por lo que las podemos quitar


In [254]:
columns_to_drop1=list(non_numeric.columns[0:8])

In [255]:
#Eliminamos columnas de texto no relevante (7 columnas)

train_mod=train_csv.drop(columns=columns_to_drop1,axis=1)

In [256]:
columns_to_drop2=list(train_mod.columns[0:3])

otras_columnas=["host_about","host_thumbnail_url", "host_picture_url", "calendar_last_scraped","license","neighbourhood_group_cleansed","bathrooms","calendar_updated"]

for i in otras_columnas:
    columns_to_drop2.append(i)


In [257]:
#Eliminamos columnas de texto no relevante  (columns_to_drop2)

train_mod=train_mod.drop(columns=columns_to_drop2,axis=1)

Como tenemos todos los valores para neighbourhood_cleansed, podemos elimnar la columna neighbourhood que tiene valores nulos

In [258]:
train_mod=train_mod.drop("neighbourhood",axis=1)

Nos encargamos de los nulos

In [259]:
# rellenamos con unknown

train_mod.host_response_time.fillna("unknown",inplace=True)
train_mod.host_neighbourhood.fillna("unknown",inplace=True)
train_mod.host_location.fillna("unknown",inplace=True)
train_mod.bathrooms_text.fillna("unknown",inplace=True)

In [260]:
#Para host_response_rate y host_acceptance_rate queremos 
#rellenar los NAN con la media, pero primero tenemos que quitar los signos "%" y pasarlo a float

train_mod.host_response_rate=train_mod.host_response_rate.str.replace("%","")
train_mod.host_acceptance_rate=train_mod.host_acceptance_rate.str.replace("%","")

train_mod.host_response_rate.fillna("0",inplace=True)
train_mod.host_acceptance_rate.fillna("0",inplace=True)

train_mod.host_response_rate.astype(dtype="int8")
train_mod.host_acceptance_rate.astype(dtype="int8")


train_mod.host_response_rate.fillna(train_mod.host_response_rate.mean(),inplace=True)
train_mod.host_acceptance_rate.fillna(train_mod.host_response_rate.mean(), inplace=True)



In [261]:
train_mod.head(2)

Unnamed: 0,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2012-07-24,"Amsterdam, North Holland, The Netherlands",within a few hours,100,75,f,unknown,13,13,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,Noord-Oost,52.39508,4.99186,Private room in farm stay,Private room,3,1.5 shared baths,1.0,,"[""First aid kit"", ""Free parking on premises"", ...",87.0,2,1125,2.0,2.0,1125.0,1125.0,2.0,1125.0,t,0,0,0,249,81,0,0,2014-08-03,2019-06-15,4.62,4.56,4.29,4.57,4.75,4.69,4.49,f,10,0,10,0,0.91
1,2015-06-01,"Amsterdam, Noord-Holland, Netherlands",unknown,0,0,f,Weesperbuurt en Plantage,0,0,"['email', 'phone', 'reviews']",t,f,Centrum-Oost,52.36371,4.90745,Entire rental unit,Entire home/apt,4,1 bath,2.0,2.0,"[""Hot water kettle"", ""Pocket wifi"", ""Stove"", ""...",250.0,2,1125,2.0,2.0,1125.0,1125.0,2.0,1125.0,t,3,3,3,3,9,4,1,2020-01-26,2021-09-05,5.0,5.0,5.0,4.89,5.0,5.0,4.89,f,1,1,0,0,0.4


### Columnas Reviews

Vamos a probar poniendo valor 0 a aquellas que no tienen review. Aunque esto no es cierto ya que parece que le estamos poniendo una puntuación de 0, la gente se guia mucho por estas review por lo que no tener afectará negativamente al preci (en teoria). Más adelante probaremos otras metodologías para ver con que asumciones es más preciso nuestro modelo. 

In [262]:
train_mod.first_review.fillna("0",inplace=True)               
train_mod.last_review.fillna("0",inplace=True)                         
train_mod.review_scores_rating.fillna("0",inplace=True)                 
train_mod.review_scores_accuracy.fillna("0",inplace=True)              
train_mod.review_scores_cleanliness.fillna("0",inplace=True)            
train_mod.review_scores_checkin.fillna("0",inplace=True)               
train_mod.review_scores_communication.fillna("0",inplace=True)          
train_mod.review_scores_location.fillna("0",inplace=True)               
train_mod.review_scores_value.fillna("0",inplace=True)                  
train_mod.reviews_per_month.fillna("0",inplace=True)      

In [263]:
nan_cols=train_mod.isna().sum()
nan_cols[nan_cols>0]

bedrooms                  253
beds                      174
minimum_minimum_nights      1
maximum_minimum_nights      1
minimum_maximum_nights      1
maximum_maximum_nights      1
minimum_nights_avg_ntm      1
maximum_nights_avg_ntm      1
dtype: int64

### Columnas Bedrooms y beds

Para estas dos columnas, vamos a probar primero poniendo 0 a los valores nulos.

In [264]:
train_mod.bedrooms.fillna(train_mod.bedrooms.mean(),inplace=True)
train_mod.beds.fillna(train_mod.beds.mean(),inplace=True)

Para los nulos en las columnas minumum/maximum nights como solo hay uno, quitamos esa fila ya que todos están en la misma

In [265]:
train_mod.dropna(axis=0,inplace=True)

In [271]:
train_mod=train_mod.drop(columns=["host_since","first_review"],axis=1)

In [272]:
train_mod.head(2)


Unnamed: 0,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,"Amsterdam, North Holland, The Netherlands",within a few hours,100,75,f,unknown,13,13,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,Noord-Oost,52.39508,4.99186,Private room in farm stay,Private room,3,1.5 shared baths,1.0,1.956424,"[""First aid kit"", ""Free parking on premises"", ...",87.0,2,1125,2.0,2.0,1125.0,1125.0,2.0,1125.0,t,0,0,0,249,81,0,0,2019-06-15,4.62,4.56,4.29,4.57,4.75,4.69,4.49,f,10,0,10,0,0.91
1,"Amsterdam, Noord-Holland, Netherlands",unknown,0,0,f,Weesperbuurt en Plantage,0,0,"['email', 'phone', 'reviews']",t,f,Centrum-Oost,52.36371,4.90745,Entire rental unit,Entire home/apt,4,1 bath,2.0,2.0,"[""Hot water kettle"", ""Pocket wifi"", ""Stove"", ""...",250.0,2,1125,2.0,2.0,1125.0,1125.0,2.0,1125.0,t,3,3,3,3,9,4,1,2021-09-05,5.0,5.0,5.0,4.89,5.0,5.0,4.89,f,1,1,0,0,0.4


In [270]:
train_mod.host_verifications.unique()

array(["['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']",
       "['email', 'phone', 'reviews']", "['email', 'phone']",
       "['email', 'phone', 'facebook']",
       "['email', 'phone', 'reviews', 'manual_offline', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']",
       "['email', 'phone', 'facebook', 'reviews', 'jumio']",
       "['email', 'phone', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']",
       "['email', 'phone', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']",
       "['email', 'phone', 'reviews', 'jumio', 'work_email']",
       "['email', 'phone', 'facebook', 'reviews', 'jumio', 'work_email']",
       "['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'government_id']",
       "['email', 'phone', 'facebook', 'reviews', 'jumio', 'offline_government_id', 'government_id', 'work_email']",
       "['email', 'phon

In [None]:
split_host_verifications=pd.DataFrame(train_mod.host_verifications.tolist(),columns=[])

In [267]:
# new df from the column of lists
split_df = pd.DataFrame(df['Values'].tolist(), columns=['v1', 'v2', 'v3'])
# concat df and split_df
df = pd.concat([df, split_df], axis=1)
# display df
df

0       ['email', 'phone', 'reviews', 'jumio', 'offlin...
1                           ['email', 'phone', 'reviews']
2                                      ['email', 'phone']
3                          ['email', 'phone', 'facebook']
4       ['email', 'phone', 'reviews', 'manual_offline'...
                              ...                        
4162    ['email', 'phone', 'facebook', 'reviews', 'jum...
4163    ['email', 'phone', 'reviews', 'jumio', 'work_e...
4164    ['email', 'phone', 'reviews', 'jumio', 'offlin...
4165                                   ['email', 'phone']
4166    ['email', 'phone', 'reviews', 'jumio', 'offlin...
Name: host_verifications, Length: 4166, dtype: object