In [6]:
# Importing packages here
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import os
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')


In [56]:
# Reading in data
train_file_path = os.path.expanduser('~/data/train.pickle')

with open(train_file_path, 'rb') as file:
    data = pickle.load(file)
   

In [57]:
# Creating pandas dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,320.000 €,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...
1,335.000 €,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,330.000 €,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de..."
3,435.000 €,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,410.000 €,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin..."


### Data Engineering/Processing

In [58]:
def extract_feature(feature_list, position):
    """
    Extracts and returns the numerical feature at the specified position from the given list.

    Parameters:
    - feature_list (list): A list containing entries with numerical values.
    - position (int): The index indicating the position of the desired feature in the list.

    Returns:
    - int or None: The numerical feature at the specified position if successful, None otherwise.

    Raises:
    - IndexError: If the specified position is out of bounds for the feature_list.
    - ValueError: If the value at the specified position cannot be converted to an integer.

    Example:
    >>> extract_feature(['10 apples', '20 oranges', '30 bananas'], 1)
    20
    """    
    try:
        return int(feature_list[position].split(' ')[0])
    except (IndexError, ValueError):
        print("Problematic entry:", feature_list)
        return None

In [59]:
# Fixing errors, processing data for model building
df['price'] = df['price'].replace('[\€,]', '', regex=True).astype(float)
df['size_m2'] = df['features'].apply(lambda x: extract_feature(x, 0))
df['num_rooms'] = df['features'].apply(lambda x: extract_feature(x, 1))
df['num_bathrooms'] = df['features'].apply(lambda x: extract_feature(x, 2))

df[['price', 'size_m2', 'num_rooms', 'num_bathrooms']].head()

Problematic entry: ['52 m2', '3.442 €/m2']
Problematic entry: ['47 m2', '1 hab.', '6.051 €/m2']
Problematic entry: ['45 m2', '1 baño', '5.689 €/m2']
Problematic entry: ['113 m2', '4 baños', '2.389 €/m2']
Problematic entry: ['93 m2', '1 baño', '3.215 €/m2']
Problematic entry: ['63 m2', '2 hab.', '4.524 €/m2']
Problematic entry: ['54 m2', '1 baño', '6.944 €/m2']
Problematic entry: ['93 m2', '1 baño', '2.151 €/m2']
Problematic entry: ['102 m2', '1 baño', '3.431 €/m2']
Problematic entry: ['52 m2', '3.442 €/m2']
Problematic entry: ['100 m2', '2 hab.', '5.000 €/m2']


Unnamed: 0,price,size_m2,num_rooms,num_bathrooms
0,320.0,85,2.0,1.0
1,335.0,65,2.0,1.0
2,330.0,77,2.0,1.0
3,435.0,96,3.0,2.0
4,410.0,84,2.0,1.0


In [60]:
def extract_rooms_bathrooms(features, room_keyword='hab.', bath_keyword='baño'):
    """
    Extracts the number of rooms and bathrooms from a list of features based on specified keywords.

    Parameters:
    - features (list): A list containing textual features that may include information about rooms and bathrooms.
    - room_keyword (str, optional): The keyword used to identify entries related to the number of rooms. Default is 'hab.'.
    - bath_keyword (str, optional): The keyword used to identify entries related to the number of bathrooms. Default is 'baño'.

    Returns:
    - tuple: A tuple containing the extracted number of rooms and number of bathrooms, in that order.
      If information is not found, the corresponding value in the tuple will be None.
    """    
    num_rooms, num_bathrooms = None, None

    if features:
        for feature in features:
            if room_keyword in feature:
                num_rooms = int(feature.split(' ')[0])
            elif bath_keyword in feature:
                num_bathrooms = int(feature.split(' ')[0])

    return num_rooms, num_bathrooms

df['num_rooms'], df['num_bathrooms'] = zip(*df['features'].apply(extract_rooms_bathrooms))
df[['price', 'size_m2', 'num_rooms', 'num_bathrooms']].head()

Unnamed: 0,price,size_m2,num_rooms,num_bathrooms
0,320.0,85,2.0,1.0
1,335.0,65,2.0,1.0
2,330.0,77,2.0,1.0
3,435.0,96,3.0,2.0
4,410.0,84,2.0,1.0


In [61]:
from sklearn.preprocessing import OneHotEncoder
# One Hot Encoding categorical features
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(df[['type', 'subtype', 'selltype']])

categories = encoder.categories_
feature_names = []
for cat, labels in zip(['type', 'subtype', 'selltype'], categories):
    for label in labels:
        feature_names.append(f"{cat}_{label}")

encoded_df = pd.DataFrame(encoded_categorical, columns=feature_names)

df_combined = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)

# Iterate over each column in encoded_df and print unique values with their counts
for column in encoded_df.columns:
    print(f"{column}:\n{encoded_df[column].value_counts()}\n")


type_APARTMENT:
type_APARTMENT
0.0    847
1.0     19
Name: count, dtype: int64

type_DUPLEX:
type_DUPLEX
0.0    855
1.0     11
Name: count, dtype: int64

type_FLAT:
type_FLAT
1.0    806
0.0     60
Name: count, dtype: int64

type_GROUND_FLOOR:
type_GROUND_FLOOR
0.0    859
1.0      7
Name: count, dtype: int64

type_LOFT:
type_LOFT
0.0    863
1.0      3
Name: count, dtype: int64

type_PENTHOUSE:
type_PENTHOUSE
0.0    852
1.0     14
Name: count, dtype: int64

type_STUDIO:
type_STUDIO
0.0    860
1.0      6
Name: count, dtype: int64

subtype_APARTMENT:
subtype_APARTMENT
0.0    847
1.0     19
Name: count, dtype: int64

subtype_DUPLEX:
subtype_DUPLEX
0.0    855
1.0     11
Name: count, dtype: int64

subtype_FLAT:
subtype_FLAT
1.0    806
0.0     60
Name: count, dtype: int64

subtype_GROUND_FLOOR:
subtype_GROUND_FLOOR
0.0    859
1.0      7
Name: count, dtype: int64

subtype_LOFT:
subtype_LOFT
0.0    863
1.0      3
Name: count, dtype: int64

subtype_PENTHOUSE:
subtype_PENTHOUSE
0.0    852
1.0     

In [62]:
import re
import pandas as pd
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Preprocesses a given text by converting it to lowercase, removing non-alphabetic characters,
    replacing newlines with spaces, and stripping leading/trailing whitespaces.

    Parameters:
    - text (str): The input text to be preprocessed.

    Returns:
    - str: The preprocessed text.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]|\n', ' ', text)
    text = re.sub(r'\s+\n', ' ', text).strip()
    return text

def tokenize_and_lemmatize(text):
    """
    Tokenizes and lemmatizes the given text using spaCy.

    Parameters:
    - text (str): The input text to be tokenized and lemmatized.

    Returns:
    - str: The lemmatized text.
    """
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text

df['combined_text'] = df[['title', 'loc_string', 'loc', 'desc']].fillna('').agg(' '.join, axis=1)
df['combined_text'] = df['combined_text'].apply(preprocess_text)
df['combined_text'] = df['combined_text'].apply(tokenize_and_lemmatize)

all_words = ' '.join(df['combined_text']).split()
word_counts = Counter(all_words)

excluded_words = {'de', 'y', 'en', 'la', 'con', 'las', 'por', 'a', 'al', 'algo', 'alguno',
                  'aqui', 'asi', 'como', 'cual', 'da', 'del', 'e', 'el', 'era', 'es', 'ese', 'esta',
                  'este', 'esto', 'ha', 'hay', 'he', 'les', 'lo', 'o', 'que', 'se', 'si', 'son', 'un',
                  'ya'}

filtered_word_counts = {word: count for word, count in word_counts.items() if word not in excluded_words and len(word) > 2}
top_words = [word for word, count in Counter(filtered_word_counts).most_common(100)]

vectorizer = CountVectorizer(vocabulary=top_words)

text_bow = vectorizer.fit_transform(df['combined_text'])

text_bow_df = pd.DataFrame(text_bow.toarray(), columns=vectorizer.get_feature_names_out())

df_combined = pd.concat([df_combined.drop(columns=excluded_words, errors='ignore').reset_index(drop=True), text_bow_df], axis=1)

df_combined.head()

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc,size_m2,...,individual,tambi,dos,carpinter,ubicado,suelos,situado,parque,orientaci,natural
0,320.0,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...,85,...,0,0,0,0,0,1,0,0,0,0
1,335.0,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d...",65,...,1,0,0,1,1,2,0,0,1,1
2,330.0,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de...",77,...,0,0,1,0,0,0,0,0,0,1
3,435.0,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl...",96,...,0,0,0,1,0,0,0,0,0,0
4,410.0,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin...",84,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# Making features of luxury amenities in an apartment

df_combined['pool']=df['desc'].str.contains('piscina', case=False)
df_combined['elevator']=df['desc'].str.contains('ascensor', case=False)
df_combined['balcony']=df['desc'].str.contains('balcón', case=False)

In [64]:
# Dropping irrelevant features
df_combined=df_combined.drop(columns=['loc', 'title', 'desc', 'subtype', 'selltype', 'type'])

In [65]:
# Dropping null values
df_combined.columns[df_combined.isna().any()].tolist()

['num_rooms', 'num_bathrooms']

In [66]:
# Filling null values with means
df_combined['size_m2']=df_combined['size_m2'].fillna(df_combined['size_m2'].mean())
df_combined['num_rooms']=df_combined['num_rooms'].fillna(df_combined['num_rooms'].mean())
df_combined['num_bathrooms']=df_combined['num_bathrooms'].fillna(df_combined['num_bathrooms'].mean())


In [67]:
# Extracting features and response variable
y = df_combined['price']
X = df_combined.drop(columns=['price'])

In [68]:
X = X.drop(columns=['loc_string', 'features'])

X = X.drop(columns=['subtype_DUPLEX', 'subtype_LOFT', 'type_DUPLEX', 'type_LOFT'])

In [69]:
numerical_columns = df_combined.select_dtypes(include='number')
numerical_columns 

Unnamed: 0,price,size_m2,num_rooms,num_bathrooms,type_APARTMENT,type_DUPLEX,type_FLAT,type_GROUND_FLOOR,type_LOFT,type_PENTHOUSE,...,individual,tambi,dos,carpinter,ubicado,suelos,situado,parque,orientaci,natural
0,320.0,85,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,335.0,65,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,1,1,2,0,0,1,1
2,330.0,77,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1
3,435.0,96,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,410.0,84,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,342.0,115,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,2
862,315.0,82,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,1,0,0,1,0,0,0
863,360.0,79,4.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,1,3,0,0,0,0,0,0,3
864,270.0,63,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,1,1,1,1,0


# Model 1: R-squared.

In [70]:
from sklearn.linear_model import LinearRegression
# Building a linear regression model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [71]:
from sklearn import metrics
# r-squared value:
metrics.r2_score(y_test, y_pred)

0.4536320956869402

## Model 1: Parameter Search

In [72]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import numpy as np


best_r2_mean = 0
best_alpha = 0

alphas = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]

for alpha in alphas:
    model = Ridge(alpha=alpha)
    # Perform 10-fold cross-validation and calculate the mean R^2 score for each fold
    r2_scores = cross_val_score(model, X, y, cv=10, scoring='r2')
    r2_mean = np.mean(r2_scores)
    if best_r2_mean < r2_mean:
        best_r2_mean = r2_mean
        best_alpha = alpha

print(best_r2_mean, best_alpha)

0.44461535740110325 100


In [73]:
model = Ridge(alpha=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
metrics.r2_score(y_test, y_pred)

0.5130789214875837

# Model 2: R-squared

In [98]:
import xgboost as xgb
from sklearn import metrics

odel = xgb.XGBRegressor(booster='gbtree',
                                     objective='reg:squarederror',
                                     learning_rate=0.1,
                                     alpha=7,
                                     n_estimators=70)

odel.fit(X_train, y_train)
y_pred = odel.predict(X_test)
r2 = metrics.r2_score(y_test, y_pred)
r2

0.6128970267815621

## Model 2: Parameter Search

In [27]:
from sklearn.model_selection import cross_val_score, KFold
import xgboost as xgb
import numpy as np

# Define the 10-fold cross-validation split
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_r2_mean = 0
best_hyperparams = None

learning_rates = [1e-2, 1e-1, 1]
alphas = [x for x in [7, 8, 9, 10]]
n_estimators_list = [x for x in [50, 60, 65, 70]]

for learning_rate in learning_rates:
    for alpha in alphas:
        for n_estimators in n_estimators_list:
            model = xgb.XGBRegressor(booster='gbtree',
                                     objective='reg:squarederror',
                                     learning_rate=learning_rate,
                                     alpha=alpha,
                                     n_estimators=n_estimators)
            
            # Compute the average R² score over all 10 folds
            r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
            r2_mean = np.mean(r2_scores)
            
            if r2_mean > best_r2_mean:
                best_r2_mean = r2_mean
                best_hyperparams = (learning_rate, alpha, n_estimators)

print(f"Best average R-Squared is {best_r2_mean} and the corresponding hyper-parameters are {best_hyperparams}")


Best average R-Squared is 0.5936617022433459 and the corresponding hyper-parameters are (0.1, 7, 70)


# Model3: R

In [74]:
from sklearn.ensemble import RandomForestRegressor
# Building a random forest regression model
regr = RandomForestRegressor(n_estimators=300, max_depth=20,ccp_alpha = 0.1, random_state=0)
regr.fit(X,y)
preds_rf = regr.predict(X_test)
metrics.r2_score(y_test, preds_rf)

0.9530985416133207

## Model3: Parameter Search

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Define the k-fold cross-validation setup
kf = KFold(n_splits=10, shuffle=True, random_state=42)

max_r2_mean = 0
max_comb = None

for n_estimators in [200, 250, 300]:
    for max_depth in [15, 17, 19, 20]:
        for ccp_alpha in (0, 0.01, 0.1):
            regr = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, ccp_alpha=ccp_alpha, random_state=0)
            
            # Compute the average R² score over all 10 folds
            r2_scores = cross_val_score(regr, X, y, cv=kf, scoring='r2')
            r2_mean = np.mean(r2_scores)
            
            if r2_mean > max_r2_mean:
                max_r2_mean = r2_mean
                max_comb = (n_estimators, max_depth, ccp_alpha)

print(f"Best average R-Squared is {max_r2_mean} with hyperparameters {max_comb}")


Best average R-Squared is 0.590696909280854 with hyperparameters (300, 20, 0.1)


# Final Model: R-squared

In [32]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 1]
}

xgrf = xgb.XGBRFRegressor(random_state=0)

# Setup the grid search with 10-fold cross-validation
grid_search = GridSearchCV(estimator=xgrf, param_grid=param_grid, cv=10, scoring='r2', verbose=2, n_jobs=-1)
grid_search.fit(X, y)
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_parameters}")
print(f"Best R^2 Score: {best_score}")

Fitting 10 folds for each of 27 candidates, totalling 270 fits


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

Best Parameters: {'learning_rate': 1, 'max_depth': 30, 'n_estimators': 200}
Best R^2 Score: 0.5173577140481849


In [76]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
xgrf = xgb.XGBRFRegressor(n_estimators= 200, max_depth = 30, random_state=0, learning_rate=1).fit(X, y)
preds_rfxg = xgrf.predict(X_test)
metrics.r2_score(y_test, preds_rfxg)

0.9848802257353994

### Predictions

In [87]:
train_file_path = os.path.expanduser('~/data/test_kaggle.pickle')

with open(train_file_path, 'rb') as file:
    data = pickle.load(file)
x_test = pd.DataFrame(data)


In [88]:
x_test['desc'][1]

'¡Un gran piso a reformar es una gran oportunidad!\n\nIdeal también para inversión, para alquilar, para compartir o para tu vivienda habitual: ¡Imagina hacer realidad en tu hogar todas esas ideas que te encantan en las revistas y en programas de televisión!\n\nLa propiedad de 78 m2 construidos (según catastro), en segunda planta, tiene 4 habitaciones (3 dobles y 1 individual) y ¡muchas posibilidades!\n\nPodrás unir la cocina al salón, hacer una isla, o una cocina integral tipo Pantry…Podrás tener una mega-habitación-suite con vestidor, un walking closet de película… Un despacho en dónde va a caber todo aquello que necesitas para tenerlo todo en orden… Una habitación infantil preciosa … una sala para mirar tus series favoritas en un sofá king size…. ¡lo que se te ocurra!\n\nAunque no tiene terraza ni balcón, como tiene vistas al parque, tomarte un café rodeado de árboles hará sin duda, que tu día a día sea más relajado.\n\nTe encantará la luminosidad en todas las estancias. Todo el piso

In [89]:
#x_test['price'] = x_test['price'].replace('[\€,]', '', regex=True).astype(float)
x_test['size_m2'] = x_test['features'].apply(lambda x: extract_feature(x, 0))
x_test['num_rooms'] = x_test['features'].apply(lambda x: extract_feature(x, 1))
x_test['num_bathrooms'] = x_test['features'].apply(lambda x: extract_feature(x, 2))
x_test['num_rooms'], x_test['num_bathrooms'] = zip(*x_test['features'].apply(extract_rooms_bathrooms))
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(x_test[['type', 'subtype', 'selltype']])

categories = encoder.categories_
feature_names = []
for cat, labels in zip(['type', 'subtype', 'selltype'], categories):
    for label in labels:
        feature_names.append(f"{cat}_{label}")

encoded_df = pd.DataFrame(encoded_categorical, columns=feature_names)

x_test_combined = pd.concat([x_test.reset_index(drop=True), encoded_df], axis=1)


encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(x_test[['type', 'subtype', 'selltype']])
categories = encoder.categories_
feature_names = [f"{cat}_{label}" for cat, labels in zip(['type', 'subtype', 'selltype'], categories) for label in labels]
encoded_df = pd.DataFrame(encoded_categorical, columns=feature_names)


x_test['combined_text'] = x_test[['title', 'loc_string', 'loc', 'desc']].fillna('').agg(' '.join, axis=1)
x_test['combined_text'] = x_test['combined_text'].apply(preprocess_text)
x_test['combined_text'] = x_test['combined_text'].apply(tokenize_and_lemmatize)

text_bow_test = vectorizer.transform(x_test['combined_text'])
text_bow_test_df = pd.DataFrame(text_bow_test.toarray(), columns=vectorizer.get_feature_names_out())

encoded_categorical_test = encoder.transform(x_test[['type', 'subtype', 'selltype']])
encoded_test_df = pd.DataFrame(encoded_categorical_test, columns=feature_names)

x_test_combined = pd.concat([x_test.reset_index(drop=True), encoded_test_df, text_bow_test_df], axis=1)


In [90]:
x_test_combined=x_test_combined.drop(columns=['loc_string', 'features'])
x_test_combined['pool']=x_test['desc'].str.contains('piscina', case=False)
x_test_combined['elevator']=x_test['desc'].str.contains('ascensor', case=False)
x_test_combined['balcony']=x_test['desc'].str.contains('balcón', case=False)
x_test_combined=x_test_combined.drop(columns=['loc', 'title', 'desc', 'subtype', 'selltype', 'type'])


In [91]:
x_test_combined = x_test_combined.drop(columns = ['description', 'id', 'combined_text'])

In [92]:
x_test_combined.columns[x_test_combined.isna().any()].tolist()
x_test_combined['size_m2']=x_test_combined['size_m2'].fillna(x_test_combined['size_m2'].mean())
x_test_combined['num_rooms']=x_test_combined['num_rooms'].fillna(x_test_combined['num_rooms'].mean())
x_test_combined['num_bathrooms']=x_test_combined['num_bathrooms'].fillna(x_test_combined['num_bathrooms'].mean())

columns_to_drop = [col for col in x_test_combined.columns if '\nVer mapa' in col]
x_test_combined = x_test_combined.drop(columns=columns_to_drop)

In [94]:
x_test_combined.head()

Unnamed: 0,size_m2,num_rooms,num_bathrooms,type_APARTMENT,type_FLAT,type_GROUND_FLOOR,type_PENTHOUSE,type_STUDIO,subtype_APARTMENT,subtype_FLAT,...,carpinter,ubicado,suelos,situado,parque,orientaci,natural,pool,elevator,balcony
0,87,4,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,False,True,True
1,78,4,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,1,2,0,0,False,True,True
2,65,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,1,0,0,0,0,0,False,True,True
3,88,3,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,False,False,False
4,82,2,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,False,True,True


In [99]:
# Predictions on test data
y_preds1 = model.predict(x_test_combined)
y_preds2 = odel.predict(x_test_combined)
y_preds3 = regr.predict(x_test_combined)
y_preds4 = xgrf.predict(x_test_combined)
y_preds =  y_preds1+ y_preds2+ y_preds4+y_preds3

y_preds_df = pd.DataFrame({'price': y_preds})
y_preds_df.index.name = 'id'
y_preds_df
y_preds_df.to_csv('solution.csv')