## Importing the necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import re

## Load the train and test datasets

In [2]:
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")

## Data Preprocessing

### Combine train and test datasets for preprocessing

In [3]:
combined_df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
combined_df.fillna(method='ffill', inplace=True) # Forward fill missing values

  combined_df.fillna(method='ffill', inplace=True) # Forward fill missing values


In [5]:
def extract_numeric_sqft(x):
    if isinstance(x, str):
        numeric_part = re.search(r'\d+(\.\d+)?', x)
        if numeric_part:
            return float(numeric_part.group())
    return float('nan')

In [6]:
combined_df['total_sqft'] = combined_df['total_sqft'].apply(extract_numeric_sqft)

In [7]:
categorical_cols = ['area_type', 'availability', 'location', 'size', 'society']
numerical_cols = ['total_sqft', 'bath', 'balcony']

In [8]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Standardize numerical features
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [10]:
X_train = combined_df.drop(columns=['price'])  # Features
y_train = combined_df['price']  # Target variable

In [11]:
model = RandomForestRegressor(random_state=42)

In [12]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [13]:
pipeline.fit(X_train, y_train)

In [22]:
X_test = test_df.copy()  # Extract test data
X_test['total_sqft'] = X_test['total_sqft'].apply(extract_numeric_sqft)

In [15]:
predictions = pipeline.predict(X_test)

In [16]:
predictions_df = pd.DataFrame({'Actual_Price': test_df['price'], 'Predicted_Price': predictions})

In [None]:
# predictions = pipeline.predict(X_test)

In [None]:
# predictions_df = pd.DataFrame({'Actual_Price': test_df['price'], 'Predicted_Price': predictions})

In [17]:
predictions_df.to_csv('predictions.csv', index=False)

In [18]:
print(predictions)

[ 38.42151667 158.43       101.28       ...  29.4438      29.97096667
  28.88783333]


In [19]:
print(test_df['price'])

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
1475   NaN
1476   NaN
1477   NaN
1478   NaN
1479   NaN
Name: price, Length: 1480, dtype: float64


In [1]:
predictions_df = pd.DataFrame({'price': predictions})

NameError: name 'pd' is not defined