In [16]:
import pandas as pd
import numpy as np

In [17]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
from sklearn.model_selection import train_test_split

In [18]:
# importing the test and train csv files
train_data = pd.read_csv("Data/train.csv")
test_data = pd.read_csv("Data/test.csv")
x = train_data.drop("price", axis=1)
y= train_data['price']
x1= test_data

In [19]:
# filling missing values
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# filling categorical values with missing and numerical value with mean
cat_imputer = SimpleImputer(strategy='constant', fill_value="missing")
num_imputer = SimpleImputer(fill_value="mean")
num2_imputer = SimpleImputer(fill_value = "mean")

#define columns
cat_features = ['seller_type','layout_type','property_type','locality','furnish_type','bathroom']
num_features = ['area']
num2_features = ['bedroom']

# CREATE an imputer (something that fill missing data)
imputer = ColumnTransformer([('cat_imputer', cat_imputer, cat_features),('num_imputer',num_imputer,num_features),('num2_imputer',num2_imputer,num2_features)])

# Transform the data 
filled_x = imputer.fit_transform(x)
filled_x

filled_x1 = imputer.fit_transform(x1)
filled_x1


array([['AGENT', 'BHK', 'Independent Floor', ..., '3 bathrooms', 1900.0,
        3.0],
       ['AGENT', 'BHK', 'Independent Floor', ..., '5 bathrooms', 6521.0,
        5.0],
       ['OWNER', 'BHK', 'Independent House', ..., '1 bathrooms', 450.0,
        1.0],
       ...,
       ['AGENT', 'BHK', 'Independent House', ..., '3 bathrooms', 1700.0,
        3.0],
       ['AGENT', 'BHK', 'Independent Floor', ..., '4 bathrooms', 3251.0,
        4.0],
       ['AGENT', 'BHK', 'Independent Floor', ..., '2 bathrooms', 720.0,
        2.0]], dtype=object)

In [20]:
# Making the dataframe after filling values in x and x1
train_data_filled = pd.DataFrame(filled_x, columns=['seller_type','layout_type','property_type','locality','furnish_type','bathroom','area','bedroom'])
test_data_filled = pd.DataFrame(filled_x1, columns=['seller_type','layout_type','property_type','locality','furnish_type','bathroom','area','bedroom'])
test_data_filled.head()

Unnamed: 0,seller_type,layout_type,property_type,locality,furnish_type,bathroom,area,bedroom
0,AGENT,BHK,Independent Floor,Safdarjung Enclave,Semi-Furnished,3 bathrooms,1900.0,3.0
1,AGENT,BHK,Independent Floor,Greater Kailash II,Unfurnished,5 bathrooms,6521.0,5.0
2,OWNER,BHK,Independent House,Jhil Mil Colony,Unfurnished,1 bathrooms,450.0,1.0
3,AGENT,BHK,Independent Floor,Greater Kailash,Furnished,1 bathrooms,1000.0,1.0
4,AGENT,BHK,Apartment,Sector 10 Dwarka,Semi-Furnished,2 bathrooms,1600.0,3.0


In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Combining train and test data so we can encode it using one hot encoder so no feature error will occur
combined_data = pd.concat([train_data_filled, test_data_filled], ignore_index=True)

# Define categorical features
categorical_features = ["seller_type", "layout_type", "property_type", "locality", "furnish_type", "bathroom"]

# Initialize OneHotEncoder
one_hot = OneHotEncoder()

# Initialize ColumnTransformer
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Fit and transform the combined data
transformed_combined_data = transformer.fit_transform(combined_data)

# Split the transformed data back into train and test sets
transformed_train_data = transformed_combined_data[:len(train_data_filled)]
transformed_test_data = transformed_combined_data[len(train_data_filled):]

# Print the transformed data for reference
print("Transformed Train Data:")
print(transformed_train_data)

print("\nTransformed Test Data:")
print(transformed_test_data)


Transformed Train Data:
  (0, 0)	1.0
  (0, 4)	1.0
  (0, 8)	1.0
  (0, 95)	1.0
  (0, 679)	1.0
  (0, 681)	1.0
  (0, 712)	500.0
  (0, 713)	1.0
  (1, 0)	1.0
  (1, 4)	1.0
  (1, 7)	1.0
  (1, 627)	1.0
  (1, 677)	1.0
  (1, 688)	1.0
  (1, 712)	581.0
  (1, 713)	1.0
  (2, 0)	1.0
  (2, 4)	1.0
  (2, 8)	1.0
  (2, 302)	1.0
  (2, 677)	1.0
  (2, 681)	1.0
  (2, 712)	500.0
  (2, 713)	1.0
  (3, 0)	1.0
  :	:
  (29696, 713)	4.0
  (29697, 0)	1.0
  (29697, 4)	1.0
  (29697, 7)	1.0
  (29697, 471)	1.0
  (29697, 678)	1.0
  (29697, 688)	1.0
  (29697, 712)	1500.0
  (29697, 713)	3.0
  (29698, 0)	1.0
  (29698, 4)	1.0
  (29698, 8)	1.0
  (29698, 630)	1.0
  (29698, 678)	1.0
  (29698, 696)	1.0
  (29698, 712)	5400.0
  (29698, 713)	4.0
  (29699, 0)	1.0
  (29699, 4)	1.0
  (29699, 9)	1.0
  (29699, 460)	1.0
  (29699, 678)	1.0
  (29699, 688)	1.0
  (29699, 712)	1150.0
  (29699, 713)	2.0

Transformed Test Data:
  (0, 0)	1.0
  (0, 4)	1.0
  (0, 8)	1.0
  (0, 453)	1.0
  (0, 678)	1.0
  (0, 692)	1.0
  (0, 712)	1900.0
  (0, 713)	3.0
  (

In [22]:
np.random.seed(42)

# Splitting the transformed_train_data to analyse the accuracy of the model
x_train, x_test, y_train, y_test = train_test_split(transformed_train_data, y, test_size=0.2)

# TRAINING THE MODEL
model.fit(x_train, y_train)

# score of the model
model.score(x_test, y_test)

0.9020297058404751

In [23]:
# Make predictions
predictions = model.predict(transformed_test_data)

# Create a DataFrame with the predictions
output_df = pd.DataFrame({'Predictions': predictions})

# Save the DataFrame to a CSV file
output_df.to_csv('predictions.csv', index=False)