In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the dataset from 'ap_data.csv' into `df` and preview its first rows.
df = pd.read_csv('ap_data.csv')
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Impute missing 'region' values with the column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the df['region'] selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
region_mean = df['region'].mean()
df['region'] = df['region'].fillna(region_mean)

In [None]:
# Impute missing 'ceiling' values with the column mean rounded to 2 decimals.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the df['ceiling'] selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
ceiling_mean = round(df['ceiling'].mean(), 2)
df['ceiling'] = df['ceiling'].fillna(ceiling_mean)

In [None]:
# Show the frame's dimensions, then the per-column missing-value counts.
print(df.shape)
df.isna().sum()

In [None]:
# Discard every row that still contains a missing value, then confirm the
# new dimensions and that no NaNs remain.
df = df.dropna()
print(df.shape)
df.isna().sum()

In [None]:
# List the column labels currently present in the frame.
df.columns

In [None]:
# Categorical columns to expand into indicator (dummy) columns.
columns_to_encode = [
    'flat_toilet',
    'flat_balcony',
    'flat_balcony_g',
    'flat_door',
    'inet_type',
    'flat_parking',
    'live_furniture',
    'flat_flooring',
]

# drop_first=True omits the first level of each category, so that level is
# represented by all of the category's indicator columns being 0.
df = pd.get_dummies(df, drop_first=True, columns=columns_to_encode)
df

In [None]:
# Keep only the columns whose dtype is not 'object'; every object-typed
# column is dropped from the frame at this point.
df = df.select_dtypes(exclude='object')
df

In [None]:
# Add a log1p-transformed companion column ('log_<name>') for each of these
# features; the originals are kept alongside them.
for source_col in ('quadrature', 'region', 'ceiling'):
    df['log_' + source_col] = np.log1p(df[source_col])
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter every column of the frame (except the target itself) against
# 'price', one small figure per column.
numerical_columns = df.columns

for x_column in numerical_columns:
    if x_column == 'price':
        continue
    plt.figure(figsize=(4, 3))
    plt.scatter(df[x_column], df['price'])
    plt.title(f'Scatter Plot of {x_column} vs. Price')
    plt.xlabel(x_column)
    plt.ylabel('Y')
    plt.show()

In [None]:
numerical_columns = ['price', 'room_count', 'quadrature', 'floor', 'region', 'year', 'ceiling']

# Drop rows lying outside the 1.5 * IQR fences, one column at a time.
# Filtering is sequential: each column's quartiles are computed on the rows
# that survived the filters of the columns before it.
for column in numerical_columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Keep only the rows inside [lower_bound, upper_bound] (inclusive).
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Materialise the filtered rows as an independent frame. Columns of `df` are
# overwritten in a later cell (MinMax scaling); without the explicit copy
# that assignment targets a boolean-filtered selection and can trigger
# pandas' SettingWithCopyWarning.
df = df.copy()

# Once outliers are removed, re-plot each numeric feature against the price.
for x_column in numerical_columns:
    if x_column != 'price':
        plt.figure(figsize=(4, 3))
        plt.scatter(df[x_column], df['price'])
        plt.xlabel(x_column)
        plt.ylabel('Y')
        plt.title(f'Scatter Plot of {x_column} vs. Price')
        plt.show()


In [None]:
# Dimensions of the frame at this point in the notebook.
df.shape

In [None]:
# Pairwise correlations between all columns of `df`.
correlation_matrix = df.corr()

# Draw them as an annotated heatmap on a large canvas so the two-decimal
# cell labels stay legible.
plt.figure(figsize=(20, 15))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the selected features with a MinMaxScaler. Both names are kept at
# notebook level because get_user_input() applies the same fitted scaler to
# the same columns.
# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split done in a later cell, so test rows influence the fitted
# range — confirm this is acceptable.
features_to_normalize = ['room_count', 'quadrature', 'year', 'ceiling']
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(df[features_to_normalize])
df[features_to_normalize] = normalized_values

In [None]:
from sklearn.preprocessing import MinMaxScaler
import random

def get_user_input():
    """Prompt for an apartment's numeric features and build a model-ready row.

    The six numeric features are read from standard input. Categorical
    features are NOT asked for: one option per category is picked with
    ``random.choice`` and one-hot encoded, so two calls with identical
    answers can produce different rows.

    Relies on notebook-level names defined in other cells and must be called
    only after they exist: ``X_train`` (column layout), ``features_to_normalize``
    and ``scaler`` (fitted MinMaxScaler), ``stdScaler`` (fitted StandardScaler).

    Returns:
        The output of ``stdScaler.transform`` for a single row laid out
        exactly like ``X_train``.

    Raises:
        ValueError: if an answer cannot be parsed by ``float``.
    """
    prompts = {
        'room_count': "Enter room count: ",
        'quadrature': "Enter quadrature: ",
        'floor': "Enter floor coefficient: ",
        'region': "Enter region coefficient: ",
        'year': "Enter year: ",
        'ceiling': "Enter ceiling height (e.g., 2.7 for 2.7m): ",
    }
    # Dicts preserve insertion order, so the questions are asked in the order above.
    user_data = {feature: float(input(prompt)) for feature, prompt in prompts.items()}

    # The training frame carries log1p companions of these raw features,
    # computed before any scaling; derive them the same way here instead of
    # leaving them at 0.
    for base in ('quadrature', 'region', 'ceiling'):
        user_data[f'log_{base}'] = np.log1p(user_data[base])

    options_mapping = {
        'flat_toilet': ['no_info', 'нет', 'раздельный', 'совмещенный'],
        'flat_balcony': ['балкон', 'балкон и лоджия', 'лоджия', 'несколько балконов или лоджий'],
        'flat_balcony_g': ['да', 'нет'],
        'flat_door': ['бронированная', 'деревянная', 'металлическая'],
        'inet_type': ['no_info', 'оптика', 'проводной', 'через TV кабель'],
        'flat_parking': ['гараж', 'паркинг', 'рядом охраняемая стоянка'],
        'live_furniture': ['без мебели', 'полностью', 'частично'],
        'flat_flooring': ['дерево', 'ковролан', 'ламинат', 'линолеум', 'паркет', 'плитка', 'пробковый']
    }

    # One randomly chosen option per categorical feature, one-hot encoded
    # under the '<prefix>_<option>' labels that pd.get_dummies produces.
    for prefix, options in options_mapping.items():
        chosen_option = random.choice(options)
        for opt in options:
            user_data[f'{prefix}_{opt}'] = 1 if opt == chosen_option else 0

    # Start from an all-zero float row with exactly the training columns.
    user_input_df = pd.DataFrame(np.zeros((1, len(X_train.columns))), columns=X_train.columns)

    # Copy over only features the model was trained on. Labels missing from
    # X_train (e.g. the first level of each category, removed by
    # drop_first=True) must be skipped: assigning them through .at would add
    # new columns and make the row incompatible with the fitted scalers.
    for feature, value in user_data.items():
        if feature in user_input_df.columns:
            user_input_df.at[0, feature] = value

    # Apply the same two scaling steps, in the same order, as the training data.
    user_input_df[features_to_normalize] = scaler.transform(user_input_df[features_to_normalize])
    user_input_df = stdScaler.transform(user_input_df)

    return user_input_df
    
def predict_price(user_input_df):
    """Return the prediction for the first row of ``user_input_df``.

    The estimator is not a parameter: the notebook-level name ``model`` is
    looked up at call time, so the result comes from whichever estimator
    that name is bound to when this function runs.
    """
    return model.predict(user_input_df)[0]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Standardise the features, fit each candidate regressor, report its test
# metrics and print its price estimate for one user-described apartment.
stdScaler = StandardScaler()

# 'price' is the target; every other column is a feature.
y = df['price']
X = df.drop(columns='price')
# print(X.columns)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The standardiser is fitted on the training rows only and reused for the test rows.
X_train_scaled = stdScaler.fit_transform(X_train)
X_test_scaled = stdScaler.transform(X_test)

# Candidate estimators; the commented-out entries are alternatives that can
# be re-enabled.
models = {
    "Linear Regression": LinearRegression(),
    # "Ridge": Ridge(),
    # "Lasso": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    # "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    # "Extra Trees": ExtraTreesRegressor(random_state=42),
    # "KNN": KNeighborsRegressor(n_neighbors=13),
    # "Support Vector Regression": SVR(),
    # "Decision Tree": DecisionTreeRegressor(random_state=42),
    # "Neural Network": MLPRegressor(batch_size=32, learning_rate_init=0.01, max_iter=10000, random_state=42)
}

# Collect the apartment description once, before any training, so every
# model prices the same row.
user_input_df = get_user_input()

for name, model in models.items():
    # `model` has to remain the loop variable: predict_price() reads it as a
    # notebook-level name.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Hold-out metrics.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print()
    # print(f"{name}: Mean Squared Error: {mse}")
    print(f"{name}: Root Mean Squared Error: {rmse}")
    print(f"{name}: R² Score: {r2}")
    print()

    # Estimate for the user-supplied apartment with the model just fitted.
    predicted_price = predict_price(user_input_df)
    print(f"Predicted Apartment Price: {predicted_price}")
    
        