<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/House_price_predict_in_India.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, log_loss, classification_report, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor


from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Load the training data from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# First five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Bar chart with the number of rows for each POSTED_BY value.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='POSTED_BY', data=df, ax=ax)

In [None]:
# Report the cardinality of every column; for columns with fewer than ten
# distinct values, also list the (sorted) values themselves.
for col_name in df.columns:
    distinct = np.unique(df[col_name])
    n_distinct = len(distinct)
    if n_distinct >= 10:
        print("The number of values for feature {} is: {}".format(col_name, n_distinct))
        continue
    print("The number of values for feature {} is: {} -- {}".format(col_name, n_distinct, distinct))

In [None]:
# The city is the text after the last comma of ADDRESS (the whole string when
# there is no comma). Surrounding whitespace is stripped so "Locality, City"
# and "Locality,City" yield the same label; without this, a padded name such
# as " Pune" fails the exact-match lookup in map_city and falls into 'tier3'.
df['CITY'] = df['ADDRESS'].str.rsplit(',', n=1).str[-1].str.strip()

In [None]:
# ADDRESS is dropped after CITY has been derived from it.
df = df.drop(columns=['ADDRESS'])

In [None]:
# First five rows after ADDRESS was replaced by CITY.
df.head(n=5)

In [None]:
# Distribution of the raw target: density-normalised histogram with a KDE
# overlay. `sns.distplot` is deprecated and slated for removal, so the
# equivalent `histplot` call is used; `cut=3` extends the KDE past the data
# range the way distplot's curve did.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(
    data=df,
    x='TARGET(PRICE_IN_LACS)',
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    ax=ax,
)

In [None]:
# Ten-bin histogram of the target column (still on its original scale here).
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='TARGET(PRICE_IN_LACS)', bins=10, ax=ax)

In [None]:
# Twenty-bin histogram of SQUARE_FT before any transformation.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='SQUARE_FT', bins=20, ax=ax)

In [None]:
# Replace area and price with their natural logarithms, in place.
# The cell is not idempotent: running it again applies the log a second time.
for skewed_col in ('SQUARE_FT', 'TARGET(PRICE_IN_LACS)'):
    df[skewed_col] = np.log(df[skewed_col])


In [None]:
# Twenty-bin histogram of LATITUDE before it is transformed.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='LATITUDE', bins=20, ax=ax)

In [None]:
# Natural log of the coordinates, in place (not idempotent on re-run).
# The log is only defined for positive inputs, so non-positive coordinates are
# masked to NaN first. A plain np.log would turn an exact 0 into -inf, which
# survives the later dropna on these columns and then breaks model fitting,
# and would emit RuntimeWarnings for negative values. Masked rows end up as
# NaN and are removed by the dropna cells further down.
# NOTE(review): taking the log of geographic coordinates has no obvious
# physical meaning; kept so the model inputs stay the same - confirm intent.
for coord in ('LONGITUDE', 'LATITUDE'):
    df[coord] = np.log(df[coord].where(df[coord] > 0))

In [None]:
# First five rows after the log transforms.
df.head(n=5)

In [None]:
# Bar chart with the number of rows for each BHK_OR_RK value.
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(x='BHK_OR_RK', data=df, ax=ax)

In [None]:
# Lookup tables for map_city. Frozensets give constant-time membership tests.
# NOTE(review): 'Maharashtra' is a state rather than a city; presumably it
# covers addresses that end with the state name - confirm.
_TIER1_CITIES = frozenset([
    'Ahmedabad', 'Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata',
    'Mumbai', 'Pune', 'Maharashtra',
])
_TIER2_CITIES = frozenset([
    'Agra', 'Ajmer', 'Aligarh', 'Amravati', 'Amritsar', 'Asansol', 'Aurangabad', 'Bareilly',
    'Belgaum', 'Bhavnagar', 'Bhiwandi', 'Bhopal', 'Bhubaneswar', 'Bikaner', 'Bilaspur', 'Bokaro Steel City',
    'Chandigarh', 'Coimbatore', 'Cuttack', 'Dehradun', 'Dhanbad', 'Bhilai', 'Durgapur', 'Dindigul', 'Erode',
    'Faridabad', 'Firozabad', 'Ghaziabad', 'Gorakhpur', 'Gulbarga', 'Guntur', 'Gwalior', 'Gurgaon', 'Guwahati',
    'Hamirpur', 'Hubli–Dharwad', 'Indore', 'Jabalpur', 'Jaipur', 'Jalandhar', 'Jammu', 'Jamnagar', 'Jamshedpur',
    'Jhansi', 'Jodhpur', 'Kakinada', 'Kannur', 'Kanpur', 'Karnal', 'Kochi', 'Kolhapur', 'Kollam', 'Kozhikode',
    'Kurnool', 'Ludhiana', 'Lucknow', 'Madurai', 'Malappuram', 'Mathura', 'Mangalore', 'Meerut', 'Moradabad',
    'Mysore', 'Nagpur', 'Nanded', 'Nashik', 'Nellore', 'Noida', 'Patna', 'Pondicherry', 'Purulia', 'Prayagraj',
    'Raipur', 'Rajkot', 'Rajahmundry', 'Ranchi', 'Rourkela', 'Ratlam', 'Salem', 'Sangli', 'Shimla', 'Siliguri',
    'Solapur', 'Srinagar', 'Surat', 'Thanjavur', 'Thiruvananthapuram', 'Thrissur', 'Tiruchirappalli', 'Tirunelveli',
    'Tiruvannamalai', 'Ujjain', 'Bijapur', 'Vadodara', 'Varanasi', 'Vasai-Virar City', 'Vijayawada', 'Visakhapatnam',
    'Vellore', 'Warangal',
])


def map_city(city):
    """Classify a city name into 'tier1', 'tier2' or 'tier3'.

    Args:
        city: City name. Leading and trailing whitespace is ignored, so
            " Pune" is treated like "Pune". Matching is otherwise exact
            (case-sensitive).

    Returns:
        'tier1' or 'tier2' when the name is in the corresponding lookup
        table, otherwise 'tier3'. Non-string values (e.g. NaN or None)
        also yield 'tier3'.
    """
    # Missing values arrive as float NaN / None when applied over a Series.
    if not isinstance(city, str):
        return 'tier3'

    name = city.strip()
    if name in _TIER1_CITIES:
        return 'tier1'
    if name in _TIER2_CITIES:
        return 'tier2'
    return 'tier3'
    
# Label every row with the tier that map_city assigns to its CITY value.
df['city_tier'] = df['CITY'].map(map_city)

In [None]:
# First five rows, now including the city_tier column.
df.head(n=5)

In [None]:
# CITY is dropped after city_tier has been derived from it.
df = df.drop(columns=['CITY'])

In [None]:
# Columns that the next cell passes to pd.get_dummies for one-hot encoding.
features = ['POSTED_BY', 'BHK_OR_RK', 'city_tier']

In [None]:
# One-hot encode the columns listed in `features`; the remaining columns are
# carried over unchanged.
new_df = pd.get_dummies(data=df, columns=features)

In [None]:
# First five rows of the one-hot encoded frame.
new_df.head(n=5)

In [None]:
# Preview only: shows the frame without any row containing a NaN. The result
# is not assigned, so new_df itself keeps all of its rows.
new_df.dropna(axis=0, how='any')

In [None]:
# Number of missing values per column.
new_df.isnull().sum(axis=0)

In [None]:
# Remove, in place, the rows whose LATITUDE is missing. With a single column
# in `subset`, how='any' and how='all' select exactly the same rows.
new_df.dropna(axis='index', how='any', subset=['LATITUDE'], inplace=True)

In [None]:
# Missing values per column after the LATITUDE rows were removed.
new_df.isnull().sum(axis=0)

In [None]:
# Remove, in place, the rows whose LONGITUDE is missing. With a single column
# in `subset`, how='any' and how='all' select exactly the same rows.
new_df.dropna(axis='index', how='any', subset=['LONGITUDE'], inplace=True)

In [None]:
# Column names, non-null counts and dtypes of the encoded frame after the
# rows with missing LATITUDE / LONGITUDE were dropped.
new_df.info()

In [None]:
# Price against area; both columns were log-transformed in an earlier cell.
# Reads `df`, i.e. the frame before one-hot encoding and before the rows with
# missing coordinates were removed from `new_df`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='TARGET(PRICE_IN_LACS)', y='SQUARE_FT', data=df, ax=ax)

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows for
# validation with a fixed seed so the split is reproducible.
target_col = 'TARGET(PRICE_IN_LACS)'
y = new_df[target_col]
X = new_df.drop(columns=[target_col])
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# First five rows of the predictor matrix.
X.head(n=5)

In [None]:
# (rows, columns) of the training predictors.
X_train.shape

In [None]:
# Length of the training target; should match the row count of X_train.
y_train.shape

In [None]:
# Linear-regression baseline fitted on the training split (fit returns the
# estimator itself, so construction and fitting are chained).
lr = LinearRegression().fit(X_train, y_train)

# In-sample predictions; no later cell in this notebook reads y_pred.
y_pred = lr.predict(X_train)

In [None]:
# R^2 of the linear model on the data it was trained on.
train_r2 = lr.score(X_train, y_train)
print(train_r2)

In [None]:
# R^2 of the linear model on the held-out validation split.
valid_r2 = lr.score(X_valid, y_valid)
print(valid_r2)

In [None]:
# Random-forest baseline, scored with R^2 on the held-out validation split.
# RandomForestRegressor and r2_score are already imported in the notebook's
# first cell, so they are not re-imported here.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Model-specific name: a shared `preds` variable would be overwritten by
# whichever model cell ran last, making out-of-order runs score the wrong model.
rf_preds = rf_model.predict(X_valid)

print('Random Forest: ', r2_score(y_valid, rf_preds))

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 1000 estimators, learning rate 0.1, fixed seed.
# fit() returns the estimator, so construction and training are chained.
xgboost_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Score on the held-out validation split.
preds = xgboost_model.predict(X_valid)
print('XG Boost: ', r2_score(y_valid, preds))

In [None]:
# Decision-tree baseline. DecisionTreeRegressor and r2_score are already
# imported in the notebook's first cell, so they are not re-imported here.
# A fixed seed makes the fitted tree reproducible across runs.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# The training-set R^2 of an unpruned tree is close to 1 by construction and
# says little about generalisation, so it is reported only for reference.
print('Decision Tree (train): ', r2_score(y_train, dt.predict(X_train)))

# Validation R^2: the figure comparable with the Random Forest and XG Boost
# cells, which are also scored on X_valid.
dt_preds = dt.predict(X_valid)
print('Decision Tree: ', r2_score(y_valid, dt_preds))