In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
import seaborn as sns

## Data Load: Load home prices into a dataframe

In [None]:
df = pd.read_csv("Entities.csv")
df.head()

### Let's get to know the dataframe

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df.city.value_counts()

In [None]:
df.province_name.value_counts()

## Drop features that are not required to build our model

In [None]:
df2 = df.drop(["Unnamed: 0","property_id","location_id","page_url","province_name","latitude","longitude","date_added","agency","agent"], axis = 1)
df2.head()

## Data Cleaning : Handle NA values

In [None]:
df2.isnull().sum()

### Can a house have a price of 0?

In [None]:
df2 = df2[~(df2["price"] == 0)]

### Can a house have an area of 0?

In [None]:
df2 = df2[~(df2["Total_Area"] == 0)]

### We are researching the prices of houses for sale

In [None]:
df2 = df2[~(df2["purpose"] == "For Rent")]

## Feature Engineering

In [None]:
df3 = df2.copy()

In [None]:
df3.drop("purpose",axis=1,inplace=True)

In [None]:
df3.head()

### Let's add a column that shows the total number of rooms (bathrooms + bedrooms) in each house.

In [None]:
df3["total_rooms"] = df3["baths"] + df3["bedrooms"]

### Can a house have 0 rooms?

In [None]:
# Keep only listings with at least one room. The explicit .copy() makes df3 an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning on a boolean-mask slice.
df3 = df3[df3["total_rooms"] != 0].copy()

### Let's add a column showing the price per square foot for each house.

In [None]:
df3["price_per_sqft"] = df3["price"] / df3["Total_Area"]

In [None]:
df3.head()

## Dimensionality Reduction

Any location with 100 or fewer data points is tagged as the "other" location.

In [None]:
location_stats = df3["location"].value_counts(ascending = False)
location_stats

In [None]:
location_stats.values.sum()

In [None]:
len(location_stats[location_stats>100])

In [None]:
len(location_stats)

In [None]:
len(location_stats[location_stats<=100])

In [None]:
location_stats_less_than_100 = location_stats[location_stats<=100]
location_stats_less_than_100

In [None]:
# Collapse every sparsely represented location into a single "other" bucket.
# The rare location names are the index labels of location_stats_less_than_100;
# isin() tests membership in one vectorised pass instead of a per-row lambda.
is_rare_location = df3["location"].isin(location_stats_less_than_100.index)
df3["location"] = df3["location"].mask(is_rare_location, "other")
len(df3["location"].unique())

In [None]:
df3.head()

In [None]:
df3.shape

## Outlier Removal Using Standard Deviation and Mean

In [None]:
df3.price_per_sqft.describe()

### We should remove outliers per location using mean and one standard deviation

In [None]:
# Per-location average of every numeric feature. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer silently skips text columns and raises
# TypeError instead.
df3.groupby("location").mean(numeric_only=True)

In [None]:
def remove_pps_outliers(df):
    """Drop listings whose price per square foot is atypical for their location.

    For every ``location`` group the mean ``m`` and the population standard
    deviation ``st`` (``ddof=0``) of ``price_per_sqft`` are computed, and only
    rows with ``m - st < price_per_sqft <= m + st`` are kept. Note that the
    lower bound is exclusive and the upper bound is inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group (rows keep their relative
        order inside each group), with a fresh ``0..n-1`` index.
    """
    kept_groups = []
    for _, subdf in df.groupby("location"):
        pps = subdf["price_per_sqft"]
        m = pps.mean()
        st = pps.std(ddof=0)  # population std, matching np.std's default
        within_one_std = (pps > (m - st)) & (pps <= (m + st))
        kept_groups.append(subdf[within_one_std])

    if not kept_groups:
        # No location groups at all (e.g. empty input): nothing to concatenate.
        return df.iloc[:0].reset_index(drop=True)

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
df4 = remove_pps_outliers(df3)
df4.shape

In [None]:
df4.head()

In [None]:
def plot_scatter_chart(df, location):
    """Scatter price against total area for one location, split by room count.

    Listings in ``location`` with fewer than 6 total rooms are drawn as blue
    dots and those with 6 or more as green ``+`` markers.

    Side effect: sets matplotlib's global default figure size to 15x10.
    """
    matplotlib.rcParams["figure.figsize"] = (15, 10)

    # Restrict to the requested location once, then split on the room threshold.
    in_location = df[df.location == location]
    fewer_rooms = in_location[in_location.total_rooms < 6]
    more_rooms = in_location[in_location.total_rooms >= 6]

    plt.scatter(
        fewer_rooms.Total_Area,
        fewer_rooms.price,
        s=50,
        color="blue",
        label="less_than_6",
    )
    plt.scatter(
        more_rooms.Total_Area,
        more_rooms.price,
        s=50,
        color="green",
        marker="+",
        label="more_than_6",
    )

    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(df4, "Multan Road")

In [None]:
def remove_room_outliers(df):
    """Drop listings priced below the average of homes with one room fewer.

    Within each ``location``, listings are grouped by ``total_rooms``. A listing
    with ``r`` rooms is removed when the group with ``r - 1`` rooms in the same
    location has more than 5 listings and the listing's ``price_per_sqft`` is
    lower than that smaller group's mean ``price_per_sqft``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``total_rooms`` and ``price_per_sqft``.
        Rows are removed by index label, so labels are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged rows; the original index is preserved.
    """
    drop_labels = []
    for _, location_df in df.groupby("location"):
        # Materialise the per-room groups once and reuse them for both passes.
        room_groups = list(location_df.groupby("total_rooms"))
        room_stats = {
            room: {"mean": room_df["price_per_sqft"].mean(), "count": len(room_df)}
            for room, room_df in room_groups
        }
        for room, room_df in room_groups:
            smaller_home = room_stats.get(room - 1)
            # Only trust the reference group when it has enough listings.
            if smaller_home is not None and smaller_home["count"] > 5:
                too_cheap = (room_df["price_per_sqft"] < smaller_home["mean"]).to_numpy()
                drop_labels.append(room_df.index[too_cheap])

    # Combine the labels as a pandas Index so they keep the index's own dtype
    # (np.append onto np.array([]) would coerce them to float64).
    exclude_indices = df.index[:0].append(drop_labels)
    return df.drop(exclude_indices, axis="index")
df5 = remove_room_outliers(df4)
df5.shape

In [None]:
plot_scatter_chart(df5, "Multan Road")

#### It can be seen that the blue and green dots are more separated from each other after the outliers are removed

## Outlier Removal Using Bathrooms Feature

In [None]:
df5.head()

In [None]:
plt.hist(df5.baths, rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

### It is unusual for a home to have at least 5 more bathrooms than bedrooms

In [None]:
# Preview the listings treated as outliers: at least 5 more bathrooms than
# bedrooms. This is the complement of the `baths < bedrooms + 5` filter used to
# build df6, so rows with exactly bedrooms + 5 bathrooms are shown as well.
df5[df5.baths >= df5.bedrooms + 5].head()

In [None]:
df6 = df5[df5.baths<df5.bedrooms+5]

In [None]:
df6.head()

In [None]:
my_list = ["price","baths","bedrooms","Total_Area"]

In [None]:
df7 = df6.copy()

In [None]:
"""for column in my_list :
    for location in df7["location"].unique() :
        selected_location = df7[df7["location"] == location]
        selected_column = selected_location[column]
        
        q1 = selected_column.quantile(0.25)
        q3 = selected_column.quantile(0.75)
        
        iqr = q3-q1
        
        minimum = q1 - (1.5*iqr)
        maximum = q3 + (1.5*iqr)
        
        print(column,location,minimum, maximum)
        
        max_index = df7[(df7["location"] == location) & (df7[column] > maximum)].index
        min_index = df7[(df7["location"] == location) & (df7[column] < minimum)].index
        
        df7.drop(index = max_index, inplace = True)
        df7.drop(index = min_index, inplace = True)"""

In [None]:
df7.shape

## Data Visualization

In [None]:
sns.boxplot(x = "city", y = "price_per_sqft",data = df7);

### categorical variable summaries

In [None]:
df7["city"].value_counts().plot.barh();

In [None]:
sns.barplot(x = "city", y = df7.city.index, data = df7);

In [None]:
sns.barplot(x = "city", y = "price", hue = "property_type", data = df7);

### continuous variable summaries

In [None]:
df_num = df7.select_dtypes(include = ["float64","int64"])

In [None]:
df_num.head()

In [None]:
df_num.describe().T

In [None]:
# Density of price per square foot over all listings. `fill` is the current
# seaborn keyword for the shaded area; `shade` is deprecated in its favour.
sns.kdeplot(df7.price_per_sqft, fill=True);

In [None]:
# One density curve of price per square foot per city, x-axis limited to
# 0-10000. `fill` is the current seaborn keyword; `shade` is deprecated.
(sns
 .FacetGrid(df7,
              hue= "city",
              height = 5,
              xlim = (0,10000))
 .map(sns.kdeplot,"price_per_sqft",fill = True)
 .add_legend()
);

## Scatter

In [None]:
df7.head()

## Use One Hot Encoding for the Categorical Features

In [None]:
df7.head()

In [None]:
df8 = df7.drop(["price_per_sqft","total_rooms"],axis = 1)
df8.head()

In [None]:
df9 = pd.get_dummies(df8)
df9.head()

In [None]:
X = df9.drop(["price"], axis = "columns")

In [None]:
y = df9.price

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X, y):
    """Grid-search three regressors and report each one's best configuration.

    Linear regression, lasso and a decision tree are each tuned with
    ``GridSearchCV`` on the same 5-split ``ShuffleSplit`` (20% test size,
    ``random_state=0``).

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``model``, ``best_score`` and
        ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                # `normalize` was removed from LinearRegression in
                # scikit-learn 1.2; tune the intercept instead.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            # Seeded so that selection='random' gives repeatable scores.
            'model': Lasso(random_state=0),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            # Seeded so that splitter='random' gives repeatable scores.
            'model': DecisionTreeRegressor(random_state=0),
            'params': {
                # 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and
                # the old spelling was removed in 1.2.
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config["model"], config["params"], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            "model": algo_name,
            "best_score": gs.best_score_,
            "best_params": gs.best_params_
        })

    return pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
find_best_model_using_gridsearchcv(X,y)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
import xgboost
from xgboost import XGBRegressor

In [None]:
def compML(df, y, alg):
    """Fit ``alg`` with its default settings and report its hold-out score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and every feature column.
    y : str
        Name of the target column in ``df``; all other columns are features.
    alg : type
        Estimator class (not an instance); it is instantiated with no
        arguments and must provide ``fit`` and ``score``.

    Returns
    -------
    The value of ``model.score`` on the 25% test split. It is also printed
    next to the estimator's class name.
    """
    # Train/test split. Drop the column named by `y`, not a hard-coded "price",
    # so the target never leaks into the features.
    target = df[y]
    features = df.drop(columns=[y])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # Modelling
    model = alg().fit(X_train, y_train)
    score = model.score(X_test, y_test)
    model_name = alg.__name__
    print(model_name, ":", score)
    return score
    

In [None]:
models = [LGBMRegressor,
         XGBRegressor,
         GradientBoostingRegressor,
         RandomForestRegressor,
         DecisionTreeRegressor,
         MLPRegressor,
         KNeighborsRegressor,
         SVR]

In [None]:
for i in models :
    compML(df9, "price", i)