# Title : Data Wrangling on Real Estate Market

In [None]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
# from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv("datasets/Bengaluru_House_Data.csv") 

In [None]:
df.head()

In [None]:
df.shape 

In [None]:
df.columns

In [None]:
df['area_type']

In [None]:
df['area_type'].unique()

In [None]:
df['area_type'].value_counts()

In [None]:
# Drop the columns that the rest of this notebook does not use.
df1 = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])

In [None]:
df1.shape

In [None]:
df1.isnull().sum()

In [None]:
df1.shape 

In [None]:
# Keep only the rows that have no missing value in any remaining column, then
# show the per-column null counts of the result.
df2 = df1.dropna(axis='index', how='any')
df2.isna().sum()

In [None]:
df2.shape 

In [None]:
df2['size'].unique()

In [None]:
# df2 came out of dropna(), and pandas may still track it as derived from df1;
# adding a column to such a frame can emit SettingWithCopyWarning. Take an
# explicit copy first so the new column is written to an independent frame.
df2 = df2.copy()
# The bedroom count is the integer that precedes the first space in `size`.
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))

In [None]:
df2.head()

In [None]:
df2.bhk.unique()

In [None]:
df2[df2.bhk>20]

In [None]:
df2.total_sqft.unique()

In [None]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    A ``ValueError`` or ``TypeError`` raised by the conversion means the value
    is not float-convertible; any other exception propagates to the caller.
    """
    try:
        float(x)
    except (ValueError, TypeError):
        return False
    else:
        return True

In [None]:
# Preview up to ten rows whose total_sqft value cannot be parsed by float().
df2.loc[~df2['total_sqft'].map(is_float)].head(10)

In [None]:
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float, or None if it cannot be parsed.

    Accepted forms:
      * a plain number, given as a string or as an actual numeric value;
      * a range written ``"low - high"``, which yields the midpoint of the
        two bounds.

    Anything else (for example a value carrying a unit suffix) yields None.
    """
    if not isinstance(x, str):
        # Non-string cells (already numeric, or missing) have no .split();
        # convert them directly instead of raising AttributeError.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    tokens = x.split('-')
    if len(tokens) == 2:
        # Exactly one dash: treat the value as a range and take its midpoint.
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            return None
    try:
        return float(x)
    except ValueError:
        return None


# Quick sanity check of the range handling.
result = convert_sqft_to_num('2100 - 2850')
print(result)

In [None]:
# NOTE(review): the return value of this call is discarded; since it is not the
# last expression of the cell, a notebook would not display it either.
convert_sqft_to_num('34.46Sq. Meter') 
# Convert on a copy so that df2 keeps the original total_sqft values.
df3 = df2.copy()
# Entries that convert_sqft_to_num cannot parse come back as None.
df3.total_sqft = df3.total_sqft.apply(convert_sqft_to_num) 
df3

In [None]:
# Discard the rows whose total_sqft is missing after the conversion.
df3 = df3[df3['total_sqft'].notna()]
df3

In [None]:
df3.loc[30]

In [None]:
# Derive the price per square foot on a fresh copy, leaving df3 untouched.
df4 = df3.copy()
# NOTE(review): the 100000 factor presumably converts `price` from lakhs to
# rupees — confirm the unit of the price column.
df4['price_per_sqft'] = df4['price']*100000/df4['total_sqft'] 
df4.head()

In [None]:
df4_stats = df4['price_per_sqft'].describe() 
df4_stats

In [None]:
df4.to_csv("./datasets/bhp.csv",index=False)

In [None]:
# Trim surrounding whitespace from every location name so that otherwise equal
# names are counted together, then count the rows per location.
df4['location'] = df4['location'].apply(lambda name: name.strip())
location_stats = df4['location'].value_counts(ascending=False)
location_stats

In [None]:
len(location_stats[location_stats>10])

In [None]:
len(location_stats) 

In [None]:
len(location_stats[location_stats<=10]) 

In [None]:
location_stats_less_than_10 = location_stats[location_stats<=10] 
location_stats_less_than_10

In [None]:
len(df4.location.unique())

In [None]:
# Collapse every location with 10 or fewer rows into a single 'other' label.
# `x in <Series>` tests membership in the Series *index*, which here holds the
# location names produced by value_counts() — not the counts themselves.
df4.location = df4.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x) 
# Number of distinct locations left after the collapse.
len(df4.location.unique())

In [None]:
df4.head(10)

In [None]:
df4[df4.total_sqft/df4.bhk<300].head()

In [None]:
df4.shape 

In [None]:
# Keep every row that is NOT below 300 total_sqft per bedroom (bhk).
# The negated comparison also keeps rows whose ratio is NaN, because a
# comparison against NaN evaluates to False before the negation.
df5 = df4[~(df4.total_sqft/df4.bhk<300)] 
df5.shape

In [None]:
df5.columns

In [None]:
def plot_boxplot(df, column):
    """Show a box plot of ``df[column]``, titled with the column name."""
    values = df[column]
    plt.boxplot(values)
    plt.title(f"{column}")
    plt.show()

In [None]:
def remove_outliers(df, column, *, plot=True):
    """Return a copy of ``df`` without the IQR outliers of ``column``.

    A row is an outlier when its value lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th percentiles of
    the column. ``df`` itself is never modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column whose outliers are removed.
        plot: When True (the default), print "Before"/"After" and show a box
            plot of the column for the input and for the filtered frame.

    Returns:
        A new DataFrame holding only the rows inside the IQR fences.
    """
    q1, q3 = np.percentile(df[column], [25.0, 75.0])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # A boolean mask filters row by row, so it stays correct even when the
    # index contains duplicate labels (dropping by label would remove them all).
    is_outlier = (df[column] < lower) | (df[column] > upper)
    df_out = df[~is_outlier].copy()

    if plot:
        print("Before")
        plot_boxplot(df, column)
        print("After")
        plot_boxplot(df_out, column)

    # Return the filtered frame; returning `df` would discard the filtering.
    return df_out

In [None]:
columns_to_check = ['total_sqft', 'bath', 'price', 'bhk', 'price_per_sqft']
# remove_outliers returns its result instead of modifying its argument, so the
# return value has to be captured. Feed each result into the next call so that
# the filters for all columns accumulate in df6, leaving df5 untouched.
df6 = df5
for column in columns_to_check:
    df6 = remove_outliers(df6, column)
df6.shape
