In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [3]:
# Load the raw dataset; 'data.csv' is resolved relative to the kernel's working directory.
df = pd.read_csv('data.csv')

In [4]:
# Inspect the freshly loaded frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8136 entries, 0 to 8135
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          8136 non-null   object
 1   district      8136 non-null   object
 2   neighborhood  8136 non-null   object
 3   room          8136 non-null   int64 
 4   livingRoom    8136 non-null   int64 
 5   area          8136 non-null   int64 
 6   age           8136 non-null   int64 
 7   floor         8136 non-null   int64 
 8   price         8136 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 572.2+ KB


In [5]:
# Cast every column to its working dtype in one pass.
#
# * The three location columns are low-cardinality strings, so they are stored as
#   'category'.
# * The numeric columns load as int64. The previous `astype('int')` resolved to a
#   platform- and NumPy-version-dependent width (int32 on Windows with NumPy 1.x,
#   int64 elsewhere), so the resulting dtypes were not reproducible. The width is now
#   pinned to 'int32' so the dtypes are the same on every machine.
#
# NOTE(review): int32 tops out at 2_147_483_647 and NumPy wraps silently on overflow -
# confirm no raw value (price in particular) can exceed that.
df = df.astype({
    'city': 'category',
    'district': 'category',
    'neighborhood': 'category',
    'room': 'int32',
    'livingRoom': 'int32',
    'area': 'int32',
    'age': 'int32',
    'floor': 'int32',
    'price': 'int32',
})


In [6]:
# Re-inspect the frame to verify the dtype conversion and see its effect on memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8136 entries, 0 to 8135
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          8136 non-null   category
 1   district      8136 non-null   category
 2   neighborhood  8136 non-null   category
 3   room          8136 non-null   int32   
 4   livingRoom    8136 non-null   int32   
 5   area          8136 non-null   int32   
 6   age           8136 non-null   int32   
 7   floor         8136 non-null   int32   
 8   price         8136 non-null   int32   
dtypes: category(3), int32(6)
memory usage: 247.6 KB


In [7]:
# Derive IQR-based outlier fences for every numeric column:
#   lower = Q1 - 1.5 * IQR,  upper = Q3 + 1.5 * IQR
# `columns`, `minValues` and `maxValues` are parallel sequences: entry i of each list
# belongs to columns[i].
#
# NOTE(review): when a column's quartiles coincide (IQR == 0) both fences equal that
# single value, so a filter built on them keeps only rows holding exactly that value.
columns = df.select_dtypes(include=[np.number]).columns
minValues, maxValues = [], []
for column in columns:
    # One call yields both quartiles (25th and 75th percentile, in that order).
    lower_quartile, upper_quartile = df[column].quantile([0.25, 0.75])
    whisker = 1.5 * (upper_quartile - lower_quartile)
    minValue = lower_quartile - whisker
    maxValue = upper_quartile + whisker
    minValues.append(minValue)
    maxValues.append(maxValue)
    print(f"Column:{column}, min: {minValue}, max: {maxValue}")

Column:room, min: 0.5, max: 4.5
Column:livingRoom, min: 1.0, max: 1.0
Column:area, min: -17.5, max: 242.5
Column:age, min: -20.0, max: 44.0
Column:floor, min: -2.0, max: 6.0
Column:price, min: -18000.0, max: 62000.0


In [8]:
# Cleaning outliers: keep only the rows whose numeric values all lie within the IQR bounds computed above.

In [10]:
# Keep only the rows that lie inside the [min, max] fence of every numeric column.
# The fences were computed once on the unfiltered data, so intersecting all per-column
# conditions in a single mask selects the same rows as filtering column by column.
within_fences = pd.Series(True, index=df.index)
for column, lower, upper in zip(columns, minValues, maxValues):
    # `between` is inclusive on both ends, i.e. lower <= value <= upper.
    within_fences &= df[column].between(lower, upper)
df = df[within_fences]

In [11]:
# Check how many rows survive the outlier filter (dtypes should be unchanged).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6212 entries, 0 to 8135
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6212 non-null   category
 1   district      6212 non-null   category
 2   neighborhood  6212 non-null   category
 3   room          6212 non-null   int32   
 4   livingRoom    6212 non-null   int32   
 5   area          6212 non-null   int32   
 6   age           6212 non-null   int32   
 7   floor         6212 non-null   int32   
 8   price         6212 non-null   int32   
dtypes: category(3), int32(6)
memory usage: 243.3 KB


In [13]:
# Summary statistics of the numeric columns after outlier removal; used to spot values
# that are still implausible (e.g. the minimum of each column).
df.describe()

Unnamed: 0,room,livingRoom,area,age,floor,price
count,6212.0,6212.0,6212.0,6212.0,6212.0,6212.0
mean,2.176272,1.0,104.66935,12.653896,2.192692,17900.975853
std,0.826815,0.0,39.442494,10.451565,1.601317,10467.582893
min,1.0,1.0,5.0,0.0,-2.0,1.0
25%,2.0,1.0,75.0,4.0,1.0,11000.0
50%,2.0,1.0,100.0,10.0,2.0,15000.0
75%,3.0,1.0,130.0,20.0,3.0,21000.0
max,4.0,1.0,240.0,44.0,6.0,60000.0


In [14]:
# Apply a manual price floor of 3000. The IQR lower fence for price is negative, so the
# outlier filter cannot remove unrealistically low prices (values as low as 1 remain).
# NOTE(review): 3000 is a hand-picked threshold - confirm it suits the price unit used.
df = df.loc[df['price'].ge(3000)]

In [16]:
# Re-run the summary statistics to confirm the price floor took effect
# (the minimum price should now be at least 3000).
df.describe()

Unnamed: 0,room,livingRoom,area,age,floor,price
count,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0
mean,2.18002,1.0,104.830445,12.698169,2.191465,18170.733976
std,0.826463,0.0,39.467687,10.465384,1.601042,10323.22915
min,1.0,1.0,5.0,0.0,-2.0,3000.0
25%,2.0,1.0,75.0,4.0,1.0,11500.0
50%,2.0,1.0,100.0,10.0,2.0,15000.0
75%,3.0,1.0,130.0,20.0,3.0,21000.0
max,4.0,1.0,240.0,44.0,6.0,60000.0


In [17]:
# Persist the cleaned frame. index=False leaves out the row index, which is no longer
# contiguous after the filtering steps.
# Note: CSV carries no dtype metadata, so the category/int dtypes set earlier must be
# re-applied when this file is loaded again.
df.to_csv('data_cleaned.csv', index=False)