# Data Analysis project
Dataset: Australian real estate data

#### Step 1: Import Libraries and Load Data

In [1]:
#Import Dependencies
import pandas as pd
import numpy as np
import seaborn as sns

from scipy import stats
from matplotlib import pyplot as plt

In [2]:
# Read the Australian real estate CSV into a pandas DataFrame, then preview the first 10 rows
proprty_data = pd.read_csv(filepath_or_buffer="../Resources/aus_real_estate.csv")
proprty_data.head(n=10)

Unnamed: 0,Price,Bedrooms,Bathrooms,SqFt,City,State,Year_Built,Type,Garage,Lot_Area
0,982112,4,1,1561,Adelaide,QLD,2021,Townhouse,0,1357
1,571388,3,2,3735,Melbourne,VIC,1999,House,1,8397
2,866821,5,1,2032,Sydney,VIC,1976,Townhouse,0,3478
3,1230977,4,3,3861,Brisbane,SA,1978,House,0,7619
4,241787,4,2,3150,Perth,SA,1992,Apartment,0,8324
5,631871,4,2,1923,Brisbane,SA,2002,House,1,6638
6,550777,4,1,829,Adelaide,QLD,2022,Townhouse,0,2752
7,1509971,1,1,2447,Brisbane,QLD,1967,Apartment,1,2792
8,935088,4,2,3652,Melbourne,QLD,1985,House,0,1645
9,999053,1,1,3114,Sydney,NSW,1962,House,0,4195


#### Step 2: Data Exploration

In [3]:
# Basic information about the data: column names, non-null counts, dtypes and memory usage
proprty_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Price       5000 non-null   int64 
 1   Bedrooms    5000 non-null   int64 
 2   Bathrooms   5000 non-null   int64 
 3   SqFt        5000 non-null   int64 
 4   City        5000 non-null   object
 5   State       5000 non-null   object
 6   Year_Built  5000 non-null   int64 
 7   Type        5000 non-null   object
 8   Garage      5000 non-null   int64 
 9   Lot_Area    5000 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 390.8+ KB


In [4]:
# Check for missing values: count the nulls in each column
proprty_data.isnull().sum()

Price         0
Bedrooms      0
Bathrooms     0
SqFt          0
City          0
State         0
Year_Built    0
Type          0
Garage        0
Lot_Area      0
dtype: int64

In [5]:
# Checking the distinct state codes listed under the State column
proprty_data["State"].unique()

array(['QLD', 'VIC', 'SA', 'NSW', 'WA'], dtype=object)

In [6]:
# Checking the distinct city names listed under the City column
proprty_data["City"].unique()

array(['Adelaide', 'Melbourne', 'Sydney', 'Brisbane', 'Perth'],
      dtype=object)

In [7]:
# Checking the distinct property types listed under the Type column
proprty_data["Type"].unique()

array(['Townhouse', 'House', 'Apartment'], dtype=object)

In [8]:
# Checking the distinct bedroom counts listed under the Bedrooms column
proprty_data["Bedrooms"].unique()

array([4, 3, 5, 1, 2], dtype=int64)

In [9]:
# Getting the unique build years from the DataFrame;
# the array is kept in year_list so the next cell can sort it
year_list = proprty_data["Year_Built"].unique()
year_list

array([2021, 1999, 1976, 1978, 1992, 2002, 2022, 1967, 1985, 1962, 1972,
       1995, 1955, 1989, 1993, 1988, 2013, 1991, 1964, 1966, 2014, 2000,
       1984, 2016, 1970, 2012, 2019, 1982, 2015, 1980, 1975, 1958, 1950,
       1965, 2018, 1971, 2020, 1996, 2010, 1963, 1987, 2009, 2007, 1960,
       1994, 1997, 2003, 1977, 2011, 1953, 1981, 1951, 2004, 1954, 1952,
       1959, 1990, 1979, 2006, 1968, 2017, 1986, 1957, 1969, 2008, 2001,
       2005, 1973, 1961, 1974, 1983, 1956, 1998], dtype=int64)

In [10]:
# Sorting the years to see the starting and ending year of the data
np.sort(year_list)

array([1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022], dtype=int64)

In [11]:
# Second missing-value check using isna() (pandas documents isnull() as an alias of isna());
# its per-column counts match the isnull() check earlier in the notebook
proprty_data.isna().sum()

Price         0
Bedrooms      0
Bathrooms     0
SqFt          0
City          0
State         0
Year_Built    0
Type          0
Garage        0
Lot_Area      0
dtype: int64

In [12]:
# Summary statistics from describe(), rounded to two decimal places for readability
stats_overview = proprty_data.describe().round(2)
stats_overview

Unnamed: 0,Price,Bedrooms,Bathrooms,SqFt,Year_Built,Garage,Lot_Area
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,1049955.48,2.99,1.97,2404.41,1985.83,0.49,5523.4
std,548340.31,1.42,0.82,923.95,21.17,0.5,2605.28
min,100028.0,1.0,1.0,800.0,1950.0,0.0,1001.0
25%,583459.5,2.0,1.0,1598.75,1967.0,0.0,3250.75
50%,1040834.0,3.0,2.0,2418.0,1985.0,0.0,5514.5
75%,1528157.75,4.0,3.0,3191.25,2004.0,1.0,7804.0
max,1999701.0,5.0,3.0,3998.0,2022.0,1.0,9998.0


In [13]:
# Number of listings per city, most frequent first.
# Left as the cell's last expression so Jupyter displays it directly;
# wrapping it in print() is unnecessary and inconsistent with the other cells.
proprty_data["City"].value_counts()

City
Adelaide     1040
Melbourne    1021
Brisbane     1007
Perth         988
Sydney        944
Name: count, dtype: int64


This dataset comprises 5,000 records across 10 distinct columns.
It has no missing values, which is confirmed by the output of `.info()`, `.isnull().sum()`, and `.isna().sum()`.
The DataFrame is clean and ready for EDA.