-
Notifications
You must be signed in to change notification settings - Fork 0
/
NashVille(TN)_Housing
165 lines (107 loc) · 5.37 KB
/
NashVille(TN)_Housing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
-- Let's first have a look into the data.
SELECT TOP (100) *
FROM Housing_Analysis_Project..Nashville_housing;
-- Let's start the cleaning of the data by standardizing the date format from the current datetime format.
ALTER TABLE Nashville_housing
ADD SaleDateConverted Date;
UPDATE Nashville_housing
SET SaleDateConverted = CONVERT(date, SaleDate);
SELECT SaleDateConverted, CONVERT(date, SaleDate)
FROM Housing_Analysis_Project..Nashville_housing;
ALTER TABLE Nashville_housing
DROP COLUMN SaleDate;
/* I realized that the there was missing data in the PropertyAddress field. I then learned that, for the same parcel ID, property addresses were identical.
Therefore, I populated the PropertyAddress column by using the values where the null field had the same parcel ID as the PropertyAddress populated field*/
-- First, we join the Nashville-housing table to itself (self-join)
SELECT a.ParcelID,a.PropertyAddress,b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress, b.PropertyAddress) -- ISNULL does what I want here as it returns the value of b.PropertyAddress, if and only if a.PropertyAddress is null.
FROM Housing_Analysis_Project..Nashville_housing AS a
JOIN Housing_Analysis_Project..Nashville_housing AS b
ON a.ParcelID=b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL;
--There are 35 null values that I will fill out with the new PropertyAddress values.
UPDATE a
SET PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM Housing_Analysis_Project..Nashville_housing AS a
JOIN Housing_Analysis_Project..Nashville_housing AS b
ON a.ParcelID=b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL;
--Let's check if it worked. No values returned, which indicates that it did!
SELECT a.ParcelID,a.PropertyAddress,b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM Housing_Analysis_Project..Nashville_housing AS a
JOIN Housing_Analysis_Project..Nashville_housing AS b
ON a.ParcelID=b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL;
SELECT TOP(100) *
from Nashville_housing
-- Let's now split the Address into individual columns for Address and City.
SELECT SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)-1) AS Address,
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress)+1, LEN(PropertyAddress)) AS City
FROM Housing_Analysis_Project..Nashville_housing;
-- Let's add the split values into the table and drop the duplicate column.
ALTER TABLE Nashville_housing
ADD PropertySplitAddress nvarchar(255);
UPDATE Nashville_housing
SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)-1);
ALTER TABLE Nashville_housing
ADD PropertySplitCity nvarchar(255);
UPDATE Nashville_housing
SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress)+1, LEN(PropertyAddress));
ALTER TABLE Nashville_housing
DROP COLUMN PropertyAddress;
SELECT TOP(5) *
FROM Nashville_housing;
-- Likewise, the same occurs with the OwnerAddress field. It needs split into Address, City, and State. Alternatively, I will use the PARSENAME function this time.
SELECT
PARSENAME(REPLACE(OwnerAddress, ',','.'),3) AS Address, --PARSENAME needs a dot delimited string, for which I will use the REPLACE function to replace commas with dots.
PARSENAME(REPLACE(OwnerAddress, ',','.'),2) AS City,
PARSENAME(REPLACE(OwnerAddress, ',','.'),1) AS State
FROM Housing_Analysis_Project..Nashville_housing;
-- Let's add the split values into the Table
ALTER TABLE Nashville_housing
ADD OwnerSplitAddress nvarchar(255);
UPDATE Nashville_housing
SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress, ',','.'),3);
ALTER TABLE Nashville_housing
ADD OwnerSplitCity nvarchar(255);
UPDATE Nashville_housing
SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress, ',','.'),2);
ALTER TABLE Nashville_housing
ADD OwnerSplitState nvarchar(255);
UPDATE Nashville_housing
SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress, ',','.'),1);
ALTER TABLE Nashville_housing
DROP COLUMN OwnerAddress;
SELECT TOP(5) *
FROM Nashville_housing;
/*In the "Sold as Vacant" field, there are 4 distinct values (Y, N,Yes and No), being the last two the most frequent (verified with a count). Let's change Y and N to the more frequent Yes and No */
SELECT DISTINCT SoldAsVacant, COUNT(SoldAsVacant)
From Housing_Analysis_Project..Nashville_housing
GROUP BY SoldAsVacant
ORDER BY 2 DESC;
UPDATE Nashville_housing
SET SoldAsVacant = CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
-- Verifying that SoldAsVacant only has two possible values, Yes or No.
SELECT DISTINCT SoldAsVacant
FROM Nashville_housing;
-- Let's now remove duplicates, using a CTE and the ROW_NUMBER function to identify the duplicated values.
WITH RowNumCTE AS(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY ParcelID, PropertySplitAddress, SalePrice, SaleDateConverted, LegalReference ORDER BY UniqueID) AS row_num
--The PARTITION BY statement includes all the fields I identified as unique, thus meaning that rows with the same values must be duplicates.
FROM Housing_Analysis_Project..Nashville_housing
)
SELECT *
FROM RowNumCTE
WHERE row_num > 1
ORDER BY PropertySplitAddress;
--Found 104 duplicates whererow_num > 1)- let's get rid of them.
DELETE
FROM RowNumCTE
WHERE row_num > 1;
--The Table has now been cleaned and is ready for any future further analysis.