# Seattle House Prices

This dataset contains house sale prices in Seattle. It includes homes sold between May 2014 and May 2015.

In [1]:
import datetime as dt
import hashlib
import json
import random
import secrets
import string

import numpy as np
import pandas as pd


# Read the raw, comma-separated sales export
df = pd.read_csv('house_sales.csv', sep=',')

# Keep only the first 16 characters of each timestamp ("YYYY-MM-DD HH:MM") and parse them
df['date'] = pd.to_datetime(df['date'].str[:16], format="%Y-%m-%d %H:%M")

# ISO week number of each sale; it is combined with the id when pseudonymous ids are built
df['week'] = df['date'].dt.isocalendar().week

# Discard every attribute that the rest of the notebook does not use
df = df.drop(columns=[
    'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront',
    'zipcode', 'grade', 'view', 'sqft_above', 'sqft_basement',
    'condition', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15',
])

In [2]:
# Show the trimmed frame: id, date, price, sqft_living, lat, long and the derived week
df

Unnamed: 0,id,date,price,sqft_living,lat,long,week
0,7129300520,2014-10-13,221900,1180,47.5112,-122.257,42
1,6414100192,2014-12-09,538000,2570,47.7210,-122.319,50
2,5631500400,2015-02-25,180000,770,47.7379,-122.233,9
3,2487200875,2014-12-09,604000,1960,47.5208,-122.393,50
4,1954400510,2015-02-18,510000,1680,47.6168,-122.045,8
...,...,...,...,...,...,...,...
21608,263000018,2014-05-21,360000,1530,47.6993,-122.346,21
21609,6600060120,2015-02-23,400000,2310,47.5107,-122.362,9
21610,1523300141,2014-06-23,402101,1020,47.5944,-122.299,26
21611,291310100,2015-01-16,400000,1600,47.5345,-122.069,3


In [3]:
# Export the unmodified rows, minus the helper 'week' column,
# as a tab-separated file with neither header row nor index column
df_drop_week = df.drop(columns=['week'])
df_drop_week.to_csv('original_1.csv', sep='\t', header=False, index=False)

In [4]:
# Show the frame that was written to 'original_1.csv' (no 'week' column)
df_drop_week

Unnamed: 0,id,date,price,sqft_living,lat,long
0,7129300520,2014-10-13,221900,1180,47.5112,-122.257
1,6414100192,2014-12-09,538000,2570,47.7210,-122.319
2,5631500400,2015-02-25,180000,770,47.7379,-122.233
3,2487200875,2014-12-09,604000,1960,47.5208,-122.393
4,1954400510,2015-02-18,510000,1680,47.6168,-122.045
...,...,...,...,...,...,...
21608,263000018,2014-05-21,360000,1530,47.6993,-122.346
21609,6600060120,2015-02-23,400000,2310,47.5107,-122.362
21610,1523300141,2014-06-23,402101,1020,47.5944,-122.299
21611,291310100,2015-01-16,400000,1600,47.5345,-122.069


In [5]:
# Summary statistics of the sale price: count, mean, std, min, quartiles and max
df['price'].describe()

count    2.161300e+04
mean     5.400881e+05
std      3.671272e+05
min      7.500000e+04
25%      3.219500e+05
50%      4.500000e+05
75%      6.450000e+05
max      7.700000e+06
Name: price, dtype: float64

In [6]:
# Summary statistics of the living area: count, mean, std, min, quartiles and max
df['sqft_living'].describe()

count    21613.000000
mean      2079.899736
std        918.440897
min        290.000000
25%       1427.000000
50%       1910.000000
75%       2550.000000
max      13540.000000
Name: sqft_living, dtype: float64

In [7]:
# Number of missing values in each column
df.isnull().sum()

id             0
date           0
price          0
sqft_living    0
lat            0
long           0
week           0
dtype: int64

In [8]:
# Number of rows that repeat an earlier row in every column
df.duplicated().sum()

0

In [9]:
# Shift each coordinate by a value-dependent offset of at most 0.007:
# cosine of the latitude for 'lat', sine of the longitude for 'long'
df = df.assign(
    lat=lambda frame: frame['lat'] + 0.007 * np.cos(frame['lat']),
    long=lambda frame: frame['long'] + 0.007 * np.sin(frame['long']),
)

In [10]:
# Keep three decimals on both coordinates; every other column is left as it is
df = df.round({'lat': 3, 'long': 3})

In [11]:
# Show the frame after lat/long were offset and rounded to three decimals
df

Unnamed: 0,id,date,price,sqft_living,lat,long,week
0,7129300520,2014-10-13,221900,1180,47.505,-122.259,42
1,6414100192,2014-12-09,538000,2570,47.715,-122.320,50
2,5631500400,2015-02-25,180000,770,47.732,-122.235,9
3,2487200875,2014-12-09,604000,1960,47.514,-122.394,50
4,1954400510,2015-02-18,510000,1680,47.611,-122.048,8
...,...,...,...,...,...,...,...
21608,263000018,2014-05-21,360000,1530,47.693,-122.347,21
21609,6600060120,2015-02-23,400000,2310,47.504,-122.363,9
21610,1523300141,2014-06-23,402101,1020,47.588,-122.301,26
21611,291310100,2015-01-16,400000,1600,47.528,-122.072,3


In [12]:
#Change date to random (same week)
def change_date(date):
    """Move ``date`` by a small random number of days inside its ISO week.

    An offset is drawn from {-2, -1, 0, 1, 2}: 0 with probability 0.88 and
    each of the other values with probability 0.03. The offset is first
    subtracted from ``date``; if that lands in a different ISO week number,
    the offset is added instead.

    Args:
        date: a ``datetime``-like value supporting ``isocalendar()`` and
            arithmetic with ``datetime.timedelta``.

    Returns:
        The shifted date, of the same type as ``date``.
    """
    offsets = [-2, -1, 0, 1, 2]
    weights = [0.03, 0.03, 0.88, 0.03, 0.03]
    shift = dt.timedelta(days=int(np.random.choice(offsets, p=weights)))

    moved = date - shift
    if moved.isocalendar()[1] == date.isocalendar()[1]:
        return moved
    # Subtracting crossed a week boundary, so go the other way
    return date + shift

In [13]:
#Apply change_date
# Replace every sale date with its randomly shifted counterpart
df = df.assign(date=df['date'].apply(change_date))

In [14]:
# Show the frame after the dates were randomly shifted
df

Unnamed: 0,id,date,price,sqft_living,lat,long,week
0,7129300520,2014-10-13,221900,1180,47.505,-122.259,42
1,6414100192,2014-12-09,538000,2570,47.715,-122.320,50
2,5631500400,2015-02-25,180000,770,47.732,-122.235,9
3,2487200875,2014-12-09,604000,1960,47.514,-122.394,50
4,1954400510,2015-02-18,510000,1680,47.611,-122.048,8
...,...,...,...,...,...,...,...
21608,263000018,2014-05-22,360000,1530,47.693,-122.347,21
21609,6600060120,2015-02-23,400000,2310,47.504,-122.363,9
21610,1523300141,2014-06-23,402101,1020,47.588,-122.301,26
21611,291310100,2015-01-16,400000,1600,47.528,-122.072,3


In [15]:
#Change price to random range, for example : abcdef into random in range (A0000, (A+1)0000)
def change_price(price, bucket_size=10000):
    """Replace ``price`` with a random integer from the same price bucket.

    The bucket is the half-open interval
    ``[k * bucket_size, (k + 1) * bucket_size)`` that contains ``price``,
    e.g. 221900 maps to a value in 220000..229999 with the default size.

    The bucket is found with integer arithmetic rather than by slicing the
    decimal string, so prices below ``bucket_size`` (bucket starting at 0)
    and float prices such as ``221900.0`` are handled correctly.

    Args:
        price: numeric sale price; it is truncated to an integer first.
        bucket_size: width of a bucket (default 10000).

    Returns:
        int: a uniformly drawn value inside the bucket of ``price``.

    Raises:
        ValueError: if ``price`` cannot be converted to an integer (e.g. NaN).
    """
    lower_bound = (int(price) // bucket_size) * bucket_size
    # randrange excludes the upper bound, so the result never spills into the next bucket
    return random.randrange(lower_bound, lower_bound + bucket_size)

In [16]:
#Apply change_price
# Replace every price with a random value from its bucket
df = df.assign(price=df['price'].apply(change_price))

In [17]:
# Show the frame after the prices were randomised
df

Unnamed: 0,id,date,price,sqft_living,lat,long,week
0,7129300520,2014-10-13,224702,1180,47.505,-122.259,42
1,6414100192,2014-12-09,530205,2570,47.715,-122.320,50
2,5631500400,2015-02-25,189251,770,47.732,-122.235,9
3,2487200875,2014-12-09,609189,1960,47.514,-122.394,50
4,1954400510,2015-02-18,515658,1680,47.611,-122.048,8
...,...,...,...,...,...,...,...
21608,263000018,2014-05-22,367598,1530,47.693,-122.347,21
21609,6600060120,2015-02-23,408071,2310,47.504,-122.363,9
21610,1523300141,2014-06-23,408973,1020,47.588,-122.301,26
21611,291310100,2015-01-16,406335,1600,47.528,-122.072,3


In [18]:
#Change square foot of living
from itertools import permutations

def change_sqft(sqft_living):
    """Randomly permute the last three digits of a living-area value.

    The thousands part is kept and the last three digits (zero-padded, so
    ``20`` is treated as ``020``) are shuffled. Because a shuffled group may
    start with zero, 1600 can become 1006, 1060 or stay 1600.

    Values below 1000 are shuffled as well (thousands part 0). Splitting the
    number with ``divmod`` avoids converting an empty string, which would
    otherwise make such values fall through unchanged.

    Args:
        sqft_living: living area; numeric values are truncated to an
            integer. Intended for non-negative values.

    Returns:
        int: the perturbed area, or ``sqft_living`` itself when it cannot be
        converted to an integer (None, NaN, non-numeric text).
    """
    try:
        thousands, last_three = divmod(int(sqft_living), 1000)
    except (TypeError, ValueError, OverflowError):
        # Missing or non-numeric entries are passed through untouched
        return sqft_living

    # Zero-pad so that exactly three digits take part in the permutation
    digits = list('{:03}'.format(last_three))
    random.shuffle(digits)

    return thousands * 1000 + int(''.join(digits))

In [19]:
#Apply change_sqft
# Replace every living area with its digit-shuffled counterpart
df = df.assign(sqft_living=df['sqft_living'].apply(change_sqft))

In [20]:
# Show the frame after the living-area digits were shuffled
df

Unnamed: 0,id,date,price,sqft_living,lat,long,week
0,7129300520,2014-10-13,224702,1108,47.505,-122.259,42
1,6414100192,2014-12-09,530205,2750,47.715,-122.320,50
2,5631500400,2015-02-25,189251,770,47.732,-122.235,9
3,2487200875,2014-12-09,609189,1690,47.514,-122.394,50
4,1954400510,2015-02-18,515658,1068,47.611,-122.048,8
...,...,...,...,...,...,...,...
21608,263000018,2014-05-22,367598,1053,47.693,-122.347,21
21609,6600060120,2015-02-23,408071,2310,47.504,-122.363,9
21610,1523300141,2014-06-23,408973,1200,47.588,-122.301,26
21611,291310100,2015-01-16,406335,1006,47.528,-122.072,3


In [21]:
#Pseudo id (secret salt + id + week ---> SHA-256 hex digest)
# The salt is 16 random bytes from the OS CSPRNG, drawn once per run and never stored,
# so the digests cannot be rebuilt by enumerating known ids and a handful of salts.
# The '|' separators keep salt, id and week unambiguous inside the hashed string.
df['id'] = (
    secrets.token_hex(16) + '|' + df['id'].astype(str) + '|' + df['week'].astype(str)
).apply(lambda raw: hashlib.sha256(raw.encode()).hexdigest())

In [22]:
# Show the frame after the ids were replaced by SHA-256 hex digests
df

Unnamed: 0,id,date,price,sqft_living,lat,long,week
0,3021a63e0fc64e1ec0ea06bf31e9f09ec2387e3d01ff07...,2014-10-13,224702,1108,47.505,-122.259,42
1,8cc8d4be94249bc338d0040f2f61a73b086e23008c1962...,2014-12-09,530205,2750,47.715,-122.320,50
2,24c06cfb17652ae6f8560712e275b9fcd06d6908a0d3a4...,2015-02-25,189251,770,47.732,-122.235,9
3,da3b391120372d51f52d7e15f321a7af3a62f0b3e1b868...,2014-12-09,609189,1690,47.514,-122.394,50
4,f6fee08ad9df52e577afe0cd7e93fcb408214034dc0cf6...,2015-02-18,515658,1068,47.611,-122.048,8
...,...,...,...,...,...,...,...
21608,b2915cad3640f247ffe5631999fa92cc52caae442c6e52...,2014-05-22,367598,1053,47.693,-122.347,21
21609,9c845ec8bc72c26b2918d9beb593d224c59b1c1d3d0159...,2015-02-23,408071,2310,47.504,-122.363,9
21610,39012357e6f5fd1f52a60d4a2257a8827317bb05c8c97e...,2014-06-23,408973,1200,47.588,-122.301,26
21611,3c2b516fa7a4e79c4681e364a9299c2ee23762eb1840f7...,2015-01-16,406335,1006,47.528,-122.072,3


In [23]:
# The helper 'week' column is no longer needed once the ids are hashed
df = df.drop(columns=['week'])

In [24]:
# Put the rows in a random order, then renumber the index from 0
# so that it does not carry the original row positions
df = df.sample(frac=1)
df = df.reset_index(drop=True)

In [25]:
# Show the final frame: 'week' dropped, rows shuffled, index renumbered
df

Unnamed: 0,id,date,price,sqft_living,lat,long
0,ae0a9fbf5fbe0c062d2c0faf5dd7c3d0f9172704c22fc9...,2015-01-12,233596,920,47.706,-122.286
1,1267035ea229b5dbbc2d9c1ea9601d333b874a67380197...,2014-06-24,358420,740,47.518,-122.372
2,92ea5ab10bcd60594f33500d35569a59fd0f00a0aa1d1d...,2014-06-11,247523,1360,47.458,-121.762
3,d78c5e08e0e2e2c66788a6fa7cad221088842b47710fd5...,2015-01-21,417746,1094,47.706,-122.283
4,2d687c62ce4c3ceb6f33ab5e46b8ec4ce6d12bf93d4553...,2015-04-24,456050,1802,47.606,-122.303
...,...,...,...,...,...,...
21608,adf5b560da913b3360fe0e6d115ec7eec812b67b11e52f...,2014-06-28,1602915,2802,47.624,-122.292
21609,6595aaf63bcbb2a935803159675efe7a77fb10b2e95c3e...,2014-12-08,467327,1810,47.730,-122.079
21610,70dc3670fcc8d8ddc4a908a22142ba260636a939c519d4...,2014-10-21,114307,670,47.353,-122.059
21611,564a773993169a6dd6cb1f037151b580b3e497fb3cc8b5...,2014-08-27,950082,2025,47.623,-122.310


In [26]:
# Export the anonymised rows as a tab-separated file with neither header row nor index column
df.to_csv('self_anonymisation_1.csv', sep='\t', header=False, index=False)