In [57]:
from utils import get_data
import numpy
import pandas as pd

In [58]:
data = get_data('../data')

# Define Criteria

Find home that have been sold, and remove condos since they the units may have the same address, and trying to match on address. Note this means we will **not** be working with any condo data in this project, since it is hard to identify flipped condos.

We then match to find houses with the same street address.

In [59]:
soldHomes = data[data['STATUS'] == 'SLD']
soldHomes = soldHomes[soldHomes['PROPTYPE'] != 'CC']

In [60]:
soldMLS = soldHomes['MLSNUM'].unique()

In [61]:
# find home with same address
def getHome(data, mls):
    home = soldHomes[soldHomes['MLSNUM'] == mls]
    homes = soldHomes[soldHomes['ZIP'] == home['ZIP'].iloc[0]]
    homes = homes[homes['HOUSENUM1'] == home['HOUSENUM1'].iloc[0]]
    homes = homes[homes['STREETNAME'] == home['STREETNAME'].iloc[0]]
    return homes

# Find Potential Flips

Go through all mls listing to find matching homes.

Remaining homes (ones that are not flips) will end up on the control/non-flippable list.

In [62]:
test_homes = []
control_homes = []

for mls in soldMLS:
    homes = getHome(soldHomes, mls)
    if(homes.shape[0] == 2):
        test_homes.append(homes)
    else:
        control_homes.append(homes)
        

In [63]:
flippedHomes = pd.concat(test_homes)
print(flippedHomes.shape)

controlHomes = pd.concat(control_homes)
print(controlHomes.shape)

(9770, 39)
(126082, 39)


In [64]:
# remove duplicates and sort by mls
flippedHomes = flippedHomes.drop_duplicates(subset='MLSNUM', keep="first")
flippedHomes = flippedHomes.sort_values(by='MLSNUM')

controlHomes = controlHomes.drop_duplicates(subset='MLSNUM', keep="first")

# Build Test and Control Data Sets

Create the control data set from our array of control homes. Then, put all the pre-flipped homes in as "flippable", and the post-flipped homes back int othe control pile (as they are no longer flippable).

In [65]:
# build flip set
flip_data = [];
post_flips = [];
for mls in flippedHomes['MLSNUM']:
    homes = getHome(flippedHomes, mls)
    # check if home has 2 mls rows
    if(homes.shape[0] >= 2):
        pre = homes.iloc[0]
        post = homes.iloc[1]
        # check if it's not the same mls number
        if pre['MLSNUM'] == post['MLSNUM']:
            continue
        # find pre and post flip values
        if pd.to_datetime(pre['SOLDDATE']) > pd.to_datetime(post['SOLDDATE']):
            # post < pre: swap them
            swap = pre
            pre = post
            post = swap
        flip_data.append({
            'FLIPPABLE': "1",
            'MLSNUM': pre['MLSNUM'],
            'SOLDPRICE': pre['SOLDPRICE'],
            'DOM': pre['DOM'], #days on market
            'ZIP': pre['ZIP'],
            'BEDS': pre['BEDS'],
            'BATHS': pre['BATHS'],
            'SQFT': pre['SQFT'],
            'AGE': pre['AGE'],
            'LOTSIZE': pre['LOTSIZE'],
            'GARAGE': pre['GARAGE'],
        })
        post_flips.append({ # we want to add our post-flip data to the control, since these are not flippable anymore
            'FLIPPABLE': "0",
            'MLSNUM': post['MLSNUM'],
            'SOLDPRICE': post['SOLDPRICE'],
            'DOM': post['DOM'],
            'ZIP': post['ZIP'],
            'BEDS': post['BEDS'],
            'BATHS': post['BATHS'],
            'SQFT': post['SQFT'],
            'AGE': post['AGE'],
            'LOTSIZE': post['LOTSIZE'],
            'GARAGE': post['GARAGE'],
        })

In [66]:
# build control set
post_flips = pd.DataFrame.from_dict(post_flips)
print(post_flips.shape)

unflipped_homes = controlHomes[['MLSNUM','SOLDPRICE','DOM','ZIP','BEDS','BATHS','SQFT','AGE','LOTSIZE','GARAGE']]
unflipped_homes.insert(0, 'FLIPPABLE', 0)
control_data = pd.concat([unflipped_homes, post_flips], ignore_index=True)

print(control_data.shape)

(3788, 11)
(129598, 11)


# Combining Test and Control Data

Clean and combine the two data frames. Know we will have an unbalanced data set, and will need to account for that in our analysis.

In [67]:
flip_df = pd.DataFrame.from_dict(flip_data)
control_df = pd.DataFrame.from_dict(control_data)

In [68]:
flip_df.head(20)
print(flip_df.shape)
control_df.head(20)
print(control_df.shape)

(3788, 11)
(129598, 11)


In [69]:
# combine test and control
merged_test_and_control_data = pd.concat([flip_df, control_df], ignore_index=True)
print(merged_test_and_control_data.shape)

# remove bad listings for now
bad_data = ['72250832','71902243','72214658','72099376','72032454','72027853','72018311','71955378','72045937','72133139','72144618']
merged_test_and_control_data = merged_test_and_control_data[~merged_test_and_control_data['MLSNUM'].isin(bad_data)]

# clean up data that should be ints
merged_test_and_control_data[["MLSNUM", "DOM", "GARAGE"]] = merged_test_and_control_data[["MLSNUM", "DOM", "GARAGE"]].astype(int)


# verify
print(merged_test_and_control_data.shape)
merged_test_and_control_data.head(20)

(133386, 11)
(133375, 11)


Unnamed: 0,FLIPPABLE,MLSNUM,SOLDPRICE,DOM,ZIP,BEDS,BATHS,SQFT,AGE,LOTSIZE,GARAGE
0,1,71425748,50000.0,1077,2740,4,2.0,1984,95,1888,0
1,1,71477127,125000.0,724,2368,4,2.0,2333,67,10000,0
2,1,71478495,720500.0,828,1532,4,3.5,4196,20,117176,3
3,1,71485790,160000.0,1140,1746,3,1.5,1350,58,17860,0
4,1,71614526,335000.0,809,2649,3,2.0,1270,38,5401,1
5,1,71633454,364000.0,106,2127,4,1.5,1462,118,1250,0
6,1,71661835,135000.0,760,2019,6,2.0,2315,98,12000,0
7,1,71701065,240000.0,343,2364,4,2.0,1346,38,77972,0
8,1,71716837,440000.0,42,2090,3,3.0,2046,81,42178,1
9,1,71745072,215000.0,1369,1453,3,3.5,2112,31,18002,2


In [70]:
#read in image count file
image_counts = pd.read_csv('./outputs/image_count.csv')
image_counts.drop('Unnamed: 0',axis=1,inplace=True)
image_counts.head()

Unnamed: 0,MLS_num,count
0,72007367,12
1,72029311,10
2,72013464,10
3,72027618,10
4,72003490,10


In [71]:
#merge test and control data with image count file on MLS num
merged_test_and_control_data = (
    merged_test_and_control_data.merge(image_counts,left_on='MLSNUM',right_on='MLS_num',how='left').
    rename(columns={'count':'IMAGES'}).
    drop('MLS_num',axis=1).
    drop_duplicates()
)

#Fill in NaNs and change back to integer
merged_test_and_control_data['IMAGES'].fillna(0,inplace=True)
merged_test_and_control_data['IMAGES']= merged_test_and_control_data['IMAGES'].astype('int64')
merged_test_and_control_data

Unnamed: 0,FLIPPABLE,MLSNUM,SOLDPRICE,DOM,ZIP,BEDS,BATHS,SQFT,AGE,LOTSIZE,GARAGE,IMAGES
0,1,71425748,50000,1077,2740,4,2.0,1984,95,1888,0,0
1,1,71477127,125000,724,2368,4,2.0,2333,67,10000,0,0
2,1,71478495,720500,828,1532,4,3.5,4196,20,117176,3,0
3,1,71485790,160000,1140,1746,3,1.5,1350,58,17860,0,4
4,1,71614526,335000,809,2649,3,2.0,1270,38,5401,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
133087,0,72252482,370000,117,2537,0,0.0,2337,30,40075,2,0
133126,0,72276079,695000,15,2472,4,2.0,1722,73,6000,0,0
133167,0,72270032,380000,33,1938,6,2.5,1942,246,43560,0,0
133340,0,72279039,267000,20,2865,3,2.0,1822,13,4356,1,0


In [72]:
#save merged file
merged_test_and_control_data.to_csv('./outputs/merged_test_and_control_data.csv')

# Profit margins

This will spit out a csv of pre and post values of the FLIPs, with a profit margin:

In [73]:
# build flip set
flips = [];
for mls in flippedHomes['MLSNUM']:
    homes = getHome(flippedHomes, mls)
    # check if home has 2 mls rows
    if(homes.shape[0] >= 2):
        pre = homes.iloc[0]
        post = homes.iloc[1]
        # check if it's not the same mls number
        if pre['MLSNUM'] == post['MLSNUM']:
            continue
        # find pre and post flip values
        if pd.to_datetime(pre['SOLDDATE']) > pd.to_datetime(post['SOLDDATE']):
            # post < pre: swap them
            swap = pre
            pre = post
            post = swap
        flips.append({
            'PRE-FLIP-MLS': pre['MLSNUM'],
            'POST-FLIP-MLS': post['MLSNUM'],
            'PRE-FLIP-SOLD-DATE': pre['SOLDDATE'],
            'POST-FLIP-SOLD-DATE': post['SOLDDATE'],
            'PRE-SOLD-PRICE': pre['SOLDPRICE'],
            'POST-SOLD-PRICE': post['SOLDPRICE'],
            'PRE-SOLD-PRICE': pre['SOLDPRICE'],
            'POST-SOLD-PRICE': post['SOLDPRICE'],
            'PROFIT': int(post['SOLDPRICE']) - int(pre['SOLDPRICE'])
        })
        
flips = pd.DataFrame.from_dict(flips)
flips.to_csv('./outputs/flips.csv')
flips.head()

Unnamed: 0,PRE-FLIP-MLS,POST-FLIP-MLS,PRE-FLIP-SOLD-DATE,POST-FLIP-SOLD-DATE,PRE-SOLD-PRICE,POST-SOLD-PRICE,PROFIT
0,71425748.0,72069748.0,2/4/2016,1/20/2017,50000,185000,135000
1,71477127.0,72073263.0,2/24/2016,11/18/2016,125000,395000,270000
2,71478495.0,72246808.0,1/15/2016,12/11/2017,720500,821000,100500
3,71485790.0,72098121.0,9/8/2016,1/12/2017,160000,320000,160000
4,71614526.0,72131597.0,3/1/2016,5/1/2017,335000,425000,90000
