In [283]:
#Import all the necessary libraries

import pandas as pd  
import os                
from datetime import datetime
import time
import matplotlib.pyplot as plt
from math import sin, cos, tan
import matplotlib.dates as mdates
import pgeocode

In [284]:
#Load in all of the data files and combine them into a single DataFrame
# Directory where your files are located
directory = './Data/'
dfs = []

#loop through all the files in a data directory
for filename in os.listdir(directory):
    #ignore a file start with the name weekly, thats the mortageg rates data 
    if not filename.startswith("weekly"):
        #read file path 
        filepath = os.path.join(directory, filename)
        #read a file in a dataframe
        df = pd.read_csv(filepath, encoding='ISO-8859-1')
        #append the file in dictionary 
        dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)    
combined_df.head(10)

Unnamed: 0.1,MLS #,Class,Property Type,Address,City,Zip,Neighborhood,Subdivision,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Unnamed: 0
0,10010197,RESIDENTIAL,Single Family Residence,568 Leven Drive,Gibsonville,27249,,Edinborough,4.0,3.0,2644,0-.25 Acres,2023.0,2/6/2024,5/24/2024,0.0,"$372,490",$367,
1,LP720573,RESIDENTIAL,Single Family Residence,735 NC 24 Highway,Cameron,28326,,,3.0,1.0,0,3-5.9 Acres,1950.0,2/29/2024,4/16/2024,0.0,"$210,000","$21,000",
2,LP725056,RESIDENTIAL,Single Family Residence,872 Danish Drive,Fayetteville,28303,,,3.0,2.0,1230,,1989.0,5/9/2024,6/13/2024,0.0,"$210,000","$21,000",
3,10016080,RESIDENTIAL,Single Family Residence,2317 Slater Avenue,Fayetteville,28301,,Not in a Subdivision,2.0,1.0,602,0-.25 Acres,1947.0,3/8/2024,4/1/2024,3.0,"$65,000","$40,000",
4,10016675,RESIDENTIAL,Single Family Residence,301 S John Street,Goldsboro,27530,,Not in a Subdivision,5.0,4.0,4906,.26-.5 Acres,1843.0,3/11/2024,4/16/2024,23.0,"$78,500","$40,000",
5,10022872,RESIDENTIAL,Single Family Residence,609 S Pine Street,Rocky Mount,27803,,Not in a Subdivision,2.0,1.0,1144,0-.25 Acres,1923.0,4/12/2024,4/26/2024,1.0,"$39,000","$40,000",
6,LP727319,RESIDENTIAL,Single Family Residence,514 14th Street,Lumberton,28358,,,2.0,1.0,1000,,1935.0,6/7/2024,6/13/2024,4.0,"$45,000","$45,000",
7,LP711815,RESIDENTIAL,Single Family Residence,611 W Massachusetts Avenue,Southern Pines,28387,,,2.0,1.0,0,,1941.0,9/6/2023,4/11/2024,134.0,"$55,000","$45,000",
8,10018705,RESIDENTIAL,Single Family Residence,102 Duval Drive,New Bern,28560,,Not in a Subdivision,5.0,3.0,2930,.51-.75 Acres,2009.0,3/22/2024,6/6/2024,24.0,"$455,000","$45,000",
9,LP717843,RESIDENTIAL,Manufactured,1380 Iona Church Road,Rowland,28383,,,3.0,2.0,2133,,1997.0,1/4/2024,6/3/2024,103.0,"$65,000","$45,000",


In [285]:
#Checking how many values are null per column
combined_df.isnull().sum()

MLS #                          0
Class                          0
Property Type                  0
Address                        0
City                           0
Zip                           42
Neighborhood              162151
Subdivision                17877
Bedrooms                      18
Total Baths                    0
Total Living Area SqFt         0
Acres                      10856
Year Built                    17
List Date                      0
Closing Date                   0
Days On Market                 0
List Price                     0
Sold Price                     0
Unnamed: 0                168797
dtype: int64

In [286]:
#Checking how many values are null per column
combined_df.describe

<bound method NDFrame.describe of            MLS #        Class            Property Type  \
0       10010197  RESIDENTIAL  Single Family Residence   
1       LP720573  RESIDENTIAL  Single Family Residence   
2       LP725056  RESIDENTIAL  Single Family Residence   
3       10016080  RESIDENTIAL  Single Family Residence   
4       10016675  RESIDENTIAL  Single Family Residence   
...          ...          ...                      ...   
168792   2190197  RESIDENTIAL  Single Family Residence   
168793   2374394  RESIDENTIAL  Single Family Residence   
168794   2295506  RESIDENTIAL  Single Family Residence   
168795   2371485  RESIDENTIAL  Single Family Residence   
168796   2171320  RESIDENTIAL  Single Family Residence   

                          Address          City    Zip Neighborhood  \
0                 568 Leven Drive   Gibsonville  27249          NaN   
1               735 NC 24 Highway       Cameron  28326          NaN   
2                872 Danish Drive  Fayetteville  28303  

In [287]:
#Checking df Info
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168797 entries, 0 to 168796
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   MLS #                   168797 non-null  object 
 1   Class                   168797 non-null  object 
 2   Property Type           168797 non-null  object 
 3   Address                 168797 non-null  object 
 4   City                    168797 non-null  object 
 5   Zip                     168755 non-null  object 
 6   Neighborhood            6646 non-null    object 
 7   Subdivision             150920 non-null  object 
 8   Bedrooms                168779 non-null  float64
 9   Total Baths             168797 non-null  float64
 10  Total Living Area SqFt  168797 non-null  object 
 11  Acres                   157941 non-null  object 
 12  Year Built              168780 non-null  float64
 13  List Date               168797 non-null  object 
 14  Closing Date        

In [288]:
combined_df.columns

Index(['MLS #', 'Class', 'Property Type', 'Address', 'City', 'Zip',
       'Neighborhood', 'Subdivision', 'Bedrooms', 'Total Baths',
       'Total Living Area SqFt', 'Acres', 'Year Built', 'List Date',
       'Closing Date', 'Days On Market', 'List Price', 'Sold Price',
       'Unnamed: 0'],
      dtype='object')

In [289]:
combined_df.value_counts()

Series([], Name: count, dtype: int64)

In [290]:
combined_df['Acres'].unique()

array(['0-.25 Acres', '3-5.9 Acres', nan, '.26-.5 Acres', '.51-.75 Acres',
       '1-2.9 Acres', '.76-.99 Acres', '11+ Acres', '6-10.9 Acres'],
      dtype=object)

In [291]:
# Drop the 'Neighborhood' column
combined_df = combined_df.drop('Neighborhood', axis=1)

# Drop the 'Subdivision' column
combined_df = combined_df.drop('Subdivision', axis=1)

# Drop the 'Unnamed: 0' column
combined_df = combined_df.drop('Unnamed: 0', axis=1)

combined_df.isnull().sum()

MLS #                         0
Class                         0
Property Type                 0
Address                       0
City                          0
Zip                          42
Bedrooms                     18
Total Baths                   0
Total Living Area SqFt        0
Acres                     10856
Year Built                   17
List Date                     0
Closing Date                  0
Days On Market                0
List Price                    0
Sold Price                    0
dtype: int64

In [292]:
#Drop the rows with missing values
combined_df.dropna(inplace=True)

combined_df

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price
0,10010197,RESIDENTIAL,Single Family Residence,568 Leven Drive,Gibsonville,27249,4.0,3.0,2644,0-.25 Acres,2023.0,2/6/2024,5/24/2024,0.0,"$372,490",$367
1,LP720573,RESIDENTIAL,Single Family Residence,735 NC 24 Highway,Cameron,28326,3.0,1.0,0,3-5.9 Acres,1950.0,2/29/2024,4/16/2024,0.0,"$210,000","$21,000"
3,10016080,RESIDENTIAL,Single Family Residence,2317 Slater Avenue,Fayetteville,28301,2.0,1.0,602,0-.25 Acres,1947.0,3/8/2024,4/1/2024,3.0,"$65,000","$40,000"
4,10016675,RESIDENTIAL,Single Family Residence,301 S John Street,Goldsboro,27530,5.0,4.0,4906,.26-.5 Acres,1843.0,3/11/2024,4/16/2024,23.0,"$78,500","$40,000"
5,10022872,RESIDENTIAL,Single Family Residence,609 S Pine Street,Rocky Mount,27803,2.0,1.0,1144,0-.25 Acres,1923.0,4/12/2024,4/26/2024,1.0,"$39,000","$40,000"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168792,2190197,RESIDENTIAL,Single Family Residence,12304 Birchfalls Drive,Raleigh,27614,5.0,8.0,11884,1-2.9 Acres,2002.0,5/8/2018,3/5/2021,785.0,"$3,000,000","$2,603,000"
168793,2374394,RESIDENTIAL,Single Family Residence,5005 Avalaire Pines Drive,Raleigh,27614,4.0,6.0,7395,1-2.9 Acres,2021.0,3/26/2021,3/30/2021,0.0,"$2,689,785","$2,689,785"
168794,2295506,RESIDENTIAL,Single Family Residence,12404 Birchfalls Drive,Raleigh,27614,6.0,10.0,11472,1-2.9 Acres,2015.0,1/9/2020,1/25/2021,340.0,"$2,999,999","$2,875,000"
168795,2371485,RESIDENTIAL,Single Family Residence,4925 Avalaire Pines Drive,Raleigh,27614,4.0,6.0,7722,1-2.9 Acres,2021.0,3/11/2021,3/30/2021,0.0,"$3,024,750","$3,024,750"


In [293]:
# Replace float Acres values with simplified INT values in the 'Acres' column
combined_df['Acres'] = combined_df['Acres'].replace('0-.25 Acres', 1)
combined_df['Acres'] = combined_df['Acres'].replace('.26-.5 Acres', 2)
combined_df['Acres'] = combined_df['Acres'].replace('.51-.75 Acres', 3)
combined_df['Acres'] = combined_df['Acres'].replace('.76-.99 Acres', 4)
combined_df['Acres'] = combined_df['Acres'].replace('1-2.9 Acres', 5)
combined_df['Acres'] = combined_df['Acres'].replace('3-5.9 Acres', 6)
combined_df['Acres'] = combined_df['Acres'].replace('6-10.9 Acres', 7)
combined_df['Acres'] = combined_df['Acres'].replace('11+ Acres', 8)

#Convert the Acres column to an integer
combined_df['Acres'] = combined_df['Acres'].astype(int)

combined_df['Acres'].unique()


  combined_df['Acres'] = combined_df['Acres'].replace('11+ Acres', 8)


array([1, 6, 2, 3, 5, 4, 8, 7])

In [294]:
#Convert the Year Build column to an integer
combined_df['Year Built'] = combined_df['Year Built'].astype(int)

In [295]:
#Covert Zip to a string so we can slice it
combined_df['Zip'] = combined_df['Zip'].astype(str)

# Slice the 'Zip' column to just the first 5 characters
combined_df['Zip'] = combined_df['Zip'].str.slice(0, 5)

#Convert the Zip column to an integer
combined_df['Zip'] = combined_df['Zip'].astype(int)

#combined_df.isnull().sum()
#combined_df['Zip'].unique()



In [296]:
# Convert the 'List Date' and 'Closing Date' columns to datetime
combined_df['List Date'] = pd.to_datetime(combined_df['List Date'])
combined_df['Closing Date'] = pd.to_datetime(combined_df['Closing Date'])

# Calculate the number of days between 'List Date' and 'Closing Date'
combined_df['Days Between'] = (combined_df['Closing Date'] - combined_df['List Date']).dt.days

combined_df

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between
0,10010197,RESIDENTIAL,Single Family Residence,568 Leven Drive,Gibsonville,27249,4.0,3.0,2644,1,2023,2024-02-06,2024-05-24,0.0,"$372,490",$367,108
1,LP720573,RESIDENTIAL,Single Family Residence,735 NC 24 Highway,Cameron,28326,3.0,1.0,0,6,1950,2024-02-29,2024-04-16,0.0,"$210,000","$21,000",47
3,10016080,RESIDENTIAL,Single Family Residence,2317 Slater Avenue,Fayetteville,28301,2.0,1.0,602,1,1947,2024-03-08,2024-04-01,3.0,"$65,000","$40,000",24
4,10016675,RESIDENTIAL,Single Family Residence,301 S John Street,Goldsboro,27530,5.0,4.0,4906,2,1843,2024-03-11,2024-04-16,23.0,"$78,500","$40,000",36
5,10022872,RESIDENTIAL,Single Family Residence,609 S Pine Street,Rocky Mount,27803,2.0,1.0,1144,1,1923,2024-04-12,2024-04-26,1.0,"$39,000","$40,000",14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168792,2190197,RESIDENTIAL,Single Family Residence,12304 Birchfalls Drive,Raleigh,27614,5.0,8.0,11884,5,2002,2018-05-08,2021-03-05,785.0,"$3,000,000","$2,603,000",1032
168793,2374394,RESIDENTIAL,Single Family Residence,5005 Avalaire Pines Drive,Raleigh,27614,4.0,6.0,7395,5,2021,2021-03-26,2021-03-30,0.0,"$2,689,785","$2,689,785",4
168794,2295506,RESIDENTIAL,Single Family Residence,12404 Birchfalls Drive,Raleigh,27614,6.0,10.0,11472,5,2015,2020-01-09,2021-01-25,340.0,"$2,999,999","$2,875,000",382
168795,2371485,RESIDENTIAL,Single Family Residence,4925 Avalaire Pines Drive,Raleigh,27614,4.0,6.0,7722,5,2021,2021-03-11,2021-03-30,0.0,"$3,024,750","$3,024,750",19


In [297]:
# Count the number of 0 values in the 'Total Living Area SqFt' column
num_zeros = combined_df['Total Living Area SqFt'].value_counts().get(0, 0)
print(num_zeros)

16387


  num_zeros = combined_df['Total Living Area SqFt'].value_counts().get(0, 0)


In [298]:
#Convert the Bedrooms column to an integer
combined_df['Bedrooms'] = combined_df['Bedrooms'].astype(int)
combined_df['Bedrooms'].unique()

array([ 4,  3,  2,  5,  1,  0,  7,  6,  8, 13, 14, 10, 43,  9, 11, 44])

In [299]:
#Convert the Total Baths column to an integer
combined_df['Total Baths'] = combined_df['Total Baths'].astype(int)
combined_df['Total Baths'].unique()

array([ 3,  1,  4,  2,  0,  5,  8,  6,  7,  9, 10, 11, 12, 17, 32, 14, 26,
       34, 23, 22])

In [300]:
#Define the number of zeros in the 'Total Baths' column
num_zeros = combined_df['Total Baths'].value_counts().get(0, 0)
print(num_zeros)

50


In [301]:
# Drop all the rows that have a 0 in the 'Total Baths' column
combined_df = combined_df[combined_df['Total Baths'] != 0]

#Recheck the number of zeros in the 'Total Baths' column
num_zeros = combined_df['Total Baths'].value_counts().get(0, 0)
print(num_zeros)

0


In [302]:
#Covert Sqft to a string so we can remove the commas
combined_df['Total Living Area SqFt'] = combined_df['Total Living Area SqFt'].astype(str)

# Remove commas from the 'Total Living Area SqFt' column
combined_df['Total Living Area SqFt'] = combined_df['Total Living Area SqFt'].str.replace(',', '')

# Convert the 'Total Living Area SqFt' column to an integer
combined_df['Total Living Area SqFt'] = combined_df['Total Living Area SqFt'].astype(int)

# Drop all the rows that have a 0 in the 'Total Baths' column
combined_df = combined_df[combined_df['Total Living Area SqFt'] != 0]

num_zeros = combined_df['Total Living Area SqFt'].value_counts().get(0, 0)
print(num_zeros)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Total Living Area SqFt'] = combined_df['Total Living Area SqFt'].astype(str)


0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Total Living Area SqFt'] = combined_df['Total Living Area SqFt'].str.replace(',', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Total Living Area SqFt'] = combined_df['Total Living Area SqFt'].astype(int)


In [303]:
#Covert List Price and Sold Price to a string so we can remove the dollar sign and commas
combined_df['List Price'] = combined_df['List Price'].astype(str)
combined_df['Sold Price'] = combined_df['Sold Price'].astype(str)

# Remove dollar signs and commas from the 'List Price' and 'Sold Price' column
combined_df['List Price'] = combined_df['List Price'].replace('[$,]', '', regex=True)
combined_df['Sold Price'] = combined_df['Sold Price'].replace('[$,]', '', regex=True)

# Convert the 'List Price' and 'Sold Price' columns to integers
combined_df['List Price'] = combined_df['List Price'].astype(int)
combined_df['Sold Price'] = combined_df['Sold Price'].astype(int)

# Calculate the profit made on each house
combined_df['Profit'] = combined_df['Sold Price'] - combined_df['List Price']

combined_df.head(10)




Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between,Profit
0,10010197,RESIDENTIAL,Single Family Residence,568 Leven Drive,Gibsonville,27249,4,3,2644,1,2023,2024-02-06,2024-05-24,0.0,372490,367,108,-372123
3,10016080,RESIDENTIAL,Single Family Residence,2317 Slater Avenue,Fayetteville,28301,2,1,602,1,1947,2024-03-08,2024-04-01,3.0,65000,40000,24,-25000
4,10016675,RESIDENTIAL,Single Family Residence,301 S John Street,Goldsboro,27530,5,4,4906,2,1843,2024-03-11,2024-04-16,23.0,78500,40000,36,-38500
5,10022872,RESIDENTIAL,Single Family Residence,609 S Pine Street,Rocky Mount,27803,2,1,1144,1,1923,2024-04-12,2024-04-26,1.0,39000,40000,14,1000
8,10018705,RESIDENTIAL,Single Family Residence,102 Duval Drive,New Bern,28560,5,3,2930,3,2009,2024-03-22,2024-06-06,24.0,455000,45000,76,-410000
10,10007930,RESIDENTIAL,Single Family Residence,703 Elm Street,Weldon,27890,2,2,880,1,1900,2024-01-24,2024-04-05,43.0,70000,50000,72,-20000
11,10014026,RESIDENTIAL,Manufactured,144 Henderson Road,Roxboro,27573,3,1,684,2,1965,2024-02-27,2024-04-09,1.0,45000,50000,42,5000
12,2537002,RESIDENTIAL,Single Family Residence,104 N Adkin Street,Kinston,28501,2,1,1198,1,1923,2023-10-12,2024-04-01,137.0,63000,50000,172,-13000
14,10021849,RESIDENTIAL,Single Family Residence,2234 Pinpoint Road,Fayetteville,28312,3,1,1080,2,1952,2024-04-09,2024-05-23,10.0,52000,52000,44,0
15,10010602,RESIDENTIAL,Single Family Residence,511 Brock Road,Goldsboro,27530,2,1,775,4,1954,2024-02-08,2024-04-04,42.0,52500,54000,56,1500


In [304]:
# Count the number of negative numbers in the 'Profit' column
num_negative = combined_df[combined_df['Profit'] < 0].shape[0]
print(num_negative)

42272


In [305]:
# Count the number of numbers in the 'Profit' column that are less than -10000
num_negative = combined_df[combined_df['Profit'] < -100000].shape[0]
num_negative

672

In [306]:
combined_df.head(10)

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between,Profit
0,10010197,RESIDENTIAL,Single Family Residence,568 Leven Drive,Gibsonville,27249,4,3,2644,1,2023,2024-02-06,2024-05-24,0.0,372490,367,108,-372123
3,10016080,RESIDENTIAL,Single Family Residence,2317 Slater Avenue,Fayetteville,28301,2,1,602,1,1947,2024-03-08,2024-04-01,3.0,65000,40000,24,-25000
4,10016675,RESIDENTIAL,Single Family Residence,301 S John Street,Goldsboro,27530,5,4,4906,2,1843,2024-03-11,2024-04-16,23.0,78500,40000,36,-38500
5,10022872,RESIDENTIAL,Single Family Residence,609 S Pine Street,Rocky Mount,27803,2,1,1144,1,1923,2024-04-12,2024-04-26,1.0,39000,40000,14,1000
8,10018705,RESIDENTIAL,Single Family Residence,102 Duval Drive,New Bern,28560,5,3,2930,3,2009,2024-03-22,2024-06-06,24.0,455000,45000,76,-410000
10,10007930,RESIDENTIAL,Single Family Residence,703 Elm Street,Weldon,27890,2,2,880,1,1900,2024-01-24,2024-04-05,43.0,70000,50000,72,-20000
11,10014026,RESIDENTIAL,Manufactured,144 Henderson Road,Roxboro,27573,3,1,684,2,1965,2024-02-27,2024-04-09,1.0,45000,50000,42,5000
12,2537002,RESIDENTIAL,Single Family Residence,104 N Adkin Street,Kinston,28501,2,1,1198,1,1923,2023-10-12,2024-04-01,137.0,63000,50000,172,-13000
14,10021849,RESIDENTIAL,Single Family Residence,2234 Pinpoint Road,Fayetteville,28312,3,1,1080,2,1952,2024-04-09,2024-05-23,10.0,52000,52000,44,0
15,10010602,RESIDENTIAL,Single Family Residence,511 Brock Road,Goldsboro,27530,2,1,775,4,1954,2024-02-08,2024-04-04,42.0,52500,54000,56,1500


In [307]:
# Sort the DataFrame by the 'Profit' column in descending order
combined_df=combined_df.sort_values('Profit', ascending=False)
combined_df.head(10)    

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between,Profit
10656,10009864,RESIDENTIAL,Single Family Residence,100 Wall Creek Drive,Rolesville,27571,5,3,2981,2,1999,2024-02-03,2024-06-14,102.0,669000,6395000,132,5726000
10644,2531856,RESIDENTIAL,Single Family Residence,12 Stonecliff Lane,Sanford,27332,4,3,2017,5,2023,2023-09-12,2024-05-10,0.0,319999,3249999,241,2930000
10646,2485092,RESIDENTIAL,Single Family Residence,20 E Saint Andrews Drive,Zebulon,27597,3,3,2072,3,2023,2022-11-26,2024-05-15,0.0,366300,3269300,536,2903000
138191,2372641,RESIDENTIAL,Single Family Residence,2211 Wheeler Road,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,0.0,1850000,3795000,344,1945000
147685,2410217,RESIDENTIAL,Single Family Residence,7241 Burlington Road,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,102.0,1300000,3240000,469,1940000
25516,2375877,RESIDENTIAL,Single Family Residence,1129 Shoaf Stone,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,0.0,1800000,2900000,52,1100000
116764,2378933,RESIDENTIAL,Single Family Residence,1616 Hunting Ridge Road,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,0.0,2550000,3577591,526,1027591
64333,2495996,RESIDENTIAL,Single Family Residence,1490 Olives Chapel Road,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,0.0,1755900,2598548,273,842648
126968,2293916,RESIDENTIAL,Single Family Residence,106 Lochinvar Court,Cary,27511,5,6,7069,3,2022,2019-12-27,2022-12-30,0.0,2550000,3296301,1099,746301
90498,2448408,RESIDENTIAL,Single Family Residence,3424 Bellevue Road,Raleigh,27609,5,7,7004,3,1938,2022-05-11,2022-06-01,3.0,3500000,4200000,21,700000


In [308]:
# Edit the first occurrence of '6395000' in the 'Sold Price' column to '639500'
index = (combined_df['Sold Price'] == 6395000).idxmax()
combined_df.loc[index, 'Sold Price'] = 639500

# Edit the first occurrence of '3249999' in the 'Sold Price' column to '324999'
index = (combined_df['Sold Price'] == 3249999).idxmax()
combined_df.loc[index, 'Sold Price'] = 324999

# Edit the first occurrence of '3269300' in the 'Sold Price' column to '326930'
index = (combined_df['Sold Price'] == 3269300).idxmax()
combined_df.loc[index, 'Sold Price'] = 326930

# Recalculate the profit made on each house
combined_df['Profit'] = combined_df['Sold Price'] - combined_df['List Price']

# Sort the DataFrame by the 'Profit' column in descending order
combined_df=combined_df.sort_values('Profit', ascending=False)

combined_df

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between,Profit
138191,2372641,RESIDENTIAL,Single Family Residence,2211 Wheeler Road,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,0.0,1850000,3795000,344,1945000
147685,2410217,RESIDENTIAL,Single Family Residence,7241 Burlington Road,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,102.0,1300000,3240000,469,1940000
25516,2375877,RESIDENTIAL,Single Family Residence,1129 Shoaf Stone,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,0.0,1800000,2900000,52,1100000
116764,2378933,RESIDENTIAL,Single Family Residence,1616 Hunting Ridge Road,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,0.0,2550000,3577591,526,1027591
64333,2495996,RESIDENTIAL,Single Family Residence,1490 Olives Chapel Road,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,0.0,1755900,2598548,273,842648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116763,2433596,RESIDENTIAL,Single Family Residence,719 Weaver Dairy Road,Chapel Hill,27514,2,1,1214,7,1932,2022-02-26,2022-07-08,2.0,4300000,3550000,132,-750000
157479,2532421,RESIDENTIAL,Single Family Residence,7728 Grace Cove Lane,Wake Forest,27587,5,6,8767,8,2008,2023-09-15,2024-02-16,91.0,4900000,4080467,154,-819533
116766,2456529,RESIDENTIAL,Single Family Residence,223 Shady Drive,Burlington,27215,6,14,17988,5,1996,2022-06-17,2022-08-29,47.0,5900000,4702000,73,-1198000
76274,2357404,RESIDENTIAL,Single Family Residence,3319 White Oak Road,Raleigh,27609,6,11,20283,5,2000,2020-12-10,2023-08-15,931.0,4995000,3000000,978,-1995000


In [309]:
combined_df.sort_values('Profit', ascending=True)

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between,Profit
5420,10036327,RESIDENTIAL,Single Family Residence,1216 Caledon Drive,Mebane,27302,3,2,1860,1,2020,2024-04-22,2024-06-18,0.0,3837737,383737,57,-3454000
76274,2357404,RESIDENTIAL,Single Family Residence,3319 White Oak Road,Raleigh,27609,6,11,20283,5,2000,2020-12-10,2023-08-15,931.0,4995000,3000000,978,-1995000
116766,2456529,RESIDENTIAL,Single Family Residence,223 Shady Drive,Burlington,27215,6,14,17988,5,1996,2022-06-17,2022-08-29,47.0,5900000,4702000,73,-1198000
157479,2532421,RESIDENTIAL,Single Family Residence,7728 Grace Cove Lane,Wake Forest,27587,5,6,8767,8,2008,2023-09-15,2024-02-16,91.0,4900000,4080467,154,-819533
116763,2433596,RESIDENTIAL,Single Family Residence,719 Weaver Dairy Road,Chapel Hill,27514,2,1,1214,7,1932,2022-02-26,2022-07-08,2.0,4300000,3550000,132,-750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64333,2495996,RESIDENTIAL,Single Family Residence,1490 Olives Chapel Road,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,0.0,1755900,2598548,273,842648
116764,2378933,RESIDENTIAL,Single Family Residence,1616 Hunting Ridge Road,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,0.0,2550000,3577591,526,1027591
25516,2375877,RESIDENTIAL,Single Family Residence,1129 Shoaf Stone,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,0.0,1800000,2900000,52,1100000
147685,2410217,RESIDENTIAL,Single Family Residence,7241 Burlington Road,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,102.0,1300000,3240000,469,1940000


In [310]:
# Edit the first occurrence of '3837737' in the 'List Price' column to '383737'
index = (combined_df['List Price'] == 3837737).idxmax()
combined_df.loc[index, 'List Price'] = 383737

# Edit the first occurrence of '3249999' in the 'Sold Price' column to '324999'
index = (combined_df['Sold Price'] == 3249999).idxmax()
combined_df.loc[index, 'Sold Price'] = 324999

# Recalculate the profit made on each house
combined_df['Profit'] = combined_df['Sold Price'] - combined_df['List Price']

# Sort the DataFrame by the 'Profit' column in descending order
combined_df.sort_values('Profit', ascending=True)

Unnamed: 0,MLS #,Class,Property Type,Address,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,Days On Market,List Price,Sold Price,Days Between,Profit
76274,2357404,RESIDENTIAL,Single Family Residence,3319 White Oak Road,Raleigh,27609,6,11,20283,5,2000,2020-12-10,2023-08-15,931.0,4995000,3000000,978,-1995000
138191,2372641,RESIDENTIAL,Single Family Residence,2211 Wheeler Road,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,0.0,1850000,324999,344,-1525001
116766,2456529,RESIDENTIAL,Single Family Residence,223 Shady Drive,Burlington,27215,6,14,17988,5,1996,2022-06-17,2022-08-29,47.0,5900000,4702000,73,-1198000
157479,2532421,RESIDENTIAL,Single Family Residence,7728 Grace Cove Lane,Wake Forest,27587,5,6,8767,8,2008,2023-09-15,2024-02-16,91.0,4900000,4080467,154,-819533
116763,2433596,RESIDENTIAL,Single Family Residence,719 Weaver Dairy Road,Chapel Hill,27514,2,1,1214,7,1932,2022-02-26,2022-07-08,2.0,4300000,3550000,132,-750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126968,2293916,RESIDENTIAL,Single Family Residence,106 Lochinvar Court,Cary,27511,5,6,7069,3,2022,2019-12-27,2022-12-30,0.0,2550000,3296301,1099,746301
64333,2495996,RESIDENTIAL,Single Family Residence,1490 Olives Chapel Road,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,0.0,1755900,2598548,273,842648
116764,2378933,RESIDENTIAL,Single Family Residence,1616 Hunting Ridge Road,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,0.0,2550000,3577591,526,1027591
25516,2375877,RESIDENTIAL,Single Family Residence,1129 Shoaf Stone,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,0.0,1800000,2900000,52,1100000


In [311]:
#Drop the MLS Number, Days On Market, and Class column
#combined_df = combined_df.drop('MLS #', axis=1) - This column is needed for the final output as a UID

#Drop Unnecessary Columns
combined_df = combined_df.drop('Days On Market', axis=1)
combined_df = combined_df.drop('Class', axis=1)
combined_df = combined_df.drop(columns=['Address'])

In [312]:
combined_df.head(10)

Unnamed: 0,MLS #,Property Type,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days Between,Profit
138191,2372641,Single Family Residence,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,1850000,324999,344,-1525001
147685,2410217,Single Family Residence,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,1300000,3240000,469,1940000
25516,2375877,Single Family Residence,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,1800000,2900000,52,1100000
116764,2378933,Single Family Residence,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,2550000,3577591,526,1027591
64333,2495996,Single Family Residence,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,1755900,2598548,273,842648
126968,2293916,Single Family Residence,Cary,27511,5,6,7069,3,2022,2019-12-27,2022-12-30,2550000,3296301,1099,746301
90498,2448408,Single Family Residence,Raleigh,27609,5,7,7004,3,1938,2022-05-11,2022-06-01,3500000,4200000,21,700000
75605,2508624,Single Family Residence,Raleigh,27610,3,2,1450,5,1958,2023-05-03,2023-09-11,249900,850000,131,600100
64332,2428206,Single Family Residence,Apex,27523,4,5,6864,6,2022,2022-01-24,2023-12-18,2000000,2566043,693,566043
90494,2437975,Single Family Residence,Durham,27707,4,5,5713,3,2020,2022-03-22,2022-05-02,2950000,3500000,41,550000


In [313]:

combined_df['Property Type'].unique()

array(['Single Family Residence', 'Condo', 'Manufactured', 'Townhouse',
       'Triplex', 'Duplex', 'Ranch', 'Other', 'Farm', 'Cabin',
       'Quadruplex'], dtype=object)

In [314]:
# Initialize pgeocode Nominatim object for the United States
nomi = pgeocode.Nominatim('us')

# Function to get latitude and longitude
def get_lat_long(zip_code):
    location = nomi.query_postal_code(zip_code)
    return location.latitude, location.longitude

# Apply the function to each zip code
combined_df[['latitude', 'longitude']] = combined_df['Zip'].apply(lambda x: pd.Series(get_lat_long(x)))

combined_df

Unnamed: 0,MLS #,Property Type,City,Zip,Bedrooms,Total Baths,Total Living Area SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days Between,Profit,latitude,longitude
138191,2372641,Single Family Residence,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,1850000,324999,344,-1525001,35.8014,-78.6877
147685,2410217,Single Family Residence,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,1300000,3240000,469,1940000,36.0330,-79.5972
25516,2375877,Single Family Residence,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,1800000,2900000,52,1100000,35.9815,-78.5392
116764,2378933,Single Family Residence,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,2550000,3577591,526,1027591,35.8887,-78.6393
64333,2495996,Single Family Residence,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,1755900,2598548,273,842648,35.7225,-78.8408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116763,2433596,Single Family Residence,Chapel Hill,27514,2,1,1214,7,1932,2022-02-26,2022-07-08,4300000,3550000,132,-750000,35.9203,-79.0372
157479,2532421,Single Family Residence,Wake Forest,27587,5,6,8767,8,2008,2023-09-15,2024-02-16,4900000,4080467,154,-819533,35.9815,-78.5392
116766,2456529,Single Family Residence,Burlington,27215,6,14,17988,5,1996,2022-06-17,2022-08-29,5900000,4702000,73,-1198000,36.0318,-79.4889
76274,2357404,Single Family Residence,Raleigh,27609,6,11,20283,5,2000,2020-12-10,2023-08-15,4995000,3000000,978,-1995000,35.8480,-78.6317


In [315]:
#Change the 'Latitude' column name to 'Lat'
combined_df = combined_df.rename(columns={"latitude": "Lat"})

#Change the 'Longitude' column name to 'Lon'
combined_df = combined_df.rename(columns={"longitude": "Lon"})

#Change the 'Total Living Area SqFt' column name to 'SqFt'
combined_df = combined_df.rename(columns={"Total Living Area SqFt": "SqFt"})

#Change the 'Profit' column name to 'Over Asking'  
combined_df = combined_df.rename(columns={"Profit": "Over Asking"})

#Change the 'Days Between' column name to 'Days on Market'
combined_df = combined_df.rename(columns={"Days Between": "Days on Market"})


combined_df.head()

Unnamed: 0,MLS #,Property Type,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon
138191,2372641,Single Family Residence,Raleigh,27607,5,8,9376,2,2022,2021-03-16,2022-02-23,1850000,324999,344,-1525001,35.8014,-78.6877
147685,2410217,Single Family Residence,Whitsett,27377,5,7,6983,8,1883,2021-09-27,2023-01-09,1300000,3240000,469,1940000,36.033,-79.5972
25516,2375877,Single Family Residence,Wake Forest,27587,5,5,7200,7,2005,2021-04-05,2021-05-27,1800000,2900000,52,1100000,35.9815,-78.5392
116764,2378933,Single Family Residence,Raleigh,27615,5,7,6289,3,2022,2021-04-20,2022-09-28,2550000,3577591,526,1027591,35.8887,-78.6393
64333,2495996,Single Family Residence,Apex,27502,4,6,7266,5,2023,2023-02-21,2023-11-21,1755900,2598548,273,842648,35.7225,-78.8408


In [316]:
combined_df.isnull().sum()

MLS #               0
Property Type       0
City                0
Zip                 0
Bedrooms            0
Total Baths         0
SqFt                0
Acres               0
Year Built          0
List Date           0
Closing Date        0
List Price          0
Sold Price          0
Days on Market      0
Over Asking         0
Lat               139
Lon               139
dtype: int64

In [317]:
#Drop the rows with missing values
combined_df.dropna(inplace=True)

combined_df.isnull().sum()

MLS #             0
Property Type     0
City              0
Zip               0
Bedrooms          0
Total Baths       0
SqFt              0
Acres             0
Year Built        0
List Date         0
Closing Date      0
List Price        0
Sold Price        0
Days on Market    0
Over Asking       0
Lat               0
Lon               0
dtype: int64

In [318]:
# Count the number of 0 values in the 'Lat' and 'Lon' columns
num_zeros = combined_df['Lon'].value_counts().get(0, 0)

print(num_zeros)

0


In [319]:
# Save the DataFrame to a CSV file
combined_df.to_csv('Clean_Data/mls_cleaned.csv', index=False)