<div style="font-weight: bold; font-size: x-large; color: blue">***Setup***</div>

In [1]:
import cudf
import pandas as pd

In [2]:
#Returns missing values
def getMissing(df):
    totalRows = df.shape[0]

    temp = cudf.DataFrame({
        "Missing": df.isnull().sum(),
        "Missing %": (df.isnull().sum() / totalRows) * 100
        
    })
    
    temp = temp[temp['Missing'] > 0]
    
    return temp.sort_values(by=['Missing'], ascending=False)  

<div style="font-weight: bold; font-size: x-large; color: blue">***1. Merge Datasets***</div>

<div style="font-weight: bold; font-size: large; color: blue">1a. Housing Data:</div>

In [3]:
#Read
housing_df = cudf.read_csv("Datasets/housing_data.csv")

#Get missing
getMissing(housing_df)

Unnamed: 0,Missing,Missing %
Style,23937,100.0
Garage,23886,99.786941
Street Number 2,23859,99.674145
RT007_ExpiredDate,22791,95.212433
Unit Number,22572,94.297531
Sold Date,5829,24.351422
Sold Price,5827,24.343067
Pending Date,5087,21.251619
Sq Foot,3117,13.021682
Lot Size,505,2.109705


In [4]:
#Filter rows
housing_df = housing_df[
    (housing_df['Status'] == 'Sold') & 
    (housing_df['Property Type'] == 'Single Family') &
    ((housing_df['City Name'] == 'Bakersfield') | (housing_df['City Name'] == 'Delano'))
]

#Drop columns
housing_df.drop([    
    'Status',
    'Property Type',
    #
    'Style',
    'Garage',
    'Street Number 2',    
    'RT007_ExpiredDate',
    'Unit Number',  
    #
    'Pending Date'
], axis=1, inplace=True)

#Set data types
housing_df['ML Number'] = cudf.to_numeric(housing_df['ML Number'])

#Trim
housing_df['Zipcode'] = housing_df['Zipcode'].astype(str).str.strip()

#Extract sold month and year
housing_df['Sold Date'] = cudf.to_datetime(housing_df['Sold Date'], format='%m/%d/%y')

#No need to trim
housing_df['Sold Month & Year'] = housing_df['Sold Date'].dt.year.astype(str) + "-" + housing_df['Sold Date'].dt.month.astype(str)

#Print 'Sold Month & Year'
housing_df[[
    'Sold Date',
    'Sold Month & Year'
]].head()

Unnamed: 0,Sold Date,Sold Month & Year
13,2024-11-22,2024-11
22,2024-10-31,2024-10
37,2024-11-15,2024-11
64,2024-11-20,2024-11
193,2024-11-27,2024-11


<div style="font-weight: bold; font-size: large; color: blue">1b. Mortgage Data:</div>

In [5]:
#Read
mortgage_rates_df = cudf.read_csv("Datasets/mortgage_rates.csv")

#Split into month, day, year
#expand=True: Return DataFrame/MultiIndex expanding dimensionality.
temp = mortgage_rates_df['date'].str.split('/', expand=True)

#Ensure day and month have leading zeros
mortgage_rates_df['date_modified'] = temp[0].str.zfill(2) + '/' + temp[1].str.zfill(2) + '/' + temp[2]

#Extract mortgage month and year
mortgage_rates_df['date_modified'] = cudf.to_datetime(mortgage_rates_df['date_modified'], format='%m/%d/%Y')

#No need to trim
mortgage_rates_df['month_year'] = mortgage_rates_df['date_modified'].dt.year.astype(str) + "-" + mortgage_rates_df['date_modified'].dt.month.astype(str)

#Print 'month_year'
mortgage_rates_df[[
    'date',
    'date_modified',
    'month_year'
]].head()

Unnamed: 0,date,date_modified,month_year
0,4/2/1971,1971-04-02,1971-4
1,4/9/1971,1971-04-09,1971-4
2,4/16/1971,1971-04-16,1971-4
3,4/23/1971,1971-04-23,1971-4
4,4/30/1971,1971-04-30,1971-4


In [6]:
#Aggregate mortgage data
mortgage_rates_agg_df = mortgage_rates_df.groupby(['month_year']).agg(
    pmms30_mean=('pmms30', 'mean'),  
    pmms15_mean=('pmms15', 'mean')
)

#Rename
mortgage_rates_agg_df = mortgage_rates_agg_df.rename(columns={
    'pmms30_mean': 'Avg Monthly Mortgage Rate (30Y)',
    'pmms15_mean': 'Avg Monthly Mortgage Rate (15Y)'
})

#Print
mortgage_rates_agg_df.sort_values('month_year', ascending=False).head(13)

Unnamed: 0_level_0,Avg Monthly Mortgage Rate (30Y),Avg Monthly Mortgage Rate (15Y)
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-1,6.96,6.18
2024-9,6.18,5.2625
2024-8,6.5,5.682
2024-7,6.8475,6.135
2024-6,6.9175,6.1875
2024-5,7.06,6.346
2024-4,6.9925,6.2625
2024-3,6.82,6.175
2024-2,6.776,6.102
2024-12,6.715,5.93


In [7]:
#Merge
merged_df = cudf.merge(housing_df, mortgage_rates_agg_df, left_on=['Sold Month & Year'], right_on=['month_year'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'Sold Month & Year',
    'Avg Monthly Mortgage Rate (30Y)',
    'Avg Monthly Mortgage Rate (15Y)'
]].sort_values('Sold Month & Year', ascending=False).head()

Unnamed: 0,Sold Month & Year,Avg Monthly Mortgage Rate (30Y),Avg Monthly Mortgage Rate (15Y)
1264,2024-9,6.18,5.2625
1050,2024-9,6.18,5.2625
1141,2024-9,6.18,5.2625
1317,2024-9,6.18,5.2625
1391,2024-9,6.18,5.2625


In [8]:
#Drop columns
merged_df.drop([    
    'Sold Month & Year'
], axis=1, inplace=True)

<div style="font-weight: bold; font-size: large; color: blue">1c. Commute Scores:</div>

In [9]:
#Read
commute_scores_df = cudf.read_csv("Datasets/commute_scores.csv")

#Filter columns
temp = commute_scores_df[[
    'ML Number',
    'Walk Score',
    'Transit Score',
    'Bike Score'
]]

#Set data types
temp['ML Number'] = cudf.to_numeric(temp['ML Number'])

#Print
temp.sort_values('ML Number').head()

Unnamed: 0,ML Number,Walk Score,Transit Score,Bike Score
22088,21913768,45.0,28.0,38.0
21694,21913793,61.0,30.0,36.0
21707,21914052,24.0,22.0,34.0
21522,21914165,35.0,30.0,36.0
21679,21914280,63.0,35.0,30.0


In [10]:
#Merge
merged_df = cudf.merge(merged_df, temp, left_on=['ML Number'], right_on=['ML Number'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'ML Number',
    'Walk Score',
    'Transit Score',
    'Bike Score'
]].sort_values('ML Number').head()

Unnamed: 0,ML Number,Walk Score,Transit Score,Bike Score
14007,21913768,45.0,28.0,38.0
879,21913793,61.0,30.0,36.0
7287,21914052,24.0,22.0,34.0
9158,21914165,35.0,30.0,36.0
867,21914280,63.0,35.0,30.0


<div style="font-weight: bold; font-size: large; color: blue">1d. School Ratings:</div>

In [11]:
#Read
school_ratings_df = cudf.read_csv("Datasets/school_ratings.csv")

#Filter columns
school_ratings_subset_df = school_ratings_df[[
    'School Name',
    'Grade Classification',
    'Ratings out of 10'
]]

#Trim
school_ratings_subset_df['School Name'] = school_ratings_subset_df['School Name'].str.strip()

school_ratings_subset_df['Grade Classification'] = school_ratings_subset_df['Grade Classification'].str.strip()

In [12]:
#Merge elementary schools
temp = school_ratings_subset_df[school_ratings_subset_df['Grade Classification'] == 'Elementary School']

temp = temp.rename(columns={'Ratings out of 10': "Elementary School Ratings"})

#Print
temp.sort_values('School Name').head()

Unnamed: 0,School Name,Grade Classification,Elementary School Ratings
132,Alicante Avenue,Elementary School,4
0,Almondale,Elementary School,9
1,American,Elementary School,8
3,Beardsley,Elementary School,8
4,Berkshire,Elementary School,8


In [13]:
#Merge
merged_df = cudf.merge(merged_df, temp, left_on=['Elementry School'], right_on=['School Name'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'Elementry School',
    'Elementary School Ratings'
]].drop_duplicates(subset=['Elementry School']).sort_values('Elementry School').head(10)

Unnamed: 0,Elementry School,Elementary School Ratings
1055,Adult Community,
79,Albany Park,
4202,Alicante Avenue,4.0
1253,Almondale,9.0
80,American,8.0
92,Beardsley,8.0
37,Berkshire,8.0
122,Bimat,7.0
127,Browning Road,
9,Buena Vista,7.0


In [14]:
#Merge junior high schools
temp = school_ratings_subset_df[school_ratings_subset_df['Grade Classification'] == 'Junior High School']

temp = temp.rename(columns={'Ratings out of 10': "Junior High School Ratings"})

#Print
temp.sort_values('School Name').head()

Unnamed: 0,School Name,Grade Classification,Junior High School Ratings
7,Actis O. J.,Junior High School,7
8,"Actis, O. J.",Junior High School,7
10,Beardsley,Junior High School,7
11,"Cato, Paul L.",Junior High School,7
15,Chipman,Junior High School,7


In [15]:
#Merge
merged_df = cudf.merge(merged_df, temp, left_on=['Junior High School'], right_on=['School Name'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'Junior High School',
    'Junior High School Ratings'
]].drop_duplicates(subset=['Junior High School']).sort_values('Junior High School').head(10)

Unnamed: 0,Junior High School,Junior High School Ratings
13729,Actis O. J.,7.0
1,"Actis, O. J.",7.0
13679,Adult Community,
79,Almond Tree,
41,Beardsley,7.0
2719,Caliente,
3333,"Cato, Paul L.",7.0
159,Cecil Avenue,
906,Chipman,7.0
899,Compton,7.0


In [16]:
#Merge high schools
temp = school_ratings_subset_df[school_ratings_subset_df['Grade Classification'] == 'High School']

temp = temp.rename(columns={'Ratings out of 10': "High School Ratings"})

#Print
temp.sort_values('School Name').head()

Unnamed: 0,School Name,Grade Classification,High School Ratings
141,Arvin,High School,4
2,Bakersfield,High School,8
5,Centennial,High School,8
142,East,High School,4
6,Foothill,High School,7


In [17]:
#Merge
merged_df = cudf.merge(merged_df, temp, left_on=['High School'], right_on=['School Name'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'High School',
    'High School Ratings'
]].drop_duplicates(subset=['High School']).sort_values('High School').head(10)

Unnamed: 0,High School,High School Ratings
6783,Adult Community,
2673,Arvin,4.0
5,Bakersfield,8.0
17,Centennial,8.0
239,Cesar Chavez,
639,Del Oro,
15,Delano,
170,East,4.0
183,Foothill,7.0
6,Frontier,7.0


In [18]:
#Drop columns
merged_df.drop([    
    'School Name',
    'Grade Classification'
], axis=1, inplace=True)

<div style="font-weight: bold; font-size: large; color: blue">1e. Sex Offender Data:</div>

In [19]:
#Read
sex_offender_df = cudf.read_csv("Datasets/sex_offender_data.csv")

#Trim
sex_offender_df['Zipcode'] = sex_offender_df['Zipcode'].astype(str).str.strip()

#Rename column
sex_offender_df = sex_offender_df.rename(columns={'Count': "Sex Offender Count"})

#Print
sex_offender_df.sort_values('Zipcode').head()

Unnamed: 0,Zipcode,Sex Offender Count
10,93301,79
9,93302,1
12,93303,0
3,93304,123
7,93305,127


In [20]:
#Merge
merged_df = cudf.merge(merged_df, sex_offender_df, left_on=['Zipcode'], right_on=['Zipcode'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'Zipcode',
    'Sex Offender Count'
]].drop_duplicates(subset=['Zipcode']).sort_values('Zipcode').head()

Unnamed: 0,Zipcode,Sex Offender Count
15,93215,
11694,9325,
29,93301,79.0
7038,93302,1.0
2031,93303,0.0


<div style="font-weight: bold; font-size: large; color: blue">1f. Driving Distance to the Nearest Shopping Mall and Downtown:</div>

In [21]:
#Read
distance_duration_df = cudf.read_csv("Datasets/distance_duration.csv")

#Set data types
distance_duration_df['ML Number'] = cudf.to_numeric(distance_duration_df['ML Number'])

#Drop columns
distance_duration_df.drop([
    #Google place ID
    "Shopping Mall ID",
    #It's blank
    "Shopping Mall Addr",
    "Shopping Mall Loc Types",
    #It's blank
    "Shopping Mall Loc Type",
    "Shopping Mall Latitude",
    "Shopping Mall Longitude",
    #It's false for all rows
    "Shopping Mall Permanently Closed",
    "Shopping Mall Driving Duration With Traffic in Secs",
    "Downtown Driving Duration With Traffic in Secs"
    
    
], axis=1, inplace=True)

#Rename columns
distance_duration_df = distance_duration_df.rename(columns={
    'Shopping Mall Name': 'Mall Name',
    'Shopping Mall Vicinity': 'Mall Vicinity',
    'Shopping Mall Driving Distance in Meters': 'Mall Distance (Meters)',
    'Shopping Mall Driving Duration Without Traffic in Secs': 'Mall Duration (Secs)',
    'Shopping Mall Usr Ratings Total': 'Mall Ratings Total',
    'Shopping Mall Ratings': 'Mall Ratings',    
    'Downtown Driving Distance in Meters': 'Downtown Distance (Meters)',
    'Downtown Driving Duration Without Traffic in Secs': 'Downtown Duration (Secs)'
})

#Print
distance_duration_df.sort_values('ML Number').head()

Unnamed: 0,ML Number,Mall Name,Mall Vicinity,Mall Distance (Meters),Mall Duration (Secs),Mall Ratings Total,Mall Ratings,Downtown Distance (Meters),Downtown Duration (Secs)
18613,21913768,The Palms Shopping Center,"Panama Lane, Bakersfield",891,151,324,4.2,15717,940
18285,21913793,White Lane Plaza Shopping Center,"3019 Wilson Road, Bakersfield",2461,255,19,4.3,11043,669
18296,21914052,Riverlakes Village,"4420-4580 Coffee Road, Bakersfield",3165,354,0,0.0,8088,657
18139,21914165,Ridgeview Plaza,"3400 Panama Lane R, Bakersfield",5006,569,15,4.3,16221,1040
18272,21914280,Country Club Plaza,"3711 Columbus Street, Bakersfield",522,99,209,4.3,8552,569


In [22]:
#Merge
merged_df = cudf.merge(merged_df, distance_duration_df, left_on=['ML Number'], right_on=['ML Number'], how='left', suffixes=('', ''))

#Print
merged_df[[
    'ML Number',
    'Mall Name',
    'Mall Vicinity',
    'Mall Distance (Meters)',
    'Mall Duration (Secs)',
    'Mall Ratings Total',
    'Mall Ratings',
    'Downtown Distance (Meters)',
    'Downtown Duration (Secs)'
]].sort_values('ML Number').head()

Unnamed: 0,ML Number,Mall Name,Mall Vicinity,Mall Distance (Meters),Mall Duration (Secs),Mall Ratings Total,Mall Ratings,Downtown Distance (Meters),Downtown Duration (Secs)
14037,21913768,The Palms Shopping Center,"Panama Lane, Bakersfield",891,151,324,4.2,15717,940
4734,21913793,White Lane Plaza Shopping Center,"3019 Wilson Road, Bakersfield",2461,255,19,4.3,11043,669
9415,21914052,Riverlakes Village,"4420-4580 Coffee Road, Bakersfield",3165,354,0,0.0,8088,657
10454,21914165,Ridgeview Plaza,"3400 Panama Lane R, Bakersfield",5006,569,15,4.3,16221,1040
4722,21914280,Country Club Plaza,"3711 Columbus Street, Bakersfield",522,99,209,4.3,8552,569


<div style="font-weight: bold; font-size: x-large; color: blue">***2. Save***</div>

<div style="font-weight: bold; font-size: large; color: blue">2a. Get Missing Values:</div>

In [23]:
#Print missing values
getMissing(merged_df)

Unnamed: 0,Missing,Missing %
Transit Score,3100,21.314631
Sq Foot,1260,8.663366
Junior High School Ratings,764,5.253025
Elementary School Ratings,331,2.275853
High School Ratings,321,2.207096
Sex Offender Count,255,1.7533
Lot Size,100,0.687569
Bike Score,97,0.666942
Street Number 1,18,0.123762
Cross Street,3,0.020627


<div style="font-weight: bold; font-size: large; color: blue">2c. Set Data Types:</div>

In [24]:
#Print unique values
temp = [
    'DOM',
    'Bedrooms',
    'Bathrooms',
    'Year Built',
    'Walk Score',
    'Transit Score',
    'Bike Score',
    'Elementary School Ratings',
    'Junior High School Ratings',
    'High School Ratings',
    'Sex Offender Count',
    'Mall Name'
]

for c in temp:
    uniqueVals = merged_df[c].unique()

    print(c)
    print(uniqueVals)
    print('\n\n')

DOM
0       64
1       10
2        4
3       21
4       25
      ... 
130    160
131     99
132    119
133    159
134    105
Name: DOM, Length: 135, dtype: int64



Bedrooms
0    4
1    3
2    2
3    5
4    1
5    6
6    7
7    0
8    9
9    8
Name: Bedrooms, dtype: int64



Bathrooms
0     2.00
1     2.50
2     3.00
3     1.00
4     1.75
5     3.50
6     1.50
7     4.00
8     2.75
9     4.50
10    5.50
11    5.00
12    0.00
13    3.75
14    6.00
15    6.50
16    4.75
17    8.00
Name: Bathrooms, dtype: float64



Year Built
0      2013
1      2005
2      1990
3      2018
4      1996
       ... 
123    1914
124    1906
125    1905
126    1909
127    1899
Name: Year Built, Length: 128, dtype: int64



Walk Score
0     42.0
1      8.0
2     65.0
3      6.0
4     17.0
      ... 
84    77.0
85    82.0
86    87.0
87    91.0
88    89.0
Name: Walk Score, Length: 89, dtype: float64



Transit Score
0     16.0
1     18.0
2     28.0
3     <NA>
4     37.0
5     11.0
6     29.0
7     25.0
8     22.

In [25]:
#Numeric
merged_df['ML Number'] = merged_df['ML Number'].astype(str).str.replace(',', '').astype('int')
merged_df['Original Price'] = merged_df['Original Price'].astype(str).str.replace(',', '').astype('float')
merged_df['List Price'] = merged_df['List Price'].astype(str).str.replace(',', '').astype('float')
merged_df['Sold Price'] = merged_df['Sold Price'].astype(str).str.replace(',', '').astype('float')
merged_df['DOM'] = merged_df['DOM'].astype(str).str.replace(',', '').astype('float')
#Treat it as numeric: Although the number of bedrooms is a discrete integer, it represents a natural ordinal relationship. Treating it as a numeric variable allows the model to capture the linear or non-linear impact of additional bedrooms on home value.
merged_df['Bedrooms'] = merged_df['Bedrooms'].astype(str).str.replace(',', '').astype('float')
#Treat it as numeric: Bathrooms include fractional values (e.g., 2.5, 1.75), which suggest they should be treated as numeric. The model can learn that adding a half or a quarter bathroom affects home value in a proportional way.
merged_df['Bathrooms'] = merged_df['Bathrooms'].astype(str).str.replace(',', '').astype('float')
merged_df['Sq Foot'] = merged_df['Sq Foot'].astype(str).str.replace(',', '').astype('float')
merged_df['Lot Size'] = merged_df['Lot Size'].astype(str).str.replace(',', '').astype('float')
merged_df['Year Built'] = merged_df['Year Built'].astype(str).str.replace(',', '').astype('float')
merged_df['Latitude'] = merged_df['Latitude'].astype(str).str.replace(',', '').astype('float')
merged_df['Longitude'] = merged_df['Longitude'].astype(str).str.replace(',', '').astype('float')
merged_df['Avg Monthly Mortgage Rate (30Y)'] = merged_df['Avg Monthly Mortgage Rate (30Y)'].astype(str).str.replace(',', '').astype('float')
merged_df['Avg Monthly Mortgage Rate (15Y)'] = merged_df['Avg Monthly Mortgage Rate (15Y)'].astype(str).str.replace(',', '').astype('float')
merged_df['Walk Score'] = merged_df['Walk Score'].astype(str).str.replace(',', '').astype('float')
merged_df['Transit Score'] = merged_df['Transit Score'].astype(str).str.replace(',', '').astype('float')
merged_df['Bike Score'] = merged_df['Bike Score'].astype(str).str.replace(',', '').astype('float')
#Treat school ratings as numeric: They have a natural order (higher numbers represent better ratings), so they should be treated as numeric variables rather than categorical. Treating them as numeric allows the model to capture linear and non-linear relationships between school quality and home value.
merged_df['Elementary School Ratings'] = merged_df['Elementary School Ratings'].astype(str).str.replace(',', '').astype('float')
merged_df['Junior High School Ratings'] = merged_df['Junior High School Ratings'].astype(str).str.replace(',', '').astype('float')
merged_df['High School Ratings'] = merged_df['High School Ratings'].astype(str).str.replace(',', '').astype('float')
merged_df['Sex Offender Count'] = merged_df['Sex Offender Count'].astype(str).str.replace(',', '').astype('float')
merged_df['Mall Distance (Meters)'] = merged_df['Mall Distance (Meters)'].astype(str).str.replace(',', '').astype('float')
merged_df['Mall Duration (Secs)'] = merged_df['Mall Duration (Secs)'].astype(str).str.replace(',', '').astype('float')
merged_df['Mall Ratings Total'] = merged_df['Mall Ratings Total'].astype(str).str.replace(',', '').astype('float')
merged_df['Mall Ratings'] = merged_df['Mall Ratings'].astype(str).str.replace(',', '').astype('float')
merged_df['Downtown Distance (Meters)'] = merged_df['Downtown Distance (Meters)'].astype(str).str.replace(',', '').astype('float')
merged_df['Downtown Duration (Secs)'] = merged_df['Downtown Duration (Secs)'].astype(str).str.replace(',', '').astype('float')

#Categorical
merged_df['City Name'] = merged_df['City Name'].astype('category')
merged_df['Zipcode'] = merged_df['Zipcode'].astype('category')
merged_df['Area'] = merged_df['Area'].astype('category')
merged_df['Pool_None'] = merged_df['Pool_None'].astype('category')
merged_df['Pool_Inground'] = merged_df['Pool_Inground'].astype('category')
merged_df['Pool_Community'] = merged_df['Pool_Community'].astype('category')
merged_df['Pool_Spa'] = merged_df['Pool_Spa'].astype('category')
merged_df['Pool_Above_Ground'] = merged_df['Pool_Above_Ground'].astype('category')
merged_df['Heating_Central_AC'] = merged_df['Heating_Central_AC'].astype('category')
merged_df['Heating_Central_Heat'] = merged_df['Heating_Central_Heat'].astype('category')
merged_df['Heating_Other'] = merged_df['Heating_Other'].astype('category')
merged_df['Heating_Evaporative'] = merged_df['Heating_Evaporative'].astype('category')
merged_df['Heating_Floor/Wall_Heater'] = merged_df['Heating_Floor/Wall_Heater'].astype('category')
merged_df['Heating_Wood_Burning_Stove'] = merged_df['Heating_Wood_Burning_Stove'].astype('category')
merged_df['Heating_Gas'] = merged_df['Heating_Gas'].astype('category')
merged_df['Heating_Electric'] = merged_df['Heating_Electric'].astype('category')
merged_df['Heating_Propane'] = merged_df['Heating_Propane'].astype('category')
merged_df['Interior_Great_Room'] = merged_df['Interior_Great_Room'].astype('category')
merged_df['Interior_Formal_Dining'] = merged_df['Interior_Formal_Dining'].astype('category')
merged_df['Interior_Breakfast_Area'] = merged_df['Interior_Breakfast_Area'].astype('category')
merged_df['Interior_Indoor_Utility'] = merged_df['Interior_Indoor_Utility'].astype('category')
merged_df['Interior_Split_Wing'] = merged_df['Interior_Split_Wing'].astype('category')
merged_df['Interior_Formal_Living'] = merged_df['Interior_Formal_Living'].astype('category')
merged_df['Interior_Bonus_Room'] = merged_df['Interior_Bonus_Room'].astype('category')
merged_df['Interior_Office'] = merged_df['Interior_Office'].astype('category')
merged_df['Interior_Sep._Family_Room'] = merged_df['Interior_Sep._Family_Room'].astype('category')
merged_df['Interior_Extended_Living'] = merged_df['Interior_Extended_Living'].astype('category')
merged_df['Interior_Handicap'] = merged_df['Interior_Handicap'].astype('category')
merged_df['Interior_Basement'] = merged_df['Interior_Basement'].astype('category')
merged_df['Exterior_Brick'] = merged_df['Exterior_Brick'].astype('category')
merged_df['Exterior_Stucco'] = merged_df['Exterior_Stucco'].astype('category')
merged_df['Exterior_Wood'] = merged_df['Exterior_Wood'].astype('category')
merged_df['Exterior_Other'] = merged_df['Exterior_Other'].astype('category')
merged_df['Exterior_Steel_/_Metal'] = merged_df['Exterior_Steel_/_Metal'].astype('category')
merged_df['Exterior_Brick_Veneer'] = merged_df['Exterior_Brick_Veneer'].astype('category')
merged_df['Exterior_Wood_Frame'] = merged_df['Exterior_Wood_Frame'].astype('category')
merged_df['Other Features_Horse'] = merged_df['Other Features_Horse'].astype('category')
merged_df['Other Features_Alley'] = merged_df['Other Features_Alley'].astype('category')
merged_df['Other Features_Cul_De_Sac'] = merged_df['Other Features_Cul_De_Sac'].astype('category')
merged_df['Other Features_Corner'] = merged_df['Other Features_Corner'].astype('category')
merged_df['Other Features_Gated_Community'] = merged_df['Other Features_Gated_Community'].astype('category')
merged_df['Other Features_Adult_Community'] = merged_df['Other Features_Adult_Community'].astype('category')
merged_df['Other Features_Mountain'] = merged_df['Other Features_Mountain'].astype('category')
merged_df['Other Features_River'] = merged_df['Other Features_River'].astype('category')
merged_df['Other Features_Lake'] = merged_df['Other Features_Lake'].astype('category')
merged_df['Other Features_Golf_Course'] = merged_df['Other Features_Golf_Course'].astype('category')
merged_df['Other Features_Truck_Door'] = merged_df['Other Features_Truck_Door'].astype('category')
merged_df['Other Features_Partial_Fenced'] = merged_df['Other Features_Partial_Fenced'].astype('category')
merged_df['Other Features_Overhead_Door'] = merged_df['Other Features_Overhead_Door'].astype('category')
merged_df['Other Features_Additional_Buildings'] = merged_df['Other Features_Additional_Buildings'].astype('category')
merged_df['Other Features_Wet_Sprinklers'] = merged_df['Other Features_Wet_Sprinklers'].astype('category')
merged_df['Other Features_Security_Fence'] = merged_df['Other Features_Security_Fence'].astype('category')
merged_df['Other Features_Burglar_Alarm'] = merged_df['Other Features_Burglar_Alarm'].astype('category')
merged_df['Other Features_Smoke/Fire_Alarm'] = merged_df['Other Features_Smoke/Fire_Alarm'].astype('category')
merged_df['Other Features_Security_Lighting'] = merged_df['Other Features_Security_Lighting'].astype('category')
merged_df['Other Features_Sign'] = merged_df['Other Features_Sign'].astype('category')
merged_df['Other Features_Display_Window'] = merged_df['Other Features_Display_Window'].astype('category')
merged_df['Other Features_Laundry_Hookup'] = merged_df['Other Features_Laundry_Hookup'].astype('category')
merged_df['Other Features_Outside_Storage'] = merged_df['Other Features_Outside_Storage'].astype('category')
merged_df['Other Features_ADA_Compliant'] = merged_df['Other Features_ADA_Compliant'].astype('category')

#Date & time
merged_df['List Date'] =  cudf.to_datetime(merged_df['List Date'], format='%m/%d/%y')
merged_df['Sold Date'] = cudf.to_datetime(merged_df['Sold Date'], format='%Y-%m-%d')

#String
merged_df['Street Number 1'] = merged_df['Street Number 1'].astype('str')
merged_df['Street Name'] = merged_df['Street Name'].astype('str')
merged_df['Cross Street'] = merged_df['Cross Street'].astype('str')

merged_df['Elementry School'] = merged_df['Elementry School'].astype('str')
merged_df['Junior High School'] = merged_df['Junior High School'].astype('str')
merged_df['High School'] = merged_df['High School'].astype('str')

merged_df['Mall Name'] = merged_df['Mall Name'].astype('str')
merged_df['Mall Vicinity'] = merged_df['Mall Vicinity'].astype('str')

#Print data types
merged_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 14544 entries, 0 to 14543
Data columns (total 97 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   ML Number                            14544 non-null  int64
 1   List Price                           14544 non-null  float64
 2   Sold Price                           14544 non-null  float64
 3   List Date                            14544 non-null  datetime64[ns]
 4   Sold Date                            14544 non-null  datetime64[ns]
 5   DOM                                  14544 non-null  float64
 6   City Name                            14544 non-null  category
 7   Zipcode                              14544 non-null  category
 8   Area                                 14544 non-null  category
 9   Bedrooms                             14544 non-null  float64
 10  Bathrooms                            14544 non-null  float64
 11  Sq Foot        

In [26]:
#Print values
pd.set_option('display.max_columns', None)

merged_df.head(10)

Unnamed: 0,ML Number,List Price,Sold Price,List Date,Sold Date,DOM,City Name,Zipcode,Area,Bedrooms,Bathrooms,Sq Foot,Lot Size,Elementry School,Junior High School,High School,Cross Street,Latitude,Longitude,Street Name,Street Number 1,Other Features_Laundry_Hookup,Other Features_Outside_Storage,Other Features_ADA_Compliant,Avg Monthly Mortgage Rate (30Y),Avg Monthly Mortgage Rate (15Y),Walk Score,Transit Score,Bike Score,Elementary School Ratings,Junior High School Ratings,High School Ratings,Sex Offender Count,Mall Name,Mall Vicinity,Mall Distance (Meters),Mall Duration (Secs),Mall Ratings Total,Mall Ratings,Downtown Distance (Meters),Downtown Duration (Secs)
0,202210307,459000.0,460000.0,2022-09-14,2022-12-28,64.0,Bakersfield,93311,52,4.0,2.0,1881.0,8712.0,"Lum, Sing",Tevis,Independence,Harris,35.302588,-119.102646,Five Burroughs,9009,0,0,0,6.364,5.668,42.0,16.0,31.0,5.0,3.0,6.0,43.0,Gosford Village,"Gosford Road, Bakersfield",1421.0,228.0,2207.0,4.4,14708.0,1153.0
1,202210311,469990.0,469990.0,2022-09-16,2022-11-04,10.0,Bakersfield,93312,62,4.0,2.0,2306.0,9583.0,Norris,Norris,Frontier,Olive Dr/Old Farm,35.422352,-119.140954,Rangeview,6808,0,0,0,6.805,6.1375,8.0,18.0,33.0,4.0,4.0,7.0,34.0,RiverLakes Galleria,"5423 Calloway Drive, Bakersfield",3666.0,385.0,254.0,4.4,14991.0,1168.0
2,202210375,335000.0,340000.0,2022-09-15,2022-10-31,4.0,Bakersfield,93313,52,3.0,2.5,1583.0,8276.0,Berkshire,Stonecreek,Ridgeview,Akers Rd,35.295692,-119.05256,Summerfield,6500,0,0,0,6.9,6.145,65.0,28.0,41.0,8.0,3.0,4.0,59.0,Bella VIsta,"4550 Panama Lane, Bakersfield",254.0,138.0,8.0,4.3,14725.0,863.0
3,202210317,624999.0,610000.0,2022-09-16,2022-10-31,21.0,Bakersfield,93314,63,4.0,3.0,2643.0,9147.0,Patriot,Freedom,Frontier,Hageman/ Renfro,35.399688,-119.16735,Marjoram,14836,0,0,0,6.9,6.145,6.0,,25.0,4.0,6.0,7.0,22.0,Rosedale Gateway Plaza,"13125 Rosedale Highway, Bakersfield",3584.0,314.0,66.0,4.2,18873.0,1117.0
4,202209338,549900.0,550000.0,2022-08-24,2022-10-11,21.0,Bakersfield,93314,63,3.0,2.0,1811.0,23086.0,Centennial,Rosedale,Liberty,Palm Ave,35.376506,-119.157936,Willow Brook,1900,0,0,0,6.9,6.145,17.0,,29.0,7.0,4.0,5.0,22.0,Rosedale Gateway Plaza,"13125 Rosedale Highway, Bakersfield",1757.0,179.0,66.0,4.2,15431.0,898.0
5,202210151,225000.0,225000.0,2022-09-09,2022-10-21,25.0,Bakersfield,93313,52,3.0,1.0,1052.0,3920.0,"Loudon, Roy W.","Thompson, F. L.",Ridgeview,Harris,35.302978,-119.052338,Country Wood,4500,0,0,0,6.9,6.145,52.0,28.0,38.0,5.0,2.0,4.0,59.0,Bella VIsta,"4550 Panama Lane, Bakersfield",2029.0,234.0,8.0,4.3,13494.0,885.0
6,202210123,260000.0,230000.0,2022-09-07,2022-09-30,2.0,Bakersfield,93305,31,3.0,2.0,1459.0,6534.0,"Noble, Myra A.",Washington,East,University,35.402858,-118.98159,Bucknell,3213,0,0,0,6.112,5.35,49.0,37.0,38.0,4.0,2.0,4.0,127.0,Columbus Square Center,"1009 Columbus Street, Bakersfield",1114.0,168.0,61.0,4.1,6407.0,573.0
7,202210382,399900.0,399900.0,2022-09-16,2022-10-26,10.0,Bakersfield,93306,33,3.0,2.0,1834.0,8712.0,"Chavez, C. E.",Sierra,Foothill,Daytona Dr,35.388142,-118.8815,Pocono,11009,0,0,0,6.9,6.145,3.0,11.0,12.0,7.0,3.0,7.0,110.0,Lowrider Lifestyle Swapmeet,"8331 Kern Canyon Road, Bakersfield",3799.0,296.0,309.0,3.5,15393.0,854.0
8,202210318,350000.0,355000.0,2022-09-15,2022-12-06,43.0,Bakersfield,93313,52,3.0,2.0,1600.0,7405.0,"Loudon, Roy W.","Thompson, F. L.",Ridgeview,Pacheco,35.308726,-119.049286,Thatch,4305,0,0,0,6.364,5.668,41.0,29.0,35.0,5.0,2.0,4.0,59.0,Bella VIsta,"4550 Panama Lane, Bakersfield",2156.0,226.0,8.0,4.3,12342.0,782.0
9,202210226,539900.0,533000.0,2022-09-14,2022-11-22,27.0,Bakersfield,93311,53,4.0,2.5,2694.0,12632.0,"McAuliffe, C.",Tevis,Stockdale,Westwold Drive,35.329692,-119.100128,Slender Oak,2609,0,0,0,6.805,6.1375,18.0,25.0,29.0,5.0,3.0,4.0,43.0,The Marketplace,"9000 Ming Avenue, Bakersfield",3076.0,352.0,2740.0,4.4,11469.0,905.0


<div style="font-weight: bold; font-size: large; color: blue">2d. Save:</div>

In [27]:
#Save data frame
merged_df.to_feather("Datasets/Merged.feather")



In [28]:
#Save sample
merged_df.head(200).to_csv("Temp/Merged_Sample.csv")

In [29]:
#Test
cudf.read_feather("Datasets/Merged.feather").info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 14544 entries, 0 to 14543
Data columns (total 97 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   ML Number                            14544 non-null  int64
 1   List Price                           14544 non-null  float64
 2   Sold Price                           14544 non-null  float64
 3   List Date                            14544 non-null  datetime64[ns]
 4   Sold Date                            14544 non-null  datetime64[ns]
 5   DOM                                  14544 non-null  float64
 6   City Name                            14544 non-null  category
 7   Zipcode                              14544 non-null  category
 8   Area                                 14544 non-null  category
 9   Bedrooms                             14544 non-null  float64
 10  Bathrooms                            14544 non-null  float64
 11  Sq Foot        

