# Tableau Data Set
- Need a dedicated data set for the Tableau dashboard
- Will need to integrate the longitude/latitude data we prepared earlier
- 2 tabs will be used for interactive EDA and visualization
- 1 tab will be dedicated to prediction analysis

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from data_preprocessing_final2 import *

In [3]:
# Lift pandas' display truncation for both columns and rows so that
# wide/long frames are rendered in full.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

### Read our 2 data files we'll use to combine the long/lat and the address

In [4]:
# Locations of the two input files
HOUSING_CSV = './data/Ames_Housing_Price_Data.csv'
MAP_CSV = './data/housinglatlong.csv'

# Original housing data; the first CSV column is used as the index
df = pd.read_csv(HOUSING_CSV, index_col=0)

# Map file with the address / latitude / longitude lookup, read the same way
map_df = pd.read_csv(MAP_CSV, index_col=0)

In [5]:
# Keep only sales made under 'Normal' conditions so that sale prices are
# compared like-for-like (the majority of sales were Normal).
is_normal_sale = df['SaleCondition'] == 'Normal'
df = df[is_normal_sale]

### Cleaning the main housing data file
- using our data_preprocessing function we created

In [6]:
# Run the project's cleaning step on the housing frame.
# NOTE(review): `cleaning` is not defined in this notebook; presumably it comes
# from the `data_preprocessing_final2` star import — confirm.
df = cleaning(df)

In [7]:
# Run the project's dummy-encoding step on the cleaned frame.
# NOTE(review): `dummify` is not defined in this notebook; presumably it comes
# from the `data_preprocessing_final2` star import — confirm.
df = dummify(df)

### Will merge on PID
- Take "Coordinates", "Lat", "Lon" from the map_df
- Merge into the housing data

In [8]:
# Display the map file to inspect its columns before choosing which to merge
map_df

Unnamed: 0,SalePrice,PID,Address,Coordinates,Lat,Lon
0,126000,909176150,436 HAYWARD AVE Ames Iowa,"436 Hayward Ave, Ames, IA 50014, USA",42.017780,-93.651452
1,139500,905476230,3416 WEST ST Ames Iowa,"3416 West St, Ames, IA 50014, USA",42.024697,-93.664186
3,124900,911128020,320 S 2ND ST Ames Iowa,"320 S 2nd St, Ames, IA 50010, USA",42.021389,-93.614855
4,114000,535377150,1524 DOUGLAS AVE Ames Iowa,"1524 Douglas Ave, Ames, IA 50010, USA",42.038070,-93.612065
5,227000,534177230,2304 FILLMORE AVE Ames Iowa,"2304 Fillmore Ave, Ames, IA 50010, USA",42.044900,-93.631893
...,...,...,...,...,...,...
2598,121000,903205040,1021 RIDGEWOOD AVE Ames Iowa,"1021 Ridgewood Ave, Ames, IA 50010, USA",42.031937,-93.626510
2599,139600,905402060,3619 MARY CIR Ames Iowa,"3619 Mary Cir, Ames, IA 50014, USA",42.027798,-93.666899
2600,145000,909275030,2140 SUNSET DR 2142 Ames Iowa,"2140 Sunset Dr, Ames, IA 50014, USA",42.019944,-93.643206
2601,217500,907192040,5319 CLEMENS BLVD Ames Iowa,"5319 Clemens Blvd, Ames, IA 50014, USA",42.016826,-93.690382


In [9]:
# Keep only the merge key (PID) and the location columns from the map file
location_cols = ['PID', 'Coordinates', 'Lat', 'Lon']
map_df = map_df[location_cols]

# Left-join the location columns onto the housing data by PID, so every
# housing row is kept even when the map file has no entry for it
df = pd.merge(df, map_df, how='left', on='PID')

In [10]:
## Inspect the merged frame's columns to confirm the merge worked:
## 'Coordinates', 'Lat' and 'Lon' should now be present
df.columns

Index(['PID', 'SalePrice', 'LotFrontage', 'LotArea', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces',
       ...
       'MoSold_7', 'MoSold_8', 'MoSold_9', 'YrSold_2007', 'YrSold_2008',
       'YrSold_2009', 'YrSold_2010', 'Coordinates', 'Lat', 'Lon'],
      dtype='object', length=240)

### Checking & Removing Duplicates

In [20]:
# Count rows whose PID repeats an earlier row's PID (0 means every PID is unique)
df['PID'].duplicated().sum()

0

In [21]:
# (rows, columns) of the merged frame
df.shape

(2390, 240)

### Checking for empty addresses
- Cross-checked against the Ames_Real_Estate_Data: these PIDs are not listed there either, so the addresses cannot be recovered
- Because of that, we are removing these rows
- Rows without coordinates would cause issues with our dashboard

In [13]:
## Show the rows for which the merge found no address/coordinates
missing_coordinates = df['Coordinates'].isna()
df.loc[missing_coordinates]

Unnamed: 0,PID,SalePrice,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,...,MoSold_7,MoSold_8,MoSold_9,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,Coordinates,Lat,Lon
61,531477050,67500,70.0,9800,1920,1950,0.0,2,3,0,...,0,0,0,0,0,0,1,,,
89,916253320,330000,68.217524,9763,1998,1998,239.0,1,4,1,...,0,0,1,1,0,0,0,,,
336,535300120,176000,120.0,19296,1962,1962,399.0,3,3,1,...,0,0,0,0,0,1,0,,,
410,902205010,45000,50.0,5925,1940,1950,0.0,1,3,0,...,0,0,0,0,0,1,0,,,
558,916252170,230000,68.217524,8239,1986,1986,0.0,2,3,0,...,0,0,0,0,0,0,0,,,
741,535426150,113500,80.0,9000,1958,1958,82.0,2,3,1,...,0,0,0,0,0,1,0,,,
921,904101170,134432,80.0,17120,1959,1959,0.0,4,3,1,...,1,0,0,0,1,0,0,,,
1179,902401130,143000,50.0,9000,1920,1988,0.0,3,4,0,...,1,0,0,0,0,1,0,,,
1349,916477060,279900,68.217524,11800,2003,2004,94.0,5,5,1,...,1,0,0,1,0,0,0,,,
1411,911175360,108000,60.0,11040,1920,1950,0.0,3,4,0,...,0,0,1,0,0,0,0,,,


In [14]:
## Drop every row whose 'Coordinates' value is missing
df = df.dropna(subset=['Coordinates'])

### Let's save our new dataframe to use in Tableau
- will name it "housing.csv"

In [15]:
# Write the cleaned, merged frame to disk for Tableau.
# The row index is written as the first (unnamed) column, which is pandas' default.
df.to_csv(path_or_buf='housing.csv', index=True)

### Scaling it manually
- Create a scaled version of the file that can be imported into Tableau
- The features need to be scaled so that predictions can be computed in Tableau

In [59]:
# Create 2 independent copies of the merged frame
# scale_trainer = frame the scaler is fitted on
# dataframe     = frame that actually gets scaled and exported
# Plain assignment (`x = df`) would only bind another name to the same object,
# so any in-place change to one would silently affect `df` and the other;
# .copy() gives each name its own data.
scale_trainer = df.copy()
dataframe = df.copy()

In [60]:
# The location columns are dropped from both frames: the model was not
# fitted on them, so they are not needed for prediction.
location_cols = ['Lat', 'Lon', 'Coordinates']
dataframe = dataframe.drop(columns=location_cols)
scale_trainer = scale_trainer.drop(columns=location_cols)

# Columns to scale: int64/float64 columns, minus the identifier (PID)
# and the target (SalePrice)
scale_trainer_num = (
    scale_trainer
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['PID', 'SalePrice'])
)

# Fit a min-max scaler on those columns
scaler = MinMaxScaler()
scaler.fit(scale_trainer_num)

MinMaxScaler()

In [61]:
# Rebuild a clean 0..n-1 index and discard the old labels. Rows were filtered
# out of the frame earlier, so the old labels are no longer contiguous, and
# pd.concat(axis=1) below aligns on index labels: without the reset the scaled
# columns would be attached to the wrong rows / produce NaN rows.
dataframe = dataframe.reset_index(drop=True)

# Use exactly the columns (and column order) the scaler was fitted on, taken
# from the frame used for fitting instead of a hand-maintained list that can
# drift out of sync with it.
scaled_cols = list(scale_trainer_num.columns)
dataframe_num = dataframe[scaled_cols]

## Drop the original (unscaled) columns from the main dataframe
dataframe = dataframe.drop(columns=scaled_cols)

## Scale the selected columns and wrap the result in a dataframe that shares
## the main dataframe's index
num_scaled = scaler.transform(dataframe_num)
dataframe_num_scaled = pd.DataFrame(num_scaled, columns=scaled_cols, index=dataframe.index)

## Merge the scaled columns back into the main dataframe
dataframe = pd.concat([dataframe, dataframe_num_scaled], axis=1)

In [62]:
# Write the scaled feature frame that Tableau uses for prediction.
# The row index is written as the first (unnamed) column, which is pandas' default.
dataframe.to_csv(path_or_buf='predict_features.csv', index=True)