In [None]:
!pip install category_encoders



In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce


# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices and pandas
# chained-assignment warnings raised by later cells are hidden too.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw CSV; it is fetched over HTTP each time this cell runs.
raw_data_link = 'https://raw.githubusercontent.com/elvanselvano/purwadhika-final-project/main/dataset/dev2.csv'
# `df` is the raw frame that the cleaning pipeline further down starts from.
df = pd.read_csv(raw_data_link)

In [None]:
# Cast data types
def casting(df):
  """Cast the raw columns to the dtypes used by the rest of the pipeline.

  - NUM_UNITS, AYB, KITCHENS, CMPLX_NUM, CENSUS_TRACT -> nullable ``Int64``
  - LANDAREA -> ``float``
  - USECODE, ZIPCODE -> nullable ``Int64`` first, then ``category``
  - SALEDATE -> datetime via ``pd.to_datetime``

  The columns are reassigned on the frame that was passed in, and that same
  frame is returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing all of the columns listed above.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object with the converted columns.

  Raises
  ------
  KeyError
      If one of the expected columns is missing.
  """
  nullable_int = pd.Int64Dtype()

  # Every column is cast from its *own* values. In particular CENSUS_TRACT
  # must be read from CENSUS_TRACT, not copied over from CMPLX_NUM.
  for col in ('NUM_UNITS', 'AYB', 'KITCHENS', 'CMPLX_NUM', 'CENSUS_TRACT'):
    df[col] = df[col].astype(nullable_int)

  df['LANDAREA'] = df['LANDAREA'].astype(float)

  # Go through Int64 first so the resulting category labels are integers
  # rather than floats such as 20001.0.
  for col in ('USECODE', 'ZIPCODE'):
    df[col] = df[col].astype(nullable_int).astype('category')

  df['SALEDATE'] = pd.to_datetime(df['SALEDATE'])
  return df

In [None]:
def saleyear(df):
  """Add a SALEYEAR column holding the calendar year of each SALEDATE.

  SALEDATE has to be datetime-typed already, because the ``.dt`` accessor is
  used. The new column is written onto the frame that was passed in, and that
  same frame is returned.
  """
  sale_dates = df['SALEDATE']
  df['SALEYEAR'] = sale_dates.dt.year
  return df

In [None]:
def drop_qualified(df):
  """Keep only the rows whose QUALIFIED value is 'Q', then drop that column.

  Returns a new frame; the frame passed in is left untouched.
  """
  is_qualified = df['QUALIFIED'] == 'Q'
  return df[is_qualified].drop(columns='QUALIFIED')

In [None]:
def residential(df):
  """Return only the rows whose SOURCE value is exactly 'Residential'.

  The SOURCE column itself is kept.
  """
  is_residential = df['SOURCE'] == 'Residential'
  return df.loc[is_residential]

In [None]:
def drop_unused_features(df):
    """Return ``df`` without the columns listed in ``unused_columns``.

    Raises ``KeyError`` if any of those columns is absent.
    """
    unused_columns = [
        'CMPLX_NUM', 'LIVING_GBA', 'CENSUS_TRACT', 'ASSESSMENT_SUBNBHD',
        'FULLADDRESS', 'NATIONALGRID', 'CENSUS_BLOCK', 'CITY', 'STATE',
        'X', 'Y', 'SALEDATE',
    ]
    return df.drop(columns=unused_columns)

In [None]:
# Feature engineering: collapse YR_RMDL into a 0/1 flag
def yr_rmdl(df):
  """Replace YR_RMDL with a binary RMDL indicator.

  RMDL is 1 where YR_RMDL holds a value and 0 where it is missing. The RMDL
  column is written onto the frame that was passed in; the returned frame is
  a new one with YR_RMDL removed.
  """
  has_remodel_year = df['YR_RMDL'].notna()
  df['RMDL'] = np.where(has_remodel_year, 1, 0)
  return df.drop(columns='YR_RMDL')

In [None]:
# Drop the remaining missing values
def drop_all(df):
  """Return ``df`` without any row that contains at least one missing value."""
  return df.dropna(axis=0, how='any')

In [None]:
def drop_unrelated_features(df):
    """Return ``df`` without the GIS_LAST_MOD_DTTM and SOURCE columns."""
    unrelated_columns = ['GIS_LAST_MOD_DTTM', 'SOURCE']
    return df.drop(columns=unrelated_columns)

In [None]:
# Drop unusual values
def drop_unusual(df):
  """Binary-encode AC and discard rows with implausible or excluded values.

  AC is rewritten on the frame that was passed in: 'Y' becomes 1, anything
  else becomes 0. A new frame is then returned that keeps only rows where
  all of the following hold:

  - HEAT is not 'No Data'
  - ROOMS is not 0
  - AYB <= SALEYEAR
  - AYB <= EYB
  - GRADE is not one of the 'Exceptional-*' levels
  """
  df['AC'] = np.where(df['AC'] == 'Y', 1, 0)

  exceptional_grades = ['Exceptional-D', 'Exceptional-C', 'Exceptional-B',
                        'Exceptional-A']

  # Every condition is evaluated row by row, so one combined mask selects
  # the same rows as applying the filters one after another.
  keep = (
      (df['HEAT'] != 'No Data')
      & (df['ROOMS'] != 0)
      & (df['AYB'] <= df['SALEYEAR'])
      & (df['AYB'] <= df['EYB'])
      & ~df['GRADE'].isin(exceptional_grades)
  )
  return df[keep]

In [None]:
def drop_similar_features(df):
    """Return ``df`` without the STORIES column."""
    return df.drop(columns=['STORIES'])

In [None]:
def drop_location_features(df):
    """Return ``df`` without the location columns listed below."""
    location_columns = [
        'ZIPCODE', 'ASSESSMENT_NBHD', 'SQUARE', 'QUADRANT',
        'LATITUDE', 'LONGITUDE',
    ]
    return df.drop(columns=location_columns)

In [None]:
def drop_correlation(df):
    """Return ``df`` without the NUM_UNITS and BLDG_NUM columns."""
    dropped_columns = ['NUM_UNITS', 'BLDG_NUM']
    return df.drop(columns=dropped_columns)

In [None]:
# Turn AYB / EYB into ages relative to the sale year, and add SQFT_ROOMS
def feature_eng(df):
    """Derive the age and size-per-room features, then drop AYB and EYB.

    Columns added, in this order:

    - AYB_SALEYEAR_DIFF: SALEYEAR - AYB, as nullable ``Int64``
    - EYB_SALEYEAR_DIFF: SALEYEAR - EYB, but 0 where SALEYEAR < EYB,
      as nullable ``Int64``
    - SQFT_ROOMS: GBA / ROOMS

    The new columns are written onto the frame that was passed in; the
    returned frame is a new one without AYB and EYB.
    """
    nullable_int = pd.Int64Dtype()
    sale_year = df['SALEYEAR']

    df['AYB_SALEYEAR_DIFF'] = (sale_year - df['AYB']).astype(nullable_int)

    # A sale that happens before EYB would give a negative gap; use 0 instead.
    years_since_eyb = np.where(sale_year < df['EYB'], 0, sale_year - df['EYB'])
    df['EYB_SALEYEAR_DIFF'] = pd.Series(years_since_eyb, index=df.index).astype(nullable_int)

    # NOTE(review): presumably ROOMS == 0 has been filtered out upstream;
    # nothing in this function guards the division.
    df['SQFT_ROOMS'] = df['GBA'] / df['ROOMS']

    return df.drop(columns=['AYB', 'EYB'])

In [None]:
def drop_model_evaluation(df):
    """Return ``df`` without the ROOMS and USECODE columns."""
    dropped_columns = ['ROOMS', 'USECODE']
    return df.drop(columns=dropped_columns)

In [None]:
# Encoding
def encoding_categorical(df):
  """Ordinal-encode the CNDTN column with an explicit ranking.

  The ranking is Poor=1, Fair=2, Average=3, Good=4, Very Good=5,
  Excellent=6. Only CNDTN is handed to the encoder. How labels outside this
  mapping are treated is left to the encoder's defaults; nothing is
  configured for them here.
  """
  condition_levels = ['Poor', 'Fair', 'Average', 'Good', 'Very Good', 'Excellent']
  condition_ranks = {level: rank for rank, level in enumerate(condition_levels, start=1)}

  encoder = ce.OrdinalEncoder(
      cols='CNDTN',
      mapping=[{'col': 'CNDTN', 'mapping': condition_ranks}],
  )
  return encoder.fit_transform(df)

In [None]:
# Run the cleaning steps in order; each stage consumes the previous stage's
# frame. The order is load-bearing because later steps drop columns that
# earlier steps still need to read.

# casting and saleyear write onto the frame they receive and return it, so
# df, df1 and df2 all refer to the same object.
df1 = casting(df)
# Reads SALEDATE, which drop_unused_features removes below.
df2 = saleyear(df1)
df3 = drop_qualified(df2)
# Filters on SOURCE, which drop_unrelated_features removes below.
df4 = residential(df3)
df5 = drop_unused_features(df4)
# Turns YR_RMDL into the RMDL flag before drop_all runs dropna() on the frame.
df6 = yr_rmdl(df5)
df7 = drop_all(df6)
df8 = drop_unrelated_features(df7)
# Compares AYB with SALEYEAR and EYB, and filters on ROOMS != 0.
df9 = drop_unusual(df8)
df10 = drop_similar_features(df9)
df11 = drop_location_features(df10)
df12 = drop_correlation(df11)
# Needs SALEYEAR, AYB, EYB, GBA and ROOMS; drops AYB and EYB when done.
df13 = feature_eng(df12)
# Drops ROOMS only after feature_eng has used it for SQFT_ROOMS.
df14 = drop_model_evaluation(df13)
df15 = encoding_categorical(df14)

In [None]:
# NOTE(review): assumes the notebook runs on Google Colab; `google.colab` is
# imported here and used only for the download call below.
from google.colab import files
# Write the final frame without the index column.
df15.to_csv('dev_clean.csv',index=False)
# Presumably hands the written file to the browser as a download (confirm).
files.download('dev_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>