In [35]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 13.6MB/s eta 0:00:01[K     |████████▏                       | 20kB 19.3MB/s eta 0:00:01[K     |████████████▏                   | 30kB 13.6MB/s eta 0:00:01[K     |████████████████▎               | 40kB 11.0MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 5.5MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 5.8MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 5.9MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.4MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [40]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
from scipy import stats
import category_encoders as ce

# Plot styling: seaborn's "whitegrid" theme first, then matplotlib's
# "fivethirtyeight" style sheet applied afterwards.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g.
# SettingWithCopyWarning, deprecations) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [41]:
# Read dev.csv straight from the project's GitHub repository (needs network
# access). `df` is the raw, untouched frame the cleaning steps start from.
raw_data_link = 'https://raw.githubusercontent.com/elvanselvano/purwadhika-final-project/main/dataset/dev.csv'
df = pd.read_csv(raw_data_link)

In [42]:
#casting data types
def casting(df):
  """Cast the raw columns to their intended dtypes and return a new frame.

  - NUM_UNITS, KITCHENS, CMPLX_NUM, CENSUS_TRACT -> nullable Int64
  - USECODE, ZIPCODE -> nullable Int64, then category
  - SALEDATE -> datetime via pd.to_datetime

  Each column is cast from its own values (CENSUS_TRACT is no longer
  overwritten with CMPLX_NUM). The input frame is not modified.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame containing all of the columns listed above.

  Returns
  -------
  pandas.DataFrame
    A new frame with the converted dtypes.
  """
  nullable_int_cols = ['NUM_UNITS', 'KITCHENS', 'USECODE', 'ZIPCODE',
                       'CMPLX_NUM', 'CENSUS_TRACT']
  categorical_cols = ['USECODE', 'ZIPCODE']

  # DataFrame.astype returns a new frame, so the caller's df stays untouched.
  out = df.astype({col: pd.Int64Dtype() for col in nullable_int_cols})

  # Cast to Int64 first so the category labels are integers, not floats.
  for col in categorical_cols:
    out[col] = out[col].astype('category')

  out['SALEDATE'] = pd.to_datetime(out['SALEDATE'])
  return out

In [46]:
def drop_qualified(df):
  """Keep only the rows whose QUALIFIED flag is 'Q' and drop that column.

  Returns a new frame; the rows keep their original index labels.
  """
  is_qualified = df['QUALIFIED'] == 'Q'
  return df.loc[is_qualified].drop(columns='QUALIFIED')
  

In [47]:
def saleyear(df):
  """Return a copy of df with a SALEYEAR column taken from SALEDATE.

  SALEDATE must already be a datetime column (the `.dt` accessor is used).
  assign() builds a new frame, so the caller's frame is not modified.
  """
  return df.assign(SALEYEAR=df['SALEDATE'].dt.year)

In [48]:
#drop the columns flagged as largely missing (CMPLX_NUM, LIVING_GBA,
#CENSUS_TRACT) together with the other columns listed below
def drop_miss_val1(df):
  """Return a new frame without the columns listed in `unwanted`.

  Raises KeyError if any listed column is absent from df.
  """
  unwanted = [
    'CMPLX_NUM', 'LIVING_GBA', 'CENSUS_TRACT',
    'ASSESSMENT_SUBNBHD', 'FULLADDRESS', 'NATIONALGRID',
    'CENSUS_BLOCK', 'CITY', 'STATE', 'X', 'Y', 'QUADRANT',
    'GIS_LAST_MOD_DTTM', 'SOURCE', 'STORIES', 'ZIPCODE',
    'ASSESSMENT_NBHD', 'SQUARE', 'LONGITUDE', 'LATITUDE',
    'ROOMS', 'SALE_NUM', 'NUM_UNITS', 'BLDG_NUM', 'USECODE',
  ]
  return df.drop(columns=unwanted)

In [49]:
#feature engineering YR_RMDL
def yr_rmdl(df):
  """Replace YR_RMDL with a binary RMDL flag and return a new frame.

  RMDL is 0 where YR_RMDL is missing and 1 otherwise; YR_RMDL itself is
  dropped. The input frame is not modified.
  """
  remodel_flag = np.where(df['YR_RMDL'].isna(), 0, 1)
  # assign() builds a new frame, so no RMDL column leaks into the caller's df.
  return df.assign(RMDL=remodel_flag).drop(columns='YR_RMDL')

In [50]:
#dropna
def drop_all(df):
  """Return a new frame without any row that contains a missing value."""
  return df.dropna(axis=0, how='any')

In [51]:
#feature engineering AC, HEAT and CNDTN
def encoding_categorical(df):
  """Encode AC and CNDTN numerically and drop rows with HEAT == 'No Data'.

  Steps:
    1. Remove rows whose HEAT value is the string 'No Data'.
    2. AC becomes 1 where it equals 'Y' and 0 otherwise.
    3. CNDTN is ordinal-encoded with the explicit Poor(1) .. Excellent(6)
       mapping through category_encoders.OrdinalEncoder.

  The input frame is not modified, so calling this again on the same frame
  gives the same result (an in-place AC overwrite would turn every AC into
  0 on a second call).

  Returns
  -------
  pandas.DataFrame
    The encoded frame produced by the encoder's fit_transform.
  """
  # Boolean indexing already yields a new frame; AC is then rebuilt with
  # assign() rather than written back into the caller's frame.
  kept = df[df['HEAT'] != 'No Data']
  kept = kept.assign(AC=np.where(kept['AC'] == 'Y', 1, 0))

  condition_rank = {'Poor': 1, 'Fair': 2, 'Average': 3,
                    'Good': 4, 'Very Good': 5, 'Excellent': 6}
  ordinal_mapping = [{'col': 'CNDTN', 'mapping': condition_rank}]
  # `cols` is given as a list, the form the encoder documents.
  ordinal_encoder = ce.OrdinalEncoder(cols=['CNDTN'], mapping=ordinal_mapping)
  return ordinal_encoder.fit_transform(kept)

In [52]:
# Run the cleaning steps in order; each stage is kept under its own name so
# intermediate frames can be inspected.
df1 = casting(df)                # fix column dtypes
df2 = drop_qualified(df1)        # keep QUALIFIED == 'Q' rows only
df3 = saleyear(df2)              # add SALEYEAR from SALEDATE
df4 = drop_miss_val1(df3)        # drop the listed columns
df5 = yr_rmdl(df4)               # YR_RMDL -> binary RMDL flag
df6 = drop_all(df5)              # drop rows that still contain missing values
df7 = encoding_categorical(df6)  # encode AC / CNDTN, drop HEAT == 'No Data'

In [53]:
# (rows, columns) of the fully cleaned frame.
df7.shape

(35369, 23)

In [54]:
# Write the cleaned frame to dev_clean.csv (index omitted) and hand the file
# to google.colab's files.download.
# NOTE(review): google.colab is presumably only importable inside Colab —
# this cell will fail in other Jupyter environments.
from google.colab import files
df7.to_csv('dev_clean.csv',index=False)
files.download('dev_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>