In [28]:
import pandas as pd 
import numpy as np
import re 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
import pickle 

In [29]:
#Load and inspecting data 
df=pd.read_excel('../data/raw/task_dataset.xlsx')
df.head()

Unnamed: 0,property_address,categories
0,"New Ppid 1Kqttmg7, House, Property Id No.58C57...",houseorplot
1,"Property Of Rs No.12/2,Plot No. 01,House Numbe...",houseorplot
2,"Flat-504,Floor-5 Shanti Vista-Ii Wing C Bldg 9...",flat
3,"Flat-103,Floor-1 Eshamya Florenza Tower L S No...",flat
4,Flat No G 532 I P Extension Patparganj 2Nd Flo...,flat


In [30]:
df.shape

(8936, 2)

In [31]:
df.columns

Index(['property_address', 'categories'], dtype='object')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8936 entries, 0 to 8935
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   property_address  8936 non-null   object
 1   categories        8936 non-null   object
dtypes: object(2)
memory usage: 139.8+ KB


In [33]:
df.describe()

Unnamed: 0,property_address,categories
count,8936,8936
unique,8607,5
top,Address:Nananarsy No 65 Sasthamangalam Village...,flat
freq,2,3232


In [34]:
df['categories'].unique()

array(['houseorplot', 'flat', 'landparcel', 'others', 'commercial unit'],
      dtype=object)

In [35]:
df.duplicated().sum()

np.int64(329)

In [36]:
#handling Duplicates with df2 after cleaning 
df2=df.drop_duplicates().reset_index(drop=True)
df2.head()

Unnamed: 0,property_address,categories
0,"New Ppid 1Kqttmg7, House, Property Id No.58C57...",houseorplot
1,"Property Of Rs No.12/2,Plot No. 01,House Numbe...",houseorplot
2,"Flat-504,Floor-5 Shanti Vista-Ii Wing C Bldg 9...",flat
3,"Flat-103,Floor-1 Eshamya Florenza Tower L S No...",flat
4,Flat No G 532 I P Extension Patparganj 2Nd Flo...,flat


In [37]:
#chacking for same address with diffrent categories 
duplicate_address=df2[df2.duplicated(subset=['property_address'],keep=False)]
if len(duplicate_address)>0:
    df2=df2.drop_duplicates(subset=['property_address'],keep=False).reset_index(drop=True)

In [38]:
df2.shape

(8607, 2)

In [39]:
#Cleaning Address text 
df2['property_address'] = df2['property_address'].astype(str)

#replacing with space ]
df2['property_address']=df2['property_address'].str.replace(r'[\n\t\r]+',' ',regex=True)

#Removing Multiple spaces 
df2['property_address']=df2['property_address'].str.replace(r'\s+',' ',regex=True)

#strip leading whitespace 
df2['property_address']=df2['property_address'].str.strip()

#lower casing 
df2['property_address']=df2['property_address'].str.lower()

#removing empty address 
df2=df2[df2['property_address'].str.len()>0].reset_index(drop=True)


In [42]:
df2['categories'].unique()

array(['houseorplot', 'flat', 'landparcel', 'others', 'commercial unit'],
      dtype=object)

In [43]:
df2['categories'].value_counts()

categories
flat               3109
houseorplot        2497
others             1193
commercial unit     954
landparcel          854
Name: count, dtype: int64

In [44]:
#Train and validaion split 

X=df2['property_address']
y=df2['categories']

X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
print('Train len :',len(X_train),'\nVal len :',len(X_val))
print('Train Distribution :',y_train.value_counts())


Train len : 6885 
Val len : 1722
Train Distribution : categories
flat               2487
houseorplot        1998
others              954
commercial unit     763
landparcel          683
Name: count, dtype: int64


In [46]:
#Encoding labels 
le=LabelEncoder()
y_train_encoded=le.fit_transform(y_train)
y_val_encoded=le.fit_transform(y_val)

with open('../best_model/label_encoder.pkl','wb') as f:
    pickle.dump(le,f)

print(f"Class mapping:{dict(zip(le.classes_,le.transform(le.classes_)))}")

Class mapping:{'commercial unit': np.int64(0), 'flat': np.int64(1), 'houseorplot': np.int64(2), 'landparcel': np.int64(3), 'others': np.int64(4)}


In [51]:
#Saving the processed data as csv 
train_df=pd.DataFrame({'property_address':X_train,'categories':y_train})
val_df=pd.DataFrame({'property_address':X_val,'categories':y_val})

train_df.to_csv('../data/processed/train.csv',index=False)
val_df.to_csv('../data/processed/val.csv',index=False)
print("Data Saved")

Data Saved


In [52]:
#Checking final data before modeling 
print("Final Dataset Size: ",len(df2))
print('\nTrain Size: ',len(X_train))
print('\nValidation Size: ',len(X_train))
print('\nCategory distribution in train :',y_train.value_counts(normalize=True))


Final Dataset Size:  8607

Train Size:  6885

Validation Size:  6885

Category distribution in train : categories
flat               0.361220
houseorplot        0.290196
others             0.138562
commercial unit    0.110821
landparcel         0.099201
Name: proportion, dtype: float64
