In [6]:
# basic requirements
import numpy as np
import pandas as pd

# transformation requirements
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2_contingency

# models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# pipeline requirements
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

### Import Dataset

In [5]:
# Relative location of the zipped training CSV. Forward slashes are accepted
# by pandas on Windows as well as on POSIX systems, so the notebook no longer
# depends on being run on Windows.
path = '../../Datasets/train.csv.zip'
# The CSV's 'Unnamed: 0' column is used as the frame index; low_memory=False
# makes pandas read each column in one pass instead of in chunks.
data = pd.read_csv(path, compression='zip', index_col='Unnamed: 0', low_memory=False)

In [9]:
# Preview the first rows of the raw frame exactly as loaded, before any cleaning.
data.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,floor two,1.0,256 ft^2,22.0,Flat,Bamboo or Timber,Bamboo/Timber Light roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,0.0,0.0,1.0
1,Floor 3,3.0,985 ft^2,18.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Heavy Roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
2,Two Floor,7.0,,14.0,Flat,Mud mortar-Stone/Brick,,Clay,Wood-Mud or Bamboo Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
3,two,18.0,185 ft^2,15.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Light Roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,4.0
4,just 2 floor,22.0,290 ft^2,17.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Bamboo or Timber Light roof,Clay,Timber Mud or Bamboo-Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,1.0


### Functions

In [7]:
def transform_column(value):
    """Convert a free-text floor-count label to an integer from 1 to 5.

    Non-string inputs are returned untouched. For strings, each candidate
    number is tried in ascending order and the first one whose English word,
    ordinal, or digit occurs anywhere in the text wins. Strings that match
    nothing are returned unchanged.

    Gotcha: matching is plain substring containment, so the lowest matching
    number wins even inside a longer token (e.g. "10" yields 1).
    """
    if not isinstance(value, str):
        return value

    lowered = value.lower()
    # (result, lower-case words searched in the lowered text, digit searched in the raw text)
    candidates = (
        (1, ('one', 'first'), '1'),
        (2, ('two', 'second'), '2'),
        (3, ('three', 'third'), '3'),
        (4, ('four', 'fourth'), '4'),
        (5, ('five', 'fifth'), '5'),
    )
    for number, words, digit in candidates:
        if any(word in lowered for word in words) or digit in value:
            return number
    return value
    
def cramers_v(x, y):
    """Cramér's V association between two categorical variables.

    The statistic is derived from the chi-squared value of the contingency
    table of ``x`` and ``y``, with small-sample correction terms applied to
    both phi² and the table dimensions.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length, cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        The corrected Cramér's V. ``0.0`` is returned when the statistic is
        undefined: fewer than two observations, or one of the variables has
        effectively a single category (the corrected dimension is zero).
    """
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    # Every correction term divides by (n - 1); with 0 or 1 observations there
    # is no association to measure.
    if n <= 1:
        return 0.0

    chi2, _, _, _ = chi2_contingency(contingency)
    phi2 = chi2 / n
    r, k = contingency.shape
    # Clamp at zero: the correction can push phi² slightly negative.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    # A variable with one category gives a zero denominator, which previously
    # raised ZeroDivisionError or produced nan.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

def transform_use(x):
    """Collapse the known spellings of an ownership/use label into one of four
    canonical values: 'private', 'public', 'other' or 'institutional'.

    Matching is exact (case- and whitespace-sensitive). Any value that is not
    one of the listed spellings yields None.
    """
    spelling_groups = (
        ('private', ('Private Use', 'Private', 'Prvt', 'Privste')),
        ('public', ('Public', 'Public Space', 'Public Use')),
        ('other', ('Unknown', 'Unspecified', 'Other')),
        ('institutional', ('Institutional Use', 'Institutionals', 'Institutional')),
    )
    for canonical, spellings in spelling_groups:
        if x in spellings:
            return canonical
    # Unrecognised labels fall through to None.
    return None
            
        

### Custom Column Transformer

In [12]:
class ColumnTransformer(BaseEstimator, TransformerMixin):
    """Stateless cleaning step for the raw building frame.

    NOTE: this class shares its name with ``sklearn.compose.ColumnTransformer``
    but is unrelated to it.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Steps, in order:
          1. drop every row containing a missing value and renumber the index;
          2. drop the 'has_secondary_use' and 'no_family_residing' columns;
          3. normalise 'legal_ownership_status' through ``transform_use``;
          4. map 'floors_before_eq (total)' through ``transform_column`` and
             convert it to a numeric column;
          5. keep only the number from 'plinth_area (ft^2)' and convert it to
             a numeric column;
          6. lower-case 'type_of_ground_floor'.

        NOTE(review): rows are removed here, so the result can have fewer rows
        than ``X``; a target held separately would need the same filtering.
        """
        floors_col = 'floors_before_eq (total)'
        area_col = 'plinth_area (ft^2)'

        cleaned = (
            X.copy()
            .dropna(axis=0)
            .reset_index(drop=True)
            .drop(['has_secondary_use', 'no_family_residing'], axis=1)
        )

        cleaned['legal_ownership_status'] = cleaned['legal_ownership_status'].apply(transform_use)

        cleaned[floors_col] = pd.to_numeric(cleaned[floors_col].apply(transform_column))

        def area_number(text):
            # The number is normally the first space-separated token; the
            # special "More than 1000 ft^2" label has it second from the end.
            tokens = text.split(" ")
            if text == "More than 1000 ft^2":
                return tokens[-2]
            return tokens[0]

        cleaned[area_col] = pd.to_numeric(cleaned[area_col].apply(area_number))

        cleaned['type_of_ground_floor'] = cleaned['type_of_ground_floor'].str.lower()

        return cleaned

### Encoder

In [None]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns of a DataFrame and keep the rest as-is.

    Parameters
    ----------
    features : list of str
        Names of the columns to one-hot encode.
    drop : str, default 'if_binary'
        Forwarded to ``OneHotEncoder(drop=...)``.

    Attributes
    ----------
    encoder : OneHotEncoder
        The encoder fitted in ``fit``; it holds the categories learned from
        the training data.
    """

    def __init__(self, features, drop='if_binary'):
        self.features = features
        self.drop = drop

    def fit(self, X, y=None):
        """Learn the categories of ``self.features`` from ``X``.

        Returns
        -------
        self
        """
        self.encoder = OneHotEncoder(drop=self.drop)
        # Categories are learned once here, so every later transform() call
        # produces the same output columns regardless of the data it receives.
        self.encoder.fit(X[self.features])
        return self

    def transform(self, X):
        """Replace ``self.features`` in ``X`` with their one-hot columns.

        Uses the categories learned in ``fit``; the encoder is not re-fitted.
        With the encoder's default settings, a category not seen during
        ``fit`` makes the underlying ``OneHotEncoder`` raise.

        Returns
        -------
        pandas.DataFrame
            The non-encoded columns of ``X`` followed by the one-hot columns,
            indexed like ``X``.
        """
        encoded = self.encoder.transform(X[self.features]).toarray()

        # get_feature_names was removed from newer scikit-learn releases in
        # favour of get_feature_names_out; support both.
        if hasattr(self.encoder, 'get_feature_names_out'):
            encoded_names = self.encoder.get_feature_names_out(self.features)
        else:
            encoded_names = self.encoder.get_feature_names(self.features)

        # Reuse X's index so the column-wise concat aligns row by row even
        # when X does not carry a default 0..n-1 index.
        encoded_frame = pd.DataFrame(encoded, columns=encoded_names, index=X.index)
        non_encoded = X.drop(self.features, axis=1)
        return pd.concat([non_encoded, encoded_frame], axis=1)

In [13]:
# Single-step pipeline: only the cleaning transformer is wired in at this point.
pipe = Pipeline(steps=[('Transformer', ColumnTransformer())])

# The transformer's fit learns nothing, so this simply applies the cleaning to `data`.
result = pipe.fit_transform(data)

In [14]:
# Preview the cleaned frame returned by the pipeline.
result.head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,legal_ownership_status,type_of_reinforcement_concrete,residential_type,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,1,22.0,300,10.0,Moderate slope,Clay Sand Mixed mortar-Stone/Brick,Bamboo/Timber Light roof,mud,Not applicable,Not attached,...,private,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
1,3,22.0,360,18.0,Flat,Clay mortar-Stone/Brick,Wood Light Roof or Bamboo Heavy Roof,clay,TImber/Bamboo-Mud,Not attached,...,private,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,4.0
2,2,8.0,380,17.0,Flat,Clay mortar-Stone/Brick,Wood Light Roof or Bamboo Light Roof,brick or stone,Timber-Planck,Not attached,...,private,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
3,2,22.0,250,14.0,Flat,Clay mortar-Stone/Brick,Bamboo/Timber Light roof,clay,TImber/Bamboo-Mud,Not attached,...,private,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
4,2,7.0,312,13.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Bamboo or Timber Light roof,clay,TImber/Bamboo-Mud,Not attached,...,private,0.0,Non-residential,Non-public,Non-industrial,Non-govermental,available,5.0,2.0,2.0
