In [1]:
# Approach: we will train 6 models, each on a sample of the upsampled data; the final prediction
# will be the mode (majority vote) of the predictions of all 6 models.

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Shared preprocessing objects, created once for the whole notebook.
# NOTE(review): `scaler` is not used anywhere in the visible part of this notebook.
scaler = StandardScaler()
# Module-level encoder; `label_encoder()` re-fits it for every object-dtype column it encodes.
encoder = LabelEncoder()

In [19]:
from imblearn.over_sampling import SMOTE
# SMOTE oversampler used in the "Upsampling" section.
# A fixed seed makes the synthetic samples identical on every Restart & Run All;
# without it each run trains on a different resampled data set.
smote = SMOTE(random_state=42)

In [23]:
from sklearn.metrics import f1_score, plot_confusion_matrix

In [4]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by numpy, scikit-learn and
# imbalanced-learn; consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

In [5]:
from sklearn.model_selection import cross_val_score

In [6]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 150 trees.
# NOTE(review): `random_forest` is rebound to a new RandomForestClassifier in the "Model 1"
# section before anything is fitted, so this instance appears unused in the visible notebook.
random_forest = RandomForestClassifier(n_estimators=150)

In [7]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost classifier with the library's default hyperparameters.
# NOTE(review): never fitted in the visible part of the notebook; presumably reserved for one
# of the later ensemble members — confirm.
adaboost = AdaBoostClassifier()

In [8]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default hyperparameters.
# NOTE(review): never fitted in the visible part of the notebook; presumably reserved for one
# of the later ensemble members — confirm.
logistic_regression = LogisticRegression()

In [9]:
from keras.models import Sequential
from keras.layers import Dense

In [10]:
# Training features: the `building_id` column is dropped right after loading.
data = pd.read_csv("train_values.csv").drop("building_id", axis=1)
# Training target: only the `damage_grade` column of the labels file is kept.
# NOTE(review): assumes the rows of train_labels.csv are in the same order as
# train_values.csv — confirm.
label = pd.read_csv("train_labels.csv").loc[:, "damage_grade"]

# Test features are loaded as-is; unlike `data`, `building_id` is NOT dropped here.
test_data = pd.read_csv("test_values.csv")

### Special Functions

In [11]:
def label_encoder(data):
    """Integer-encode every object-dtype column of ``data``.

    Each object-dtype column is replaced by the output of the module-level
    ``encoder.fit_transform`` applied to that column. The encoder is re-fitted
    for every column, so it only holds the mapping of the last column encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient re-assignment.
    """
    for feature in data.columns:
        # `np.object` was only an alias of the builtin `object`; it was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24 (AttributeError), so compare against the
        # builtin directly. The comparison result is identical on older NumPy versions.
        if data[feature].dtype == object:
            data[feature] = encoder.fit_transform(data[feature])
    return data

### Feature Engineering

In [12]:
# Cap the floor count at 4 and store it as a string ("4" stands for 4 or more floors).
# The same transformation is applied to the training and the test frame.
# NOTE: not idempotent — on a second run the column already holds strings and the
# comparison against 4 fails.
for frame in (data, test_data):
    frame["count_floors_pre_eq"] = frame["count_floors_pre_eq"].apply(
        lambda floors: "4" if floors >= 4 else str(floors)
    )

In [13]:
# Cap the family count at 4 and store it as a string ("4" stands for 4 or more families).
# The same transformation is applied to the training and the test frame.
# NOTE: not idempotent — on a second run the column already holds strings and the
# comparison against 4 fails.
for frame in (data, test_data):
    frame["count_families"] = frame["count_families"].apply(
        lambda families: "4" if families >= 4 else str(families)
    )

In [14]:
def new_age_variable(age):
    """Map a building age to an ordinal bucket code.

    Buckets (upper bounds inclusive): <=10 -> 1, <=25 -> 2, <=50 -> 3,
    <=100 -> 4, anything else -> 5.
    """
    upper_bounds = (10, 25, 50, 100)
    for bucket, bound in enumerate(upper_bounds, start=1):
        if age <= bound:
            return bucket
    # Larger than every bound (or not comparable as "<=", e.g. NaN).
    return len(upper_bounds) + 1
    
# Replace the raw age with its bucket code in both the training and the test frame.
for frame in (data, test_data):
    frame["age"] = frame["age"].apply(new_age_variable)

In [15]:
def new_height_variable(height):
    """Map a height value to an ordinal bucket code.

    Buckets (upper bounds inclusive): <=4 -> 1, <=8 -> 2, anything else -> 3.
    """
    if height <= 4:
        return 1
    return 2 if height <= 8 else 3
    
# Replace the raw height value with its bucket code in both the training and the test frame.
for frame in (data, test_data):
    frame["height_percentage"] = frame["height_percentage"].apply(new_height_variable)

In [16]:
def new_area_variable(area):
    """Map an area value to an ordinal bucket code.

    Buckets (upper bounds inclusive): <=4 -> 1, <=10 -> 2, anything else -> 3.
    """
    for bucket, upper_bound in ((1, 4), (2, 10)):
        if area <= upper_bound:
            return bucket
    return 3
    
# Replace the raw area value with its bucket code in both the training and the test frame.
for frame in (data, test_data):
    frame["area_percentage"] = frame["area_percentage"].apply(new_area_variable)

### Feature Extraction

In [17]:
# `has_superstructure_*` columns that are added together into `material_used`.
superstructure_flags = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
]
first_flag, *other_flags = superstructure_flags

# Row-wise total of the listed columns, built the same way for the training and test frame.
# Starting the sum from the first column keeps this an exact left-to-right `+` chain.
for frame in (data, test_data):
    frame["material_used"] = sum((frame[flag] for flag in other_flags), frame[first_flag])

### Feature Preprocessing

In [18]:
# Integer-encode the object-dtype columns of each frame; `label_encoder` modifies the frame
# it receives and returns that same object.
# NOTE(review): the shared encoder is re-fitted independently for each frame, so a category
# only receives the same code in `data` and `test_data` if both frames contain the same set
# of values in that column — confirm, or fit on the training frame and reuse the mapping.
data = label_encoder(data)
test_data = label_encoder(test_data)

### Upsampling

In [20]:
# Oversample the training set with SMOTE.
# `fit_sample` was a deprecated alias that newer imbalanced-learn releases no longer provide;
# `fit_resample` is the supported method and returns the same (features, labels) pair.
upsampled_data, upsampled_label = smote.fit_resample(data, label)

### Model 1

In [28]:
# Model 1: random forest fitted on the SMOTE-upsampled training data.
# `random_state` is fixed so the fitted forest (and its score) is reproducible between runs.
# NOTE(review): the training data has already been resampled by SMOTE, so
# class_weight="balanced" probably has little additional effect — confirm.
random_forest = RandomForestClassifier(class_weight="balanced", random_state=42)
random_forest.fit(upsampled_data, upsampled_label)

RandomForestClassifier(class_weight='balanced')

In [29]:
# Predict on the original (non-upsampled) training frame.
# NOTE(review): the forest was fitted on a SMOTE-resampled version of this same `data`, so
# anything computed from `prediction_1` is a training-set result, not a held-out estimate.
prediction_1 = random_forest.predict(data)

In [None]:
### Model 2

In [None]:
### Model 3

In [None]:
### Model 4

In [None]:
### Model 5

In [30]:
# Micro-averaged F1 of Model 1 against the training labels.
# NOTE(review): this is measured on data the model was trained on; use a hold-out split or
# cross-validation (`cross_val_score` is already imported) for a generalisation estimate.
f1_score(label, prediction_1, average="micro")

0.9202190321602757