In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing

%matplotlib inline

In [None]:
# Read the train and test tables from the ../input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Show (rows, columns) of each frame, one per line, train first.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Peek at the last rows of the training frame.
train_df.tail()

In [None]:
# Peek at the first rows of the test frame.
test_df.head()

In [None]:
# Relative frequency of each target class (normalize=True yields fractions, not counts).
train_df['damage_grade'].value_counts(normalize=True)

In [None]:
# Learn the mapping from target class labels to integer codes. The fitted
# encoder is kept under its own name because the submission step uses it to
# turn predicted codes back into the original labels.
le_damage_grade = preprocessing.LabelEncoder()
le_damage_grade.fit(train_df['damage_grade'])

# Store the integer-coded target next to the raw label.
train_df['damage_grade_enc'] = le_damage_grade.transform(train_df['damage_grade'])

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# Columns that exist only in train (the target columns) are NaN on test rows.
all_df = pd.concat(objs=(train_df, test_df), axis=0)
print(all_df.shape)

In [None]:
# Peek at the first rows of the combined train+test frame.
all_df.head()

# Feature engineering

In [None]:
# Load the two per-building auxiliary tables. The three location id columns are
# removed from each before merging.
# NOTE(review): presumably they duplicate columns already in train/test — confirm.
structure_df = pd.read_csv("../input/Building_Structure.csv").drop(
    ['district_id', 'vdcmun_id', 'ward_id'], axis=1)

ownership_df = pd.read_csv("../input/Building_Ownership_Use.csv").drop(
    ['district_id', 'vdcmun_id', 'ward_id'], axis=1)

In [None]:
# Attach structure and ownership attributes to every building. Left joins keep
# every row of all_df even when an auxiliary table has no match.
# Caution: this cell rebinds all_df from itself, so running it twice merges the
# auxiliary tables a second time.
all_df = (
    all_df
    .merge(structure_df, how='left', on='building_id')
    .merge(ownership_df, how='left', on='building_id')
)

In [None]:
# Replace each listed column with an integer-coded "<name>_enc" column.
# A fresh encoder is fitted per column on the combined train+test values.
for column in ['area_assesed', 'land_surface_condition', 'foundation_type',
               'roof_type', 'ground_floor_type', 'other_floor_type', 'position',
               'plan_configuration', 'condition_post_eq', 'legal_ownership_status']:
    codes = preprocessing.LabelEncoder().fit_transform(all_df[column])
    # Append the encoded column at the end, then remove the raw column.
    all_df = all_df.assign(**{column + '_enc': codes}).drop(column, axis=1)

# Numeric features

In [None]:
# List every column of the combined frame after merging and encoding.
all_df.columns

In [None]:
# Columns whose dtype is NOT a NumPy numeric subtype; the numeric feature
# selection further down leaves these out.
all_df.columns[[not np.issubdtype(dt, np.number) for dt in all_df.dtypes]]

In [None]:
# Count of rows per integer code of the encoded 'other_floor_type' column.
all_df['other_floor_type_enc'].value_counts()

In [None]:
# Build the model's feature list: every numeric-dtype column of all_df, minus
# the encoded target and a set of binary flag columns.
# NOTE(review): the notebook does not record why these flags are excluded.
numeric_features = (
    all_df.columns[[np.issubdtype(dtype, np.number) for dtype in all_df.dtypes]]
    # The encoded target must never be used as an input feature.
    .drop(['damage_grade_enc'])
    # Flags deliberately left out of the feature set.
    .drop([
        'has_secondary_use_health_post',
        'has_secondary_use_use_police',
        'has_secondary_use_gov_office',
        'has_secondary_use_school',
        'has_secondary_use_industry',
        'has_secondary_use_institution',
        'has_geotechnical_risk_other',
        'has_secondary_use_hotel',
        'has_geotechnical_risk_liquefaction',
        'has_geotechnical_risk_flood',
        'has_secondary_use_rental',
    ])
)


In [None]:
# Show the final list of feature column names.
numeric_features

In [None]:
# Positions, within numeric_features, of the encoded columns that are passed to
# LightGBM as categorical features. Positions are used rather than names
# because the model is trained on a plain array.
cat_features = [
    np.where(numeric_features.values == name)[0][0]
    for name in ('condition_post_eq_enc', 'area_assesed_enc', 'other_floor_type_enc')
]
cat_features

# Create Validation Dataset

In [None]:
# Split the combined frame back into train and test.
# Only rows that came from train.csv have an encoded target; the test rows
# received NaN in 'damage_grade_enc' when the two frames were concatenated.
# Splitting on that, rather than on a hard-coded row count, stays correct if
# the input files change size or the left merges alter the number of rows.
is_train = all_df['damage_grade_enc'].notnull()

train_df = all_df[is_train]
test_df = all_df[~is_train]

# Restrict both parts to the selected model features.
train_f_df = train_df[numeric_features]
test_f_df = test_df[numeric_features]

print(train_df.shape)
print(test_df.shape)

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): np_utils is not referenced in the visible cells; kept in case
# another part of the notebook relies on it.
from keras.utils import np_utils

# Hold out 20% of the training rows for validation, with a fixed seed so the
# split is reproducible. `.values` replaces `.as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
train_X, valid_X, train_y, valid_y = train_test_split(train_f_df.values,
                                                      train_df['damage_grade_enc'].values,
                                                      test_size=0.20,
                                                      random_state=42)

# Sanity checks: both feature matrices are NumPy arrays ...
print(type(train_X))
print(type(valid_X))

# ... and the feature/target shapes line up.
print(train_X.shape)
print(valid_X.shape)
print(train_y.shape)
print(valid_y.shape)

# Train

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the arrays in LightGBM datasets. The validation set references the
# training set so both are built against the same training data.
lgb_train = lgb.Dataset(data=train_X, label=train_y)
lgb_eval = lgb.Dataset(data=valid_X, label=valid_y, reference=lgb_train)

In [None]:
# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': {'multi_error'},
        'num_class': 5,
        'learning_rate': 0.08,
        'num_leaves': 64,
        'min_data_in_leaf': 200,
        'num_iteration': 2000,
        'max_depth': 6,
        'cat_feature': cat_features,
        'verbose': 0
}

In [None]:
# Train the booster, evaluating on the held-out set after every round and
# stopping once the validation metric has not improved for 30 rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword was removed from lgb.train in LightGBM 4.0,
# while the callback is also available in older releases.
gbm = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=30)])

In [None]:
# Score the validation rows with the best iteration found during training and
# take, for each row, the index of the highest-scoring class.
y_predict = gbm.predict(valid_X, num_iteration=gbm.best_iteration).argmax(axis=1)

In [None]:
from sklearn.metrics import f1_score as f1_score

In [None]:
# Weighted F1 on the validation split: per-class F1 averaged by class support.
print(f1_score(valid_y, y_predict, average='weighted'))

# NOTE(review): the two numbers below are unlabelled; presumably scores
# recorded from earlier runs — confirm what each refers to.
# 0.7617929010528087 0.72905


In [None]:
# Histogram of the predicted class codes for the validation split.
pd.Series(y_predict.flatten()).hist()

# Importance

# Create CSV for submission

In [None]:
# (rows, columns) of the test feature frame that will be scored.
test_f_df.shape

In [None]:
# Peek at the first rows of the test feature frame.
test_f_df.head()

In [None]:
# Score the test rows with the best iteration found during training.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
# Note: this overwrites the validation predictions held in y_predict.
y_predict = gbm.predict(test_f_df.values, num_iteration=gbm.best_iteration)
# Collapse the per-class scores to the index of the highest-scoring class.
y_predict = np.argmax(y_predict, axis=1)

In [None]:
# Map predicted integer codes back to the original damage_grade labels using
# the encoder fitted on the training target.
y_predict_dec = le_damage_grade.inverse_transform(y_predict.astype(int))

In [None]:
# Histogram of the predicted class codes for the test set.
pd.Series(y_predict.flatten()).hist()

In [None]:
# Build the submission table: one row per test building, pairing its id with
# the decoded predicted damage grade (predictions are aligned by position).
sample_submission_df = test_df[['building_id']].assign(damage_grade=y_predict_dec)
sample_submission_df.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sample_submission_df.to_csv('submission.csv', index=False)