# Lectura de archivos

In [1]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# First run the google.colab authentication step for the current user.
auth.authenticate_user()
# Then build a PyDrive auth object and attach the application-default
# Google credentials to it.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the later cells use to fetch files by their Drive id.
drive = GoogleDrive(gauth)

In [3]:
# Fetch the feature-complete test CSV from Google Drive into the working directory.
id = '1FAjcexe-71nGuYIzvnQ46IdXVcqM9cx4'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('test_values_complete_features.csv')

# Load it, using building_id as the row index.
test_values1 = pd.read_csv(
    'test_values_complete_features.csv',
    index_col='building_id',
    encoding='latin-1',
)

# Convert every object-dtype column to the pandas category dtype.
object_cols = test_values1.select_dtypes('O').columns
test_values1[object_cols] = test_values1[object_cols].astype('category')

In [4]:
# Fetch the feature-complete training CSV from Google Drive into the working directory.
id = '1qs2mEnkqiAqebJE2SvqkrfoV66Edguwr'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train_values_complete_features.csv')

# Load it, using building_id as the row index.
train_values1 = pd.read_csv(
    'train_values_complete_features.csv',
    index_col='building_id',
    encoding='latin-1',
)

# Convert every object-dtype column to the pandas category dtype.
object_cols = train_values1.select_dtypes('O').columns
train_values1[object_cols] = train_values1[object_cols].astype('category')

In [5]:
# Fetch the training labels CSV from Google Drive into the working directory.
id = '1RUtolRcQlR3RGULttM4ZoQaK_Ouow4gc'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train_labels.csv')
# Load the labels with explicit int64 dtypes, indexed by building_id.
train_labels = pd.read_csv(
    'train_labels.csv',
    index_col='building_id',
    dtype={'building_id': 'int64', 'damage_grade': 'int64'},
    encoding='latin-1',
)

In [6]:
# Fetch the original training features CSV from Google Drive into the working directory.
id = '1br3fMwXX_J0XmiXvOm_wfKWvHSj45T3y'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train_values.csv')
# Load it, using building_id as the row index.
train_values2 = pd.read_csv(
    'train_values.csv',
    index_col='building_id',
    encoding='latin-1',
)

# Convert every object-dtype column to the pandas category dtype.
object_cols = train_values2.select_dtypes('O').columns
train_values2[object_cols] = train_values2[object_cols].astype('category')

In [7]:
# Fetch the original test features CSV from Google Drive into the working directory.
id = '1kt2VFhgpfRS72wtBOBy1KDat9LanfMZU'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('test_values.csv')
# Load it, using building_id as the row index.
test_values2 = pd.read_csv(
    'test_values.csv',
    index_col='building_id',
    encoding='latin-1',
)

# Convert every object-dtype column to the pandas category dtype.
object_cols = test_values2.select_dtypes('O').columns
test_values2[object_cols] = test_values2[object_cols].astype('category')

In [8]:
# Snapshot every loaded table; later cells copy from these snapshots instead of
# from the frames they modify.
train_values_complete, test_values_complete = train_values1.copy(), test_values1.copy()
train_values_incomplete, test_values_incomplete = train_values2.copy(), test_values2.copy()
aux = train_labels.copy()

# Logistic Regression

In [9]:
# Working copies for this experiment, taken from the "incomplete" snapshots
# and the saved label snapshot.
train_values, test_values = train_values_incomplete.copy(), test_values_incomplete.copy()
train_labels = aux.copy()

In [10]:
# Column names that the later cells one-hot encode as categorical features.
cat_features = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Numeric column names (not referenced by the cells visible in this section).
num_features = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

In [None]:
# Download the helper module one_hot_encoder.py from Google Drive into the
# working directory so that it can be imported right below.
# NOTE(review): this imports code fetched at runtime — confirm the Drive file is trusted.
id='18b8OQ-P3sMu3VvwBEhNoP-uH-DIPaKsn'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('one_hot_encoder.py')

from one_hot_encoder import encoder

In [None]:
# Run the downloaded `encoder` helper over the training features and rebind
# train_values to its result (re-running this cell encodes the already-encoded frame).
# NOTE(review): presumably one-hot encodes the categorical columns — confirm in one_hot_encoder.py.
train_values = encoder(train_values)

In [None]:
# Remove outliers: flag anomalous training rows with an Isolation Forest.
from sklearn.neighbors import LocalOutlierFactor  # not used in this cell; kept in case other cells rely on it
from sklearn.ensemble import IsolationForest  # the class this cell actually instantiates

# contamination=0.1 asks the forest to treat roughly 10% of the rows as outliers.
# random_state is fixed so the same rows are flagged on every run, matching the
# seed used by the other estimators in this notebook.
iso = IsolationForest(contamination=0.1, random_state=42)
# fit_predict labels each row: -1 for outliers, 1 for inliers.
yhat = iso.fit_predict(train_values)
# Boolean mask, in row order, that is True for the rows to keep.
mask = yhat != -1

In [None]:
# Keep only the rows where mask is True. The mask is a plain boolean array, so
# both frames are assumed to be in the same row order — TODO confirm.
kept_values = train_values.loc[mask, :]
kept_labels = train_labels.loc[mask]
# Rebind both names together, only after both selections have succeeded.
train_values, train_labels = kept_values, kept_labels

In [None]:
# Number of training rows left after the outlier filter.
len(train_values.index)

234541

In [None]:
# Index labels of the rows currently in train_values. Despite the name, these
# are the rows that were kept by the outlier filter, not the outliers.
idx_outliers = train_values.index.tolist()
# Rebuild features and labels from the untouched snapshots, restricted to those labels.
train_values = train_values_incomplete.loc[idx_outliers].copy()
train_labels = aux.loc[idx_outliers].copy()

In [None]:
# Number of training rows; later used to cut the stacked matrix back into train/test.
idx = len(train_values)
# Stack train on top of test so both go through the same encoding.
data_df = pd.concat([train_values, test_values], sort=False)

# Categorical part: same rows, restricted to the cat_features columns.
data_cat = pd.DataFrame(data=data_df, index=data_df.index, columns=cat_features)

# Remaining part: every column that is not listed in cat_features.
data_num = data_df.drop(columns=cat_features)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Fit a default one-hot encoder on the categorical columns of the stacked
# train+test frame, so the categories are learned from both sets together.
enc = OneHotEncoder()
enc.fit(data_cat)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [None]:
# One-hot encode the categorical columns with the fitted encoder.
data_cat_encoded = enc.transform(data_cat)

In [None]:
# Inspect the container type of the encoded categorical block.
type(data_cat_encoded)

scipy.sparse.csr.csr_matrix

In [None]:
# Inspect the container type of the non-categorical block.
type(data_num)

pandas.core.frame.DataFrame

In [None]:
from scipy.sparse import coo_matrix, hstack
# Join the encoded categorical block and the remaining columns side by side
# into a single feature matrix (encoded categorical columns first).
data = hstack((data_cat_encoded,data_num))

In [None]:
# Cast the stacked feature matrix to float16.
# NOTE(review): float16 keeps only about 3 significant digits — confirm every feature fits.
data = data.astype('float16')
# Convert to CSR once so the matrix can be sliced by row, then cut it at the
# train/test boundary recorded in idx.
data_csr = data.tocsr()
X_train, X_test = data_csr[:idx], data_csr[idx:]

In [None]:
# Target vector: the damage_grade column of the training labels.
y_train = train_labels['damage_grade']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation, with a fixed seed.
# X_train / y_train are rebound to the remaining 80%, so re-running this cell
# splits the already-reduced data again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver, up to 500 iterations, fixed seed.
log_reg = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=500,
    random_state=42,
)
# Fit on the training split; the fitted estimator is the cell's displayed value.
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.metrics import f1_score

# Predict on the held-out validation split and report the micro-averaged F1.
y_pred = log_reg.predict(X_valid)
f1_score(y_true=y_valid, y_pred=y_pred, average='micro')

0.7423095781193374

In [None]:
# Predict on the test matrix; this rebinds y_pred, replacing the validation predictions.
y_pred = log_reg.predict(X_test)

In [None]:
# Build the submission table: one int8 damage_grade per test row, indexed like test_values.
predicted_df = pd.DataFrame(
    data=y_pred.astype(np.int8),
    index=test_values.index,
    columns=['damage_grade'],
)
# Write it to CSV in the working directory (the index is written as well).
predicted_df.to_csv('submit_log_reg_outliers.csv')

In [None]:
import pickle

In [None]:
# Serialize the fitted model to the file 'log_reg_outliers'.
# The with-block closes the file handle even if pickling fails, so the file is
# flushed to disk and the handle is not leaked.
with open('log_reg_outliers', 'wb') as model_file:
    pickle.dump(log_reg, model_file)