***objective***

Recently, there has been an increase in the number of building collapses in Lagos and other major cities in Nigeria. Olusola Insurance Company offers a building insurance policy that protects buildings against damage caused by fire, vandalism, flood, or storm.
You have been appointed as the Lead Data Analyst to build a predictive model that determines whether a building will have an insurance claim during a given period.
You will have to predict the probability of a building having at least one claim over its insured period.

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder  
from sklearn.preprocessing import StandardScaler  
from scipy import sparse
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
import random

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the four competition files from the working directory, in order:
# training data, test data, the sample submission and the variable descriptions.
train, test, sample_submission, description_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_data.csv",
        "test_data.csv",
        "sample_submission.csv",
        "VariableDescription.csv",
    )
)

In [3]:
# Preview the first rows of the training data (features plus the Claim label).
train.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [4]:
# Preview the first rows of the test data (same columns as train, without Claim).
test.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321


In [5]:
# Preview the submission template.
# Scratch note (feature columns jotted down here; not part of the submission file):
#   YearOfObservation, Residential, Building_Painted, Building_Fenced, Garden, Settlement,
#   Building_Type, Date_of_Occupancy, NumberOfWindows
sample_submission.head()

Unnamed: 0,Customer Id,Claim
0,H0,1
1,H10000,1
2,H10001,1
3,H10002,1
4,H10003,1


In [7]:
# Preview the variable dictionary (column name -> description).
description_data.head()

Unnamed: 0,Variable,Description
0,Customer Id,Identification number for the Policy holder
1,YearOfObservation,year of observation for the insured policy
2,Insured_Period,duration of insurance policy in Olusola Insura...
3,Residential,is the building a residential building or not
4,Building_Painted,"is the building painted or not (N-Painted, V-N..."


In [8]:
# Count the missing values in each column of the training data.
train.isna().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [9]:
# (rows, columns) of the training data.
train.shape

(7160, 14)

In [10]:
# (rows, columns) of the test data.
test.shape

(3069, 13)

In [11]:
# Count the missing values in each column of the test data.
test.isna().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  4
Settlement              0
Building Dimension     13
Building_Type           0
Date_of_Occupancy     728
NumberOfWindows         0
Geo_Code               13
dtype: int64

# Build preliminary model

In [12]:
# Put train and test on the same raw feature columns, in the same order.
# Nothing is dropped for missingness at this point: the columns with missing values
# (Garden, Building Dimension, Date_of_Occupancy, Geo_Code) and the identifier are
# still kept here and are removed in the cells that follow.
raw_feature_cols = ['Customer Id', 'YearOfObservation', 'Insured_Period', 'Residential',
                    'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
                    'Building Dimension', 'Building_Type', 'Date_of_Occupancy',
                    'NumberOfWindows', 'Geo_Code']

# 'Claim' is the label to predict and is not one of the feature columns, so it has to
# be set aside before `train` is narrowed -- otherwise it is lost for model fitting.
# The guard keeps the cell re-runnable once `train` no longer carries the column.
if 'Claim' in train.columns:
    y_train = train['Claim'].copy()

train = train[raw_feature_cols]
test = test[raw_feature_cols]

In [13]:
# Narrow the training frame to the columns kept for the preliminary model.
# Left out: the customer identifier and the columns with missing values
# (Garden, Building Dimension, Date_of_Occupancy, Geo_Code).
prelim_train_cols = [
    'YearOfObservation', 'Insured_Period', 'Residential', 'Building_Painted',
    'Building_Fenced', 'Settlement', 'Building_Type', 'NumberOfWindows',
]
train = train.loc[:, prelim_train_cols]

In [14]:
# Keep the test-set customer IDs aside before the identifier column is removed from
# `test` (presumably needed to label the rows of the submission file -- confirm).
test_cust_id = test['Customer Id']

In [15]:
# Narrow the test frame to the same preliminary-model columns as the training frame.
# Left out: the customer identifier and the columns with missing values
# (Garden, Building Dimension, Date_of_Occupancy, Geo_Code).
prelim_test_cols = [
    'YearOfObservation', 'Insured_Period', 'Residential', 'Building_Painted',
    'Building_Fenced', 'Settlement', 'Building_Type', 'NumberOfWindows',
]
test = test.loc[:, prelim_test_cols]

In [None]:
# Split the categorical features out of the training frame and one-hot encode them.
categorical_cols = ['YearOfObservation', 'Residential', 'Building_Painted', 'Building_Fenced',
                    'Settlement', 'Building_Type', 'NumberOfWindows']
train_cat = train[categorical_cols]

# `columns=` is passed explicitly so that every listed column is encoded, including
# the ones stored as numbers (by default get_dummies only converts object/category
# columns). NumberOfWindows shows a '.' placeholder in the data preview; it is encoded
# as just another category here.
train_cat2 = pd.get_dummies(train_cat, columns=categorical_cols, drop_first=False)
train_cat2 = train_cat2.reset_index(drop=True)


In [None]:
#one hot encode categorical

In [None]:
#split into continuous data

In [None]:
#merge data

In [None]:
#build model

In [None]:
#submission

# Model 2

In [None]:
#Perform EDA

In [None]:
#Fix missing data

In [None]:
#Data engineering

In [None]:
#data modeling

In [None]:
#submission