In [1]:
#Importing important modules
import numpy as np 
import pandas as pd


In [2]:
#Loading data
play_train = pd.read_csv("train.csv")
play_test = pd.read_csv("test.csv")


In [3]:
#Shape of the data
print(play_train.shape)
print(play_test.shape)

(957919, 120)
(493474, 119)


In [4]:
#Glimpse of the training data
play_train.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [5]:
play_train.info(verbose  = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957919 entries, 0 to 957918
Data columns (total 120 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   f1      float64
 2   f2      float64
 3   f3      float64
 4   f4      float64
 5   f5      float64
 6   f6      float64
 7   f7      float64
 8   f8      float64
 9   f9      float64
 10  f10     float64
 11  f11     float64
 12  f12     float64
 13  f13     float64
 14  f14     float64
 15  f15     float64
 16  f16     float64
 17  f17     float64
 18  f18     float64
 19  f19     float64
 20  f20     float64
 21  f21     float64
 22  f22     float64
 23  f23     float64
 24  f24     float64
 25  f25     float64
 26  f26     float64
 27  f27     float64
 28  f28     float64
 29  f29     float64
 30  f30     float64
 31  f31     float64
 32  f32     float64
 33  f33     float64
 34  f34     float64
 35  f35     float64
 36  f36     float64
 37  f37     float64
 38  f38     float64
 39  f39     float64
 40  f

In [6]:
#Are there any missing values 
play_train.isna().values.any() 

True

In [7]:
#Number of missing values by each column in training data
play_train.isna().sum()

id           0
f1       15247
f2       15190
f3       15491
f4       15560
         ...  
f115     15559
f116     15589
f117     15407
f118     15212
claim        0
Length: 120, dtype: int64

In [8]:
#Number of missing values by column in test data
play_test.isna().values.any()

True

In [9]:
play_test.isna().sum()

id         0
f1      7812
f2      7891
f3      7795
f4      7733
        ... 
f114    7942
f115    7977
f116    8083
f117    7763
f118    7885
Length: 119, dtype: int64

In [10]:
#Seperating the target from the training data 
X = play_train.drop(['claim'], axis =1)
y = play_train['claim']



In [11]:
#Splitting training data into training and validation set 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)  
print(X_train.shape, X_val.shape, y_train.shape,y_val.shape)

(766335, 119) (191584, 119) (766335,) (191584,)


In [12]:
#Preparing the test data 
X_test = play_test


In [13]:
#Imputting missing values in training data 
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy= 'mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=X_train.columns)
print(X_train_imputed_df.isna().any())

id      False
f1      False
f2      False
f3      False
f4      False
        ...  
f114    False
f115    False
f116    False
f117    False
f118    False
Length: 119, dtype: bool


In [14]:
#Imputing missing values in validation and test data
X_val_imputed_df = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_test_imputed_df = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
print(X_val_imputed_df.isna().any())
print(X_test_imputed_df.isna().any())


id      False
f1      False
f2      False
f3      False
f4      False
        ...  
f114    False
f115    False
f116    False
f117    False
f118    False
Length: 119, dtype: bool
id      False
f1      False
f2      False
f3      False
f4      False
        ...  
f114    False
f115    False
f116    False
f117    False
f118    False
Length: 119, dtype: bool


In [15]:
#Fitting a Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_imputed_df, y_train)
#Making predictions using validation data
y_predict = model.predict(X_valid_imputed_df)


NameError: name 'X_valid_imputed_df' is not defined

In [None]:
from sklearn import metrics 
print('Accuracy: ', metrics.accuracy_score(y_predict, y_val))


Accuracy:  0.5477701686988475


In [None]:
#Fitting an xg boost classifier 
import xgboost as xgb
model_xgb = xgb.XGBClassifier(n_estimators = 100, max_depth = 5) 
model_xgb.fit(X_train, y_train)
#Naking preditions
y_predict_xgb = model_xgb.predict(X_val)
#model accuracy 
print('Accuracy:', metrics.accuracy_score(y_predict_xgb, y_val))




Accuracy: 0.7384384917320862
