In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Aug-2022/train.csv'
file_key_2 = 'Tabular-Playground-Aug-2022/test.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train = train.drop(columns = ['id'], axis = 1)

test = pd.read_csv(file_content_stream_2)
test = test.drop(columns = ['id'], axis = 1)

## Changing labels to dummies
train_dummies = pd.get_dummies(train[['attribute_0']])
train = train.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
train = pd.concat([train, train_dummies], axis = 1)

test_dummies = pd.get_dummies(test[['attribute_0']])
test = test.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
test = pd.concat([test, test_dummies], axis = 1)

## Defining input and target variables
X = train.drop(columns = ['failure'], axis = 1)
Y = train['failure']

In [2]:
train.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7
0,80.1,9,5,7,8,4,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0,0,1
1,84.89,9,5,14,3,3,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0,0,1
2,82.43,9,5,12,1,5,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0,0,1
3,101.07,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0,0,1
4,188.06,9,5,9,2,8,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0,0,1


In [3]:
train.shape

(26570, 24)

In [4]:
train.describe()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7
count,26320.0,26570.0,26570.0,26570.0,26570.0,26570.0,26189.0,26032.0,25894.0,25774.0,25633.0,25522.0,25343.0,25270.0,25102.0,24969.0,24796.0,24696.0,24561.0,24460.0,24286.0,26570.0,26570.0,26570.0
mean,127.826233,6.754046,7.240459,7.415883,8.232518,6.256568,17.791528,11.731988,17.127804,17.510759,11.716624,19.024714,11.430725,16.117711,19.172085,11.702464,15.652904,16.048444,14.995554,16.460727,701.269059,0.212608,0.197591,0.802409
std,39.03002,1.471852,1.456493,4.11669,4.199401,3.309109,1.0012,0.996085,0.996414,0.99598,1.000836,1.008591,0.999137,1.405978,1.520785,1.488838,1.155247,1.491923,1.549226,1.708935,123.304161,0.40916,0.39819,0.39819
min,33.16,5.0,5.0,0.0,0.0,0.0,13.968,8.008,12.073,12.715,7.968,15.217,7.537,9.323,12.461,5.167,10.89,9.14,9.104,9.701,196.787,0.0,0.0,0.0
25%,99.9875,6.0,6.0,4.0,5.0,4.0,17.117,11.051,16.443,16.839,11.045,18.34025,10.757,15.209,18.17,10.703,14.89,15.057,13.957,15.268,618.9615,0.0,0.0,1.0
50%,122.39,6.0,8.0,7.0,8.0,6.0,17.787,11.733,17.132,17.516,11.712,19.021,11.43,16.127,19.2115,11.717,15.6285,16.04,14.969,16.436,701.0245,0.0,0.0,1.0
75%,149.1525,8.0,8.0,10.0,11.0,8.0,18.469,12.41,17.805,18.178,12.391,19.708,12.102,17.025,20.207,12.709,16.374,17.082,16.018,17.628,784.09025,0.0,0.0,1.0
max,385.86,9.0,9.0,29.0,29.0,24.0,21.499,16.484,21.425,21.543,15.419,23.807,15.412,22.479,25.64,17.663,22.713,22.303,21.626,24.094,1312.794,1.0,1.0,1.0


In [5]:
test.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,attribute_0_material_5,attribute_0_material_7
0,119.57,6,4,6,9,6,19.305,10.178,17.534,18.168,11.598,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612,1,0
1,113.51,6,4,11,8,0,17.883,11.927,17.228,16.033,11.179,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037,1,0
2,112.16,6,4,8,12,4,18.475,10.481,16.619,18.189,12.126,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995,1,0
3,112.72,6,4,8,11,10,16.518,10.888,15.293,18.592,11.304,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301,1,0
4,208.0,6,4,14,16,8,17.808,12.693,17.678,15.814,13.431,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044,1,0


In [6]:
test.describe()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,attribute_0_material_5,attribute_0_material_7
count,20552.0,20775.0,20775.0,20775.0,20775.0,20775.0,20446.0,20366.0,20267.0,20151.0,20055.0,19929.0,19871.0,19708.0,19639.0,19535.0,19472.0,19335.0,19233.0,19097.0,19035.0,20775.0,20775.0
mean,127.634895,7.733959,6.196823,7.453574,8.962407,6.126931,17.793466,11.727331,17.13808,17.515797,11.710919,19.030765,11.417921,16.123986,18.846571,11.91423,15.736104,16.123958,15.115915,16.636052,701.389816,0.506811,0.493189
std,39.154642,1.308535,1.917478,4.274477,4.3342,3.835881,1.002206,1.006834,1.008714,1.000067,1.001096,1.005401,0.999953,1.565414,1.588642,1.363253,1.357019,1.410569,1.545069,1.643463,130.205829,0.499966,0.499966
min,37.7,6.0,4.0,0.0,0.0,0.0,13.565,7.384,12.215,13.539,7.853,14.885,7.578,9.167,13.127,6.116,9.209,8.415,8.417,10.162,1.671,0.0,0.0
25%,99.47,6.0,4.0,4.0,6.0,3.0,17.119,11.04825,16.457,16.847,11.035,18.351,10.744,15.095,17.714,11.069,14.871,15.238,14.082,15.512,618.7235,0.0,0.0
50%,122.11,7.0,5.0,7.0,9.0,6.0,17.789,11.729,17.132,17.51,11.704,19.04,11.414,16.1095,18.81,11.941,15.734,16.119,15.062,16.706,701.379,1.0,0.0
75%,148.84,9.0,7.0,10.0,12.0,8.0,18.478,12.411,17.8245,18.197,12.385,19.707,12.093,17.156,19.9675,12.791,16.605,17.0015,16.107,17.781,784.8725,1.0,1.0
max,385.57,9.0,9.0,30.0,33.0,28.0,21.389,15.623,21.681,21.183,15.828,23.092,15.091,23.354,24.95,18.962,21.677,23.14,22.097,22.27,1242.786,1.0,1.0


In [7]:
test.shape

(20775, 23)

In [31]:
train['attribute_0'].value_counts()

material_7    21320
material_5     5250
Name: attribute_0, dtype: int64

In [32]:
train['attribute_1'].value_counts()

material_8    10865
material_5    10362
material_6     5343
Name: attribute_1, dtype: int64

In [33]:
test['attribute_0'].value_counts()

material_5    10529
material_7    10246
Name: attribute_0, dtype: int64

In [34]:
test['attribute_1'].value_counts()

material_6    10529
material_5     5228
material_7     5018
Name: attribute_1, dtype: int64

In [21]:
X.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,product_code_A,product_code_B,product_code_C,product_code_D,product_code_E,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_8
0,80.1,9,5,7,8,4,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,1,0,0,0,0,0,1,0,0,1
1,84.89,9,5,14,3,3,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,1,0,0,0,0,0,1,0,0,1
2,82.43,9,5,12,1,5,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,1,0,0,0,0,0,1,0,0,1
3,101.07,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,1,0,0,0,0,0,1,0,0,1
4,188.06,9,5,9,2,8,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,1,0,0,0,0,0,1,0,0,1


In [22]:
X.shape

(26570, 31)

In [24]:
test.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,product_code_F,product_code_G,product_code_H,product_code_I,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7
0,119.57,6,4,6,9,6,19.305,10.178,17.534,18.168,11.598,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612,1,0,0,0,1,0,0,1,0
1,113.51,6,4,11,8,0,17.883,11.927,17.228,16.033,11.179,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037,1,0,0,0,1,0,0,1,0
2,112.16,6,4,8,12,4,18.475,10.481,16.619,18.189,12.126,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995,1,0,0,0,1,0,0,1,0
3,112.72,6,4,8,11,10,16.518,10.888,15.293,18.592,11.304,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301,1,0,0,0,1,0,0,1,0
4,208.0,6,4,14,16,8,17.808,12.693,17.678,15.814,13.431,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044,1,0,0,0,1,0,0,1,0


In [11]:
train.head()

Unnamed: 0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [25]:
train['attribute_1'].value_counts()

KeyError: 'attribute_1'

In [3]:
train.describe()

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
count,26570.0,26320.0,26570.0,26570.0,26570.0,26570.0,26570.0,26189.0,26032.0,25894.0,25774.0,25633.0,25522.0,25343.0,25270.0,25102.0,24969.0,24796.0,24696.0,24561.0,24460.0,24286.0,26570.0
mean,13284.5,127.826233,6.754046,7.240459,7.415883,8.232518,6.256568,17.791528,11.731988,17.127804,17.510759,11.716624,19.024714,11.430725,16.117711,19.172085,11.702464,15.652904,16.048444,14.995554,16.460727,701.269059,0.212608
std,7670.242662,39.03002,1.471852,1.456493,4.11669,4.199401,3.309109,1.0012,0.996085,0.996414,0.99598,1.000836,1.008591,0.999137,1.405978,1.520785,1.488838,1.155247,1.491923,1.549226,1.708935,123.304161,0.40916
min,0.0,33.16,5.0,5.0,0.0,0.0,0.0,13.968,8.008,12.073,12.715,7.968,15.217,7.537,9.323,12.461,5.167,10.89,9.14,9.104,9.701,196.787,0.0
25%,6642.25,99.9875,6.0,6.0,4.0,5.0,4.0,17.117,11.051,16.443,16.839,11.045,18.34025,10.757,15.209,18.17,10.703,14.89,15.057,13.957,15.268,618.9615,0.0
50%,13284.5,122.39,6.0,8.0,7.0,8.0,6.0,17.787,11.733,17.132,17.516,11.712,19.021,11.43,16.127,19.2115,11.717,15.6285,16.04,14.969,16.436,701.0245,0.0
75%,19926.75,149.1525,8.0,8.0,10.0,11.0,8.0,18.469,12.41,17.805,18.178,12.391,19.708,12.102,17.025,20.207,12.709,16.374,17.082,16.018,17.628,784.09025,0.0
max,26569.0,385.86,9.0,9.0,29.0,29.0,24.0,21.499,16.484,21.425,21.543,15.419,23.807,15.412,22.479,25.64,17.663,22.713,22.303,21.626,24.094,1312.794,1.0


In [5]:
train['product_code'].value_counts()

C    5765
E    5343
B    5250
D    5112
A    5100
Name: product_code, dtype: int64

In [6]:
train['attribute_0'].value_counts()

material_7    21320
material_5     5250
Name: attribute_0, dtype: int64

In [7]:
train['attribute_1'].value_counts()

material_8    10865
material_5    10362
material_6     5343
Name: attribute_1, dtype: int64

In [12]:
test.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,material_5,material_6,6,4,6,9,6,19.305,10.178,17.534,18.168,11.598,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,material_5,material_6,6,4,11,8,0,17.883,11.927,17.228,16.033,11.179,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,material_5,material_6,6,4,8,12,4,18.475,10.481,16.619,18.189,12.126,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,material_5,material_6,6,4,8,11,10,16.518,10.888,15.293,18.592,11.304,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,material_5,material_6,6,4,14,16,8,17.808,12.693,17.678,15.814,13.431,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [14]:
train_dummies = pd.get_dummies(train[['product_code', 'attribute_0', 'attribute_1']])
train_dummies.head()

Unnamed: 0,product_code_A,product_code_B,product_code_C,product_code_D,product_code_E,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_8
0,1,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,1
3,1,0,0,0,0,0,1,0,0,1
4,1,0,0,0,0,0,1,0,0,1


In [15]:
train_dummies = pd.get_dummies(train[['product_code', 'attribute_0', 'attribute_1']])
train = train.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
train = pd.concat([train, train_dummies], axis = 1)
train.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,product_code_A,product_code_B,product_code_C,product_code_D,product_code_E,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_8
0,80.1,9,5,7,8,4,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0,1,0,0,0,0,0,1,0,0,1
1,84.89,9,5,14,3,3,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0,1,0,0,0,0,0,1,0,0,1
2,82.43,9,5,12,1,5,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0,1,0,0,0,0,0,1,0,0,1
3,101.07,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0,1,0,0,0,0,0,1,0,0,1
4,188.06,9,5,9,2,8,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0,1,0,0,0,0,0,1,0,0,1


In [19]:
train['failure'].value_counts() / train.shape[0]

0    0.787392
1    0.212608
Name: failure, dtype: float64

In [16]:
test_dummies = pd.get_dummies(test[['product_code', 'attribute_0', 'attribute_1']])
test = test.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
test = pd.concat([test, test_dummies], axis = 1)
test.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,product_code_F,product_code_G,product_code_H,product_code_I,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7
0,119.57,6,4,6,9,6,19.305,10.178,17.534,18.168,11.598,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612,1,0,0,0,1,0,0,1,0
1,113.51,6,4,11,8,0,17.883,11.927,17.228,16.033,11.179,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037,1,0,0,0,1,0,0,1,0
2,112.16,6,4,8,12,4,18.475,10.481,16.619,18.189,12.126,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995,1,0,0,0,1,0,0,1,0
3,112.72,6,4,8,11,10,16.518,10.888,15.293,18.592,11.304,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301,1,0,0,0,1,0,0,1,0
4,208.0,6,4,14,16,8,17.808,12.693,17.678,15.814,13.431,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044,1,0,0,0,1,0,0,1,0
