In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Aug-2022/train.csv'
file_key_2 = 'Tabular-Playground-Aug-2022/test.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train = train.drop(columns = ['id'], axis = 1)

test = pd.read_csv(file_content_stream_2)
test = test.drop(columns = ['id'], axis = 1)

## Changing labels to dummies
train_dummies = pd.get_dummies(train[['attribute_0']])
train = train.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
train = pd.concat([train, train_dummies], axis = 1)

test_dummies = pd.get_dummies(test[['attribute_0']])
test = test.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
test = pd.concat([test, test_dummies], axis = 1)

## Filling missing values with kNN
knn_imputer = KNNImputer(n_neighbors = 5, weights = 'distance')
train = pd.DataFrame(knn_imputer.fit_transform(train), columns = train.columns)
test = pd.DataFrame(knn_imputer.fit_transform(test), columns = test.columns)

## Engineering features
train['feature_1'] = np.where(train['loading'] < 150, 0, 1)
test['feature_1'] = np.where(test['loading'] < 150, 0, 1)

## Defining input and target variables
X = train[['loading', 'measurement_2', 'measurement_4', 'measurement_5',
           'measurement_6', 'measurement_7', 'measurement_8', 'measurement_15',
           'measurement_17', 'feature_1']]
# X = train.drop(columns = ['failure'], axis = 1)
Y = train['failure']

In [None]:
X.describe()

In [None]:
## Building Random Forest model
RF_md = RandomForestClassifier(n_estimators = 300, max_depth = 3, criterion = 'gini').fit(X, Y)

In [None]:
importance = pd.DataFrame({'feature': X.columns, 'Imp': RF_md.feature_importances_})
importance = importance.sort_values(by = 'Imp', ascending = False)
importance

In [None]:
from sklearn.feature_selection import RFECV

## Running RFE with Random forest
RF_auto_feature = RFECV(estimator = RandomForestClassifier(n_estimators = 300, max_depth = 3), step = 1, scoring = 'roc_auc', min_features_to_select = 10, cv = 3, n_jobs = -1).fit(X, Y)

## Appending results 
X.columns[RF_auto_feature.support_]

In [None]:
X.columns[RF_auto_feature.support_]

In [None]:
X.describe()

In [None]:
test.describe()

In [None]:
## Building the decision tree on the train data-frame
tree_md = DecisionTreeClassifier(max_depth = 3).fit(X, Y)

## Visualizing the decision-tree model 
fig = plt.figure(figsize = (25, 15))
plot_tree(tree_md, feature_names = X.columns, filled = True)

In [None]:
fig, axes = plt.subplots(1, 2, figsize = (18, 12))

sns.boxplot(ax = axes[0], x = 'failure', y = 'loading', hue = 'failure', data = train)
sns.boxplot(ax = axes[1], x = 'failure', y = 'measurement_3', hue = 'failure', data = train)

In [None]:
train.head()

In [None]:
train['attribute_1'].value_counts()

In [None]:
test['attribute_1'].value_counts()

In [None]:
test.head()

In [None]:
test.describe()

In [None]:
## Defining the model 
knn_imputer = KNNImputer(n_neighbors = 5, weights = 'uniform')
X_new = knn_imputer.fit_transform(X)

In [None]:
X_new = pd.DataFrame(X_new)

In [None]:
X_new.head()

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.describe()

In [None]:
test.head()

In [None]:
test.describe()

In [None]:
test.shape

In [None]:
train['attribute_0'].value_counts()

In [None]:
train['attribute_1'].value_counts()

In [None]:
test['attribute_0'].value_counts()

In [None]:
test['attribute_1'].value_counts()

In [None]:
X.head()

In [None]:
X.shape

In [None]:
test.head()

In [None]:
train.head()

In [None]:
train['attribute_1'].value_counts()

In [None]:
train.describe()

In [None]:
train['product_code'].value_counts()

In [None]:
train['attribute_0'].value_counts()

In [None]:
train['attribute_1'].value_counts()

In [None]:
test.head()

In [None]:
train_dummies = pd.get_dummies(train[['product_code', 'attribute_0', 'attribute_1']])
train_dummies.head()

In [None]:
train_dummies = pd.get_dummies(train[['product_code', 'attribute_0', 'attribute_1']])
train = train.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
train = pd.concat([train, train_dummies], axis = 1)
train.head()

In [None]:
train['failure'].value_counts() / train.shape[0]

In [None]:
test_dummies = pd.get_dummies(test[['product_code', 'attribute_0', 'attribute_1']])
test = test.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
test = pd.concat([test, test_dummies], axis = 1)
test.head()

# Logistic Regression

In [10]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Aug-2022/train.csv'
file_key_2 = 'Tabular-Playground-Aug-2022/test.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train = train.drop(columns = ['id'], axis = 1)

test = pd.read_csv(file_content_stream_2)
test_id = test['id']
test = test.drop(columns = ['id'], axis = 1)

## Changing labels to dummies
train_dummies = pd.get_dummies(train[['attribute_0']])
train = train.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
train = pd.concat([train, train_dummies], axis = 1)

test_dummies = pd.get_dummies(test[['attribute_0']])
test = test.drop(columns = ['product_code', 'attribute_0', 'attribute_1'], axis = 1)
test = pd.concat([test, test_dummies], axis = 1)

## Filling missing values with kNN
knn_imputer = KNNImputer(n_neighbors = 5, weights = 'distance')
train = pd.DataFrame(knn_imputer.fit_transform(train), columns = train.columns)
test = pd.DataFrame(knn_imputer.fit_transform(test), columns = test.columns)

## Defining input and target variables
X = train.drop(columns = ['failure'], axis = 1)
Y = train['failure']

## Scaling inputs to 0-1
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
test = pd.DataFrame(scaler.fit_transform(test), columns = test.columns)

## Defining the hyper-parameter grid
logistic_param_grid = {'penalty': ['l1', 'l2', 'elasticnet'],
                       'C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'solver': ['liblinear', 'sag', 'saga']}

## Performing grid search with 5 folds
logistic_grid_search = GridSearchCV(LogisticRegression(max_iter = 1000), logistic_param_grid, cv = 5, scoring = 'roc_auc', n_jobs = -1, verbose = 1).fit(X, Y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


120 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l

In [11]:
best_params = logistic_grid_search.best_params_
print(best_params)

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [12]:
best_score = logistic_grid_search.best_score_
print('The best area under the ROC cure is:', best_score)

The best area under the ROC cure is: 0.591554902498434


In [13]:
## Extracting the best model
logistic_md = logistic_grid_search.best_estimator_
print(logistic_md)

LogisticRegression(C=1, max_iter=1000, penalty='l1', solver='liblinear')


In [37]:
logit_md = LogisticRegression(C = 1, penalty = 'l1', solver = 'liblinear', max_iter = 1000).fit(X, Y)
coefs =  pd.DataFrame({'feature': X.columns, 'est_coef': abs(logit_md.coef_.flatten())})
coefs = coefs.sort_values(by = 'est_coef', ascending = False)
coefs

Unnamed: 0,feature,est_coef
0,loading,2.668642
21,attribute_0_material_5,1.344234
22,attribute_0_material_7,1.218404
20,measurement_17,0.726453
5,measurement_2,0.347865
7,measurement_4,0.29836
14,measurement_11,0.133324
12,measurement_9,0.130638
17,measurement_14,0.095053
8,measurement_5,0.091406
