In [128]:
!pip install -q pandas_profiling

In [53]:
# Start writing code here...from   category_encoders          import *
import numpy as np
import pandas as pd
from   sklearn.compose            import *
from   sklearn.ensemble           import RandomForestClassifier, ExtraTreesClassifier, IsolationForest, GradientBoostingClassifier
from   sklearn.experimental       import enable_iterative_imputer
from   sklearn.impute             import *
from   sklearn.linear_model       import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from   sklearn.metrics            import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score # Evaluation metric 2.0 
from   sklearn.pipeline           import Pipeline
from   sklearn.preprocessing      import *
from   sklearn.tree               import DecisionTreeClassifier, ExtraTreeClassifier
from   sklearn.model_selection    import train_test_split
from   sklearn.model_selection    import RandomizedSearchCV
from   sklearn.svm                import LinearSVC
from   sklearn.base               import BaseEstimator
from   sklearn.decomposition      import PCA
from   sklearn.model_selection    import cross_val_score, cross_val_predict
from   sklearn.cluster            import KMeans
from   sklearn.metrics            import plot_confusion_matrix
from   sklearn.ensemble           import StackingClassifier
from   sklearn.pipeline           import make_pipeline
from   pandas_profiling           import ProfileReport



In [84]:
# Load train data
data = pd.read_csv('train_ml2_2021.csv')
y = data['target']
X = data.drop(['target'], axis=1)

In [131]:
# Load test data
test = pd.read_csv('test0.csv')
X_test = test.drop(['target', 'obs_id'], axis=1)

In [151]:
y_submission = test['obs_id']

In [86]:
X.shape, y.shape

((8302, 979), (8302,))

In [87]:
# no null values
X.isnull().values.any(), y.isnull().values.any()

(False, False)

In [88]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.75)

In [89]:
# Selected high potential algorithms with default settings
algos = [RandomForestClassifier(),
        ExtraTreesClassifier(),
        # GradientBoostingClassifier(),
        LogisticRegression(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        SGDClassifier(),
        DecisionTreeClassifier(),
        ExtraTreeClassifier()]

In [90]:
for algo in algos:
    pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')), 
                     ('scaler', StandardScaler()),
                    ('algo', algo)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    accuracy = accuracy_score(y_pred, y_val)
    print(f"{algo} : {accuracy}")

RandomForestClassifier() : 0.6936416184971098
ExtraTreesClassifier() : 0.6897880539499036
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression() : 0.5529865125240848
PassiveAggressiveClassifier() : 0.5544315992292871
RidgeClassifier() : 0.5939306358381503
SGDClassifier() : 0.5616570327552987
DecisionTreeClassifier() : 0.5958574181117534
ExtraTreeClassifier() : 0.5178227360308285


In [70]:
# Create a dummy estimator for random search
class DummyEstimator(BaseEstimator):
    """Pass through class, methods are present but do nothing."""
    def fit(self): pass
    def score(self): pass

pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')), 
                ('scaler', StandardScaler()),
                ('classifier', DummyEstimator())])

In [71]:
search_space = [{'classifier': [RandomForestClassifier()],
                'classifier__criterion': ['gini', 'entropy'],
                'classifier__n_estimators': [50,100,150,200],
                'classifier__max_features': ['auto','sqrt','log2'],
                'classifier__max_depth': [20,50,100,200],
                'classifier__class_weight' : ['balanced','balanced_subsample']}]

In [72]:
# Apply RandomizedSearch with pip and search space
clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                   param_distributions=search_space, 
                                   n_iter=50,
                                   cv=2, 
                                   n_jobs=-1,
                                   verbose=1)

best_model = clf_algos_rand.fit(X_train, y_train)

best_model.best_estimator_.get_params()['classifier']

Fitting 2 folds for each of 50 candidates, totalling 100 fits


RandomForestClassifier(class_weight='balanced', max_depth=100, n_estimators=150)

In [91]:
# best model and it's hyperperamters
pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler()),
                ('clm',  RandomForestClassifier(class_weight='balanced',
                                                criterion='entropy',
                                                max_depth=100,
                                                n_estimators=150,
                                                n_jobs=-1))])

In [92]:
# Getting accuracy score for training data from tuned model - worse
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
accuracy = accuracy_score(y_pred, y_val)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.6907514450867052


In [97]:
# Fit tuned model to entire training data
pipe.fit(X, y)
y_pred = pipe.predict(X_test)


In [133]:
y_submission['target'] = pd.Series(y_pred)
y_submission.head()

Unnamed: 0,obs_id,target
0,0,1
1,1,1
2,2,0
3,3,1
4,4,1


In [135]:
y_submission = y_submission.set_index('obs_id')
y_submission.head()

Unnamed: 0_level_0,target
obs_id,Unnamed: 1_level_1
0,1
1,1
2,0
3,1
4,1


In [136]:
y_submission.to_csv('progress_report1_predictions.csv')

In [138]:
# PCA and Kmeans clusters
pipe2 = Pipeline([('pca', PCA(n_components=.99)), # found that 4 components retain 99% variance of y
                ('k-mean', KMeans(n_clusters=20)),
                ('scaler', StandardScaler()),
                ('clm',  ExtraTreesClassifier(class_weight='balanced_subsample',
                                                  criterion='entropy',
                                                  max_depth=100, 
                                                  max_features='log2',  
                                                  n_estimators=50))])

# Getting accuracy score for training data from tuned model - worse
pipe2.fit(X_train, y_train)
y_pred = pipe2.predict(X_val)
accuracy = accuracy_score(y_pred, y_val)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.6835260115606936


In [157]:
# Stacked models
estimators = [('passive_clf', ExtraTreesClassifier(class_weight='balanced_subsample'))]
clf_stacked = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stacked.fit(X_train, y_train)
y_pred = clf_stacked.predict(X_val)
accuracy = accuracy_score(y_pred, y_val)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.6994219653179191
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [158]:
# Fit stacked models to 
clf_stacked.fit(X, y)
y_pred_stacked = clf_stacked.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [159]:
y_pred_stacked.shape

(2041,)

In [161]:
y_submission['target'] = pd.Series(y_pred_stacked)
# y_submission = y_submission.set_index('obs_id')

In [162]:
y_submission.to_csv('progress_report1_stacked_pred.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7b6df131-18b7-4fcc-a0ae-f6c2332f328e' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>