In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [10]:
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from sklearn.linear_model import LinearRegression, LogisticRegression

In [3]:
# Train and test data paths will be available as env variables during evaluation
TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
TEST_DATA_PATH = os.getenv("TEST_DATA_PATH")

In [4]:
# Train and test data paths will be available as env variables during evaluation
TRAIN_DATA_PATH = './143e2751-7e99-4d17-bb9b-f0faec66e4b9_train.csv'
TEST_DATA_PATH = './83f63b01-14ae-450d-98cb-328e9467162f_test.csv'

In [5]:
# Prepare the training data
train_data = pd.read_csv(TRAIN_DATA_PATH)
X_train, y_train = train_data.iloc[:,:-1], train_data.iloc[:,-1]

# Train the model
classifier = SVC(gamma='auto')
classifier.fit(X_train, y_train)

# Predict on the test set
test_data = pd.read_csv(TEST_DATA_PATH)
submission = classifier.predict(test_data)
submission = pd.DataFrame(submission)

# Export the prediction as submission.csv
submission.to_csv('submission.csv', header=['class'], index=False) 

In [6]:
train_data.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att16,att17,att18,att19,att20,att21,att22,att23,att24,class
0,1,1,1,1,1,0,1,1,0,1,...,1,1,1,1,0,0,1,1,0,8
1,1,1,1,1,1,1,1,1,0,1,...,1,0,0,0,1,1,1,0,0,9
2,0,1,1,0,0,1,0,1,1,1,...,1,0,0,1,1,1,1,0,1,7
3,1,1,0,1,1,1,1,0,1,1,...,0,0,0,0,0,1,1,1,1,6
4,1,1,1,0,1,1,1,1,0,1,...,0,1,0,0,0,1,1,1,1,0


In [7]:
test_data.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att15,att16,att17,att18,att19,att20,att21,att22,att23,att24
0,0,0,1,0,0,1,0,1,1,1,...,0,0,0,0,0,1,0,1,0,0
1,1,1,1,1,1,1,1,1,0,1,...,0,1,1,0,0,1,1,0,0,1
2,0,1,1,0,1,1,1,1,0,0,...,1,0,0,0,0,0,1,1,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0
4,1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,0,0,0,1


In [11]:
train_X, val_X, train_y, val_y = train_test_split(X_train,y_train,test_size=0.3)

In [12]:
classifier = SVC(gamma='auto')
classifier.fit(train_X, train_y)

preds_y = classifier.predict(val_X)

print(classification_report(val_y,preds_y))

              precision    recall  f1-score   support

           0       0.70      0.83      0.76       221
           1       0.78      0.79      0.78       247
           2       0.80      0.85      0.82       220
           3       0.64      0.62      0.63       200
           4       0.75      0.79      0.77       204
           5       0.80      0.75      0.77       244
           6       0.80      0.71      0.76       223
           7       0.76      0.73      0.75       248
           8       0.62      0.62      0.62       213
           9       0.64      0.60      0.62       230

    accuracy                           0.73      2250
   macro avg       0.73      0.73      0.73      2250
weighted avg       0.73      0.73      0.73      2250



In [15]:
for num_nodes in [10,20,30,35,40,50,100,200,300,400,500,600,700,800,900]:
    classifier = DecisionTreeClassifier(max_leaf_nodes=num_nodes)
    classifier.fit(train_X, train_y)

    preds_y = classifier.predict(val_X)
    print(num_nodes)
    print(classification_report(val_y,preds_y))

10
              precision    recall  f1-score   support

           0       0.80      0.65      0.72       221
           1       0.75      0.71      0.73       247
           2       0.66      0.79      0.72       220
           3       0.57      0.62      0.59       200
           4       0.76      0.68      0.72       204
           5       0.73      0.76      0.75       244
           6       0.73      0.73      0.73       223
           7       0.67      0.59      0.63       248
           8       0.64      0.65      0.64       213
           9       0.57      0.64      0.60       230

    accuracy                           0.68      2250
   macro avg       0.69      0.68      0.68      2250
weighted avg       0.69      0.68      0.68      2250

20
              precision    recall  f1-score   support

           0       0.80      0.65      0.72       221
           1       0.79      0.71      0.75       247
           2       0.94      0.73      0.82       220
           3      

In [19]:
for num_nodes in [10,20,30,35,40,50,100,200,300,400,500,600,700,800,900,1000,1500,2000,5000]:
    classifier = DecisionTreeRegressor(max_leaf_nodes=num_nodes)
    classifier.fit(train_X, train_y)

    preds_y = np.around(classifier.predict(val_X))
    print(num_nodes)
    print(classification_report(val_y,preds_y))

10
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       221
           1       0.00      0.00      0.00       247
           2       0.26      0.80      0.39       220
           3       0.00      0.00      0.00       200
           4       0.37      0.75      0.49       204
           5       0.00      0.00      0.00       244
           6       0.23      0.75      0.36       223
           7       0.00      0.00      0.00       248
           8       0.00      0.00      0.00       213
           9       0.00      0.00      0.00       230

    accuracy                           0.22      2250
   macro avg       0.09      0.23      0.12      2250
weighted avg       0.08      0.22      0.12      2250

20
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       221
           1       0.01      0.01      0.01       247
           2       0.40      0.73      0.51       220
           3      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


400
              precision    recall  f1-score   support

           0       0.53      0.29      0.37       221
           1       0.27      0.18      0.22       247
           2       0.40      0.76      0.52       220
           3       0.49      0.32      0.38       200
           4       0.49      0.73      0.58       204
           5       0.62      0.67      0.65       244
           6       0.54      0.65      0.59       223
           7       0.55      0.54      0.54       248
           8       0.29      0.39      0.34       213
           9       0.45      0.10      0.17       230

    accuracy                           0.46      2250
   macro avg       0.46      0.46      0.44      2250
weighted avg       0.46      0.46      0.44      2250

500
              precision    recall  f1-score   support

           0       0.61      0.51      0.56       221
           1       0.55      0.26      0.36       247
           2       0.43      0.79      0.56       220
           3    

In [20]:
classifier = RandomForestClassifier()
classifier.fit(train_X, train_y)

preds_y = classifier.predict(val_X)
print(classification_report(val_y,preds_y))

              precision    recall  f1-score   support

           0       0.72      0.77      0.75       221
           1       0.78      0.81      0.80       247
           2       0.80      0.80      0.80       220
           3       0.59      0.67      0.63       200
           4       0.80      0.81      0.80       204
           5       0.76      0.76      0.76       244
           6       0.76      0.73      0.74       223
           7       0.78      0.72      0.75       248
           8       0.66      0.63      0.64       213
           9       0.65      0.60      0.62       230

    accuracy                           0.73      2250
   macro avg       0.73      0.73      0.73      2250
weighted avg       0.73      0.73      0.73      2250



In [21]:
classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [22]:
for num_est in [10,20,30,50,100,150,200,250,300,400,500,600]:
    classifier = RandomForestClassifier(n_estimators=num_est)
    classifier.fit(train_X, train_y)

    preds_y = classifier.predict(val_X)
    print(num_est)
    print(classification_report(val_y,preds_y))

10
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       221
           1       0.77      0.79      0.78       247
           2       0.77      0.86      0.81       220
           3       0.55      0.65      0.59       200
           4       0.71      0.81      0.76       204
           5       0.72      0.69      0.71       244
           6       0.69      0.65      0.67       223
           7       0.77      0.69      0.73       248
           8       0.63      0.54      0.58       213
           9       0.65      0.50      0.57       230

    accuracy                           0.70      2250
   macro avg       0.69      0.69      0.69      2250
weighted avg       0.70      0.70      0.69      2250

20
              precision    recall  f1-score   support

           0       0.69      0.79      0.74       221
           1       0.78      0.81      0.79       247
           2       0.77      0.84      0.80       220
           3      

600
              precision    recall  f1-score   support

           0       0.72      0.77      0.74       221
           1       0.80      0.80      0.80       247
           2       0.82      0.81      0.82       220
           3       0.60      0.64      0.62       200
           4       0.79      0.83      0.81       204
           5       0.77      0.77      0.77       244
           6       0.76      0.73      0.74       223
           7       0.79      0.74      0.76       248
           8       0.66      0.63      0.65       213
           9       0.64      0.61      0.62       230

    accuracy                           0.74      2250
   macro avg       0.73      0.73      0.73      2250
weighted avg       0.74      0.74      0.73      2250



In [23]:
for num_est in [10,20,30,50,100,150,200,250,300,400,500,600]:
    classifier = RandomForestRegressor(n_estimators=num_est)
    classifier.fit(train_X, train_y)

    preds_y = np.around(classifier.predict(val_X))
    print(num_est)
    print(classification_report(val_y,preds_y))

10
              precision    recall  f1-score   support

           0       0.76      0.11      0.20       221
           1       0.34      0.17      0.22       247
           2       0.42      0.67      0.52       220
           3       0.21      0.23      0.22       200
           4       0.32      0.58      0.41       204
           5       0.33      0.42      0.37       244
           6       0.28      0.57      0.38       223
           7       0.24      0.23      0.23       248
           8       0.32      0.22      0.26       213
           9       0.47      0.03      0.06       230

    accuracy                           0.32      2250
   macro avg       0.37      0.32      0.29      2250
weighted avg       0.37      0.32      0.29      2250

20
              precision    recall  f1-score   support

           0       0.80      0.14      0.25       221
           1       0.40      0.23      0.30       247
           2       0.44      0.68      0.54       220
           3      

600
              precision    recall  f1-score   support

           0       0.80      0.05      0.10       221
           1       0.26      0.13      0.18       247
           2       0.41      0.70      0.51       220
           3       0.19      0.19      0.19       200
           4       0.38      0.65      0.48       204
           5       0.31      0.40      0.35       244
           6       0.28      0.61      0.38       223
           7       0.20      0.21      0.21       248
           8       0.21      0.12      0.15       213
           9       0.60      0.01      0.03       230

    accuracy                           0.30      2250
   macro avg       0.36      0.31      0.26      2250
weighted avg       0.36      0.30      0.26      2250



In [24]:
classifier = RandomForestClassifier(n_estimators=200)
classifier.fit(train_X, train_y)

preds_y = np.around(classifier.predict(val_X))
print(classification_report(val_y,preds_y))

              precision    recall  f1-score   support

           0       0.71      0.78      0.75       221
           1       0.78      0.79      0.79       247
           2       0.83      0.82      0.83       220
           3       0.62      0.66      0.64       200
           4       0.79      0.82      0.81       204
           5       0.77      0.77      0.77       244
           6       0.77      0.74      0.75       223
           7       0.78      0.73      0.75       248
           8       0.67      0.64      0.65       213
           9       0.64      0.62      0.63       230

    accuracy                           0.74      2250
   macro avg       0.74      0.74      0.74      2250
weighted avg       0.74      0.74      0.74      2250

