In [55]:
import functools
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sct
import seaborn as sns
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_blobs, make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE, f_classif, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [56]:
# Name of the label column to predict.
target = 'IN_TREINEIRO'

# Read both CSV files from the working directory, train first, then test.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [57]:
# Report (rows, columns) for each frame, one per line, train first.
print(train.shape, test.shape, sep="\n")

(13730, 167)
(4570, 43)


In [58]:
# Distinct values taken by the label column in the training frame.
train.loc[:, target].unique()

array([0, 1], dtype=int64)

In [59]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [60]:
# Model inputs: every column of the test frame except the label and the
# NU_INSCRICAO column (the latter is written to the answer file instead).
non_feature_columns = (target, 'NU_INSCRICAO')
features = [column for column in test.columns if column not in non_feature_columns]

In [61]:
# Split the feature columns by dtype, as parsed in the test frame, so each
# group is routed to its own preprocessing branch. Feature columns whose dtype
# is neither int64/float64 nor object end up in neither list.
feature_frame = test[features]
numeric_features = feature_frame.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = feature_frame.select_dtypes(include=['object']).columns.tolist()

# Apply the numeric pipeline to numeric columns and the categorical pipeline
# to object columns.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)


In [62]:
# Design matrices restricted to the same feature columns for both frames.
X_train, X_test = train[features], test[features]

# Label vector. Read through `target` rather than repeating the column name,
# so the label is defined in exactly one place in the notebook.
y_train = train[target]

In [66]:
# Full model: the column preprocessor followed by a random forest classifier.
# random_state is fixed because the forest is randomized; without a seed the
# predictions (and therefore the answer file) change on every re-run.
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier(random_state=42))])

# Fit on the labelled training data, then predict the label for the test rows.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Answer frame: the NU_INSCRICAO column of the test set plus the predicted
# label, built as a new frame so `test` itself is left untouched.
df_new = test[['NU_INSCRICAO']].assign(IN_TREINEIRO=y_pred)

# Write the answer file without the pandas index column.
df_new.to_csv('answer.csv', index=False)