In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler

col_names = ["Mean of the integrated profile", "Standard deviation of the integrated profile", "Excess kurtosis of the integrated profile",
            "Skewness of the integrated profile", "Mean of the DM-SNR curve", "Standard deviation of the DM-SNR curve",
            "Excess kurtosis of the DM-SNR curve", "Skewness of the DM-SNR curve", "target_class"]

df_train = pd.read_csv("pulsar_data_train.csv", header = 0, names=col_names)
df_test = pd.read_csv("pulsar_data_test.csv", header = 0, names=col_names)

df_aim = df_test

features_train, features_test, target_train, target_test = train_test_split(df_train.iloc[:,:-1], df_train.iloc[:,-1:], test_size = 0.3)

In [17]:
df_train.describe()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
count,12528.0,12528.0,10793.0,12528.0,12528.0,11350.0,12528.0,11903.0,12528.0
mean,111.041841,46.521437,0.478548,1.778431,12.674758,26.351318,8.333489,105.525779,0.092034
std,25.672828,6.801077,1.064708,6.20845,29.61323,19.610842,4.535783,107.399585,0.289085
min,5.8125,24.772042,-1.738021,-1.791886,0.213211,7.370432,-3.13927,-1.976976,0.0
25%,100.871094,42.362222,0.024652,-0.188142,1.910535,14.404353,5.803063,35.199899,0.0
50%,115.183594,46.931022,0.223678,0.203317,2.792642,18.412402,8.451097,83.126301,0.0
75%,127.109375,50.979103,0.473125,0.932374,5.413253,28.337418,10.727927,139.99785,0.0
max,189.734375,91.808628,8.069522,68.101622,222.421405,110.642211,34.539844,1191.000837,1.0


In [40]:
target_train.isna().sum()

target_class    0
dtype: int64

In [77]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline

num_cols_list = ['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve']

cat_cols_list = []

preprocessor = ColumnTransformer(transformers=[("knn_imputer", KNNImputer(n_neighbors=5,
                                                                         weights="uniform"), num_cols_list),
                                              ("scaler", StandardScaler(), num_cols_list)],
                                remainder = "passthrough")

pipeline_pre_rfc = Pipeline(steps = [("preprocessor", preprocessor),
                         ("rfc", RandomForestClassifier(n_estimators = 10,
                                                         max_depth = 5,
                                                         random_state = 42,
                                                         n_jobs = -1))])

pd.DataFrame(preprocessor.fit_transform(features_train))

#pd.DataFrame(preprocessor.transform(features_test))

#pipeline_pre_rfc.fit_transform(features_train, target_train)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,48.726562,34.174807,3.856359,19.611846,21.820234,52.397937,2.777642,7.233265,-2.410885,-1.800605,3.124430,2.808597,0.303949,1.297636,-1.213156,-0.911118
1,13.726562,27.718652,6.740331,49.216050,96.749164,73.919704,0.291496,-0.751527,-3.769891,-2.744672,5.799140,7.482895,2.834928,2.380954,-1.759569,-0.985706
2,117.265625,50.148137,0.233556,0.119018,19.483278,55.801223,2.686856,5.657202,0.250402,0.535134,-0.235501,-0.269185,0.225010,1.468943,-1.233110,-0.925840
3,121.359375,45.020782,0.134527,0.245245,9.222408,38.505123,4.180649,16.250250,0.409357,-0.214626,,-0.249255,-0.121585,0.598328,-0.904799,-0.826888
4,119.148438,52.481409,-0.026926,-0.425802,1.151338,11.087733,16.395983,357.077023,0.323509,0.876322,-0.477082,-0.355208,-0.394213,,1.779924,2.356857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8764,125.796875,44.220393,0.196500,0.733665,4.193144,20.745116,6.364105,47.437481,0.581660,-0.331665,-0.269868,-0.172136,-0.291466,-0.295638,-0.424913,-0.535561
8765,127.773438,53.359514,0.058327,-0.549112,11.818562,40.746963,3.650992,12.912732,0.658407,1.004725,-0.398015,-0.374678,-0.033891,,-1.021209,-0.858065
8766,106.875000,47.571328,0.199440,0.284964,3.079431,20.984455,8.427475,78.259366,-0.153053,0.158333,-0.267141,-0.242983,-0.329085,-0.283591,0.028581,-0.247646
8767,126.296875,46.356189,0.178590,0.437049,2.011706,12.447073,10.646090,164.083299,0.601074,-0.019353,-0.286478,-0.218970,-0.365151,-0.713328,0.516195,0.554056


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [None]:
# Setting up a preproccesor for flexability for the future if more column types were to be added
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

num_cols_list = ['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve']

cat_cols_list = []

preprocessor = ColumnTransformer(transformers=[("knn_imputer", KNNImputer(n_neighbors=5,
                                                                         weights="uniform"), num_cols_list),
                                              ("scaler", StandardScaler(), num_cols_list)])

pipeline_pre_rfc = Pipeline([("preprocessor", preprocessor),
                           ("rfc", RandomForestClassifier(n_estimators = 10,
                                                         max_depth = 5,
                                                         random_state = 42,
                                                         n_jobs = -1))])

preprocessor.fit_transform(features_train)
#clean_features_test = pd.DataFrame(preprocessor.transform(features_test), columns=col_names)

pipeline_pre_rfc.fit(features_train, target_train)