In [None]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from imblearn.over_sampling import SMOTE

class DatasetName:
    """
    Holds the star-classification data frame and turns it into model-ready splits.

    Instances are meant to be created through :meth:`load`; :meth:`prepare`
    then produces the train / test features and labels.
    """

    train_data: pd.DataFrame
    test_data: pd.DataFrame

    # Prefer the ``load`` factory method over calling the constructor directly.
    def __init__(self, train_data: pd.DataFrame, test_data: pd.DataFrame):
        self.train_data = train_data
        self.test_data = test_data

    @classmethod
    def load(cls, path: str = "data/star_classification_train.csv") -> "DatasetName":
        """
        Read the training CSV and wrap it in a dataset object.

        :param path: location of the CSV file; the default is the path this
            notebook has always read from
        :return: an instance of ``cls`` whose ``train_data`` is the CSV content
            and whose ``test_data`` is ``None`` (the test split is only
            produced later by :meth:`prepare`)
        """
        # Use ``cls`` rather than the hard-coded class name so subclasses get
        # instances of their own type.
        return cls(pd.read_csv(path), None)

    def prepare(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Encode the labels, drop identifier columns, split and balance the data.

        Steps:

        1. map ``class`` to integers: ``"GALAXY"`` -> 0, ``"STAR"`` -> 1,
           anything else -> 2;
        2. drop the identifier / positional columns listed below;
        3. make a stratified 67 % / 33 % train / test split;
        4. oversample the training part only with SMOTE.

        ``self.train_data`` is left untouched, so the method can be called
        repeatedly and always yields the same result.

        :return: (train_x, train_y, test_x, test_y); the feature sets are
            returned in whatever container ``train_test_split`` / ``SMOTE``
            hand back for the given input
        """
        # Work on a copy: encoding the labels in place would corrupt the stored
        # frame, and a second call would then map every label to 2.
        df = self.train_data.copy()

        # Unknown or missing labels fall through to 2, as before.
        label_codes = {"GALAXY": 0, "STAR": 1}
        df["class"] = df["class"].map(label_codes).fillna(2).astype("int64")

        # NOTE: a LocalOutlierFactor-based filter (dropping the rows below the
        # 10 % quantile of the outlier score) used to sit here; it was disabled.

        df = df.drop(
            columns=[
                "obj_ID", "alpha", "delta", "run_ID",
                "rerun_ID", "cam_col", "field_ID", "fiber_ID",
            ]
        )

        x = df.drop(columns=["class"])
        y = df["class"].values

        # Split BEFORE oversampling so the held-out set contains only real
        # observations and no synthetic sample is interpolated from test rows.
        train_x, test_x, train_y, test_y = train_test_split(
            x, y, test_size=0.33, random_state=1729, stratify=y
        )

        # Balance the classes on the training portion only.
        train_x, train_y = SMOTE(random_state=42).fit_resample(train_x, train_y)

        return train_x, train_y, test_x, test_y

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The dataset class defined in this notebook is DatasetName; the previous call
# referenced a name that does not exist here and failed on a fresh kernel.
train_x, train_y, test_x, test_y = DatasetName.load().prepare()

# Seed the forest so the reported accuracy is the same on every run.
random_forest = RandomForestClassifier(
    n_estimators=20, criterion="gini", max_depth=7, random_state=42
)
random_forest.fit(train_x, train_y)

# Share of held-out samples whose predicted class matches the true class.
print("accuracy: ", np.mean(test_y == random_forest.predict(test_x)))

accuracy:  0.9750386569472057


In [None]:
# Grid search over tree depth and split criterion for the random forest,
# scored with 5-fold cross-validation on the training split.

param_grid = {
    'max_depth': [4, 5, 6, 7],
    'criterion': ['gini', 'entropy'],
}

# A fixed random_state makes every candidate forest deterministic, so the
# selected parameters do not change between runs of the notebook.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid.fit(train_x, train_y)

grid.best_params_