## Sample notebook for testing the `yctmodel` package

In [1]:
## The next cell loads sample data so you can see how the package works.
## Change the path there if you want to work with different data.

In [2]:
import pandas as pd
import numpy as np

# Load the sample asteroid training set. Point TRAINING_CSV at another file
# to run the rest of the notebook on different data.
TRAINING_CSV = 'raw_data/asteroid_training.csv'
data = pd.read_csv(TRAINING_CSV)
data.head()


Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,26150198,Object_26150198,0.031956,0.071456,75374.759095,1070689.0,Earth,False,24.6,False
1,7025688,Object_7025688,0.133216,0.297879,33274.11479,15982170.0,Earth,False,21.5,False
2,43368461,Object_43368461,0.043507,0.097284,74702.349802,2330585.0,Earth,False,23.93,False
3,41099354,Object_41099354,0.012149,0.027167,33078.313997,45611780.0,Earth,False,26.7,False
4,25572576,Object_25572576,0.058151,0.130029,51956.093518,3613123.0,Earth,False,23.3,False


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature columns are everything except the 'hazardous' target; split them
# by dtype once so both transformers share the same selection.
feature_frame = data.drop(columns=['hazardous'])
numeric_columns = feature_frame.select_dtypes(include=np.number).columns
categorical_columns = feature_frame.select_dtypes(include='object').columns
# NOTE(review): identifier-like columns ('Unnamed: 0', 'id', 'name') are part of
# these selections and therefore used as features — confirm this is intended.
# NOTE(review): a bool-dtype column (possibly 'sentry_object') matches neither
# selector and would be dropped by ColumnTransformer's default remainder — verify.

# Numeric pipeline: fill missing values with the median, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent value,
# then one-hot encode (categories unseen during fit are ignored at transform time).
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Column transformer routing each column group to its pipeline.
complete_pipe = ColumnTransformer(transformers=[
    ('num', num_pipe, numeric_columns),
    ('cat', cat_pipe, categorical_columns),
])

In [6]:
# Create a ModelSelector instance. Arguments:
#   data          - the whole dataset (features and target together)
#   target        - name of the target column
#   complete_pipe - the preprocessing pipeline built above
#   task          - 'class' for classification or 'reg' for regression
#   i             - the number of models you want to use
#   precision     - not explained in this notebook; see the yctmodel documentation
# Depending on these parameters the run may take more or less time.
from yctmodel import ModelSelector

model_selector = ModelSelector(
    data,
    target='hazardous',
    complete_pipe=complete_pipe,
    task='class',
    i=2,
    precision=0.2,
)

# Now just run the start function.
model_selector.start()


ImportError: cannot import name 'ModelSelector' from partially initialized module 'yctmodel' (most likely due to a circular import) (/home/yc4923/GEMS/Personal/Repos/thames/yctmodel/__init__.py)

In [10]:
# To get the pipeline back from the selector, call get_pipeline().
# Ending the cell with the variable name displays it in the notebook.
our_model = model_selector.get_pipeline()
our_model

In [11]:
# Now that you have your model, it needs to be fitted/trained on your training
# data and scored. For example, evaluate() reports the scores; in the saved run
# it displayed a table with 'Score' and 'F1 Score' columns.
# NOTE(review): whether evaluate() fits the model itself is not visible here — confirm.
model_selector.evaluate()



       Score  F1 Score
0  90.430622  85.88637


In [7]:
# To predict on data without labels, load it and pass it to predict_unknown().
# In the saved run the result was the input table with a 'Predicted' column added.
UNKNOWN_CSV = 'raw_data/unknown_asteroids.csv'
unknown_data = pd.read_csv(UNKNOWN_CSV)
model_selector.predict_unknown(unknown_data)

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,Predicted
0,769101,Object_769101,0.080270,0.179490,13424.551015,4.402380e+07,Earth,False,22.60,False
1,2678807,Object_2678807,0.160900,0.359782,42382.349410,2.360300e+07,Earth,False,21.09,False
2,49457195,Object_49457195,0.001011,0.002260,34025.400503,4.754401e+07,Earth,False,32.10,False
3,16313339,Object_16313339,0.140138,0.313357,31231.743804,6.324553e+07,Earth,False,21.39,False
4,28929909,Object_28929909,0.066766,0.149293,12905.227080,3.856273e+07,Earth,False,23.00,False
...,...,...,...,...,...,...,...,...,...,...
1495,48611083,Object_48611083,0.020350,0.045503,59142.861233,3.919877e+07,Earth,False,25.58,False
1496,4423994,Object_4423994,0.231502,0.517654,42876.660182,4.656072e+07,Earth,False,20.30,False
1497,5718958,Object_5718958,0.025856,0.057815,61043.549488,5.129316e+07,Earth,False,25.06,False
1498,38339775,Object_38339775,0.319562,0.714562,48822.182935,2.540669e+07,Earth,False,19.60,False


In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from yctmodel import ModelSelector
# Tune a hand-built pipeline: the shared preprocessing step followed by a
# RandomForestClassifier with default settings. auto_tuning() receives the
# pipeline together with the full dataset; in the saved run it printed the
# best parameters found for the classifier.
test_pipe = make_pipeline(complete_pipe, RandomForestClassifier())

tuner = ModelSelector(
    data,
    target='hazardous',
    complete_pipe=complete_pipe,
    task='class',
    i=2,
)
tuner.auto_tuning(test_pipe, data)

Best parameters for RandomForestClassifier: {'class_weight': None, 'max_depth': 21, 'max_features': 7, 'min_samples_split': 3, 'n_estimators': 314}
