# **Importing required libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mount Google Drive at /content/drive/ (Colab only) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Read the training CSV and the test CSV, in that order, from the mounted Drive folder.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/CodeFiles/NSSC/train_NoNTTqq.csv',
        '/content/drive/MyDrive/CodeFiles/NSSC/test_SxgqOdc.csv',
    )
)

In [None]:
# Test-set features: every column of the test table except the `id` column.
test_X = test_dataset.drop(columns=['id'])

In [None]:
# Preview the first rows of the test features.
test_X.head()

Unnamed: 0,ultraviolet_filter,green_filter,red_filter,near_infrared_filter,alpha,delta,redshift
0,19.63144,17.8884,16.45195,16.620047,336.501421,2.415351,0.00029
1,25.74819,22.1076,19.97196,19.179141,210.286161,-1.336858,0.513781
2,22.38767,20.85446,20.75418,20.073627,262.91477,46.025803,0.985297
3,22.03212,23.15455,21.86528,20.746343,146.381732,38.368224,0.712391
4,25.01815,24.53933,22.03248,22.377272,246.456081,30.515558,0.495552


In [None]:
# Row counts of the two tables as a (train, test) tuple.
tuple(len(frame) for frame in (train_dataset, test_dataset))

(134911, 89941)

# **Exploratory Analysis**

In [None]:
# Preview the first rows of the training table.
train_dataset.head()

Unnamed: 0,id,ultraviolet_filter,green_filter,red_filter,near_infrared_filter,alpha,delta,redshift,stellar
0,1,17.44385,15.71196,16.14848,15.647619,158.167937,29.746275,0.094857,1
1,2,22.02806,24.01481,21.16334,20.214615,145.916931,38.083063,0.361631,1
2,3,23.07242,21.79252,20.51945,18.159421,245.684677,49.908866,-6.5e-05,2
3,4,23.45985,23.41583,20.36645,20.220636,204.81275,33.137303,0.643375,1
4,5,23.89627,23.18005,21.12911,19.81847,209.254795,55.296589,0.486448,1


In [None]:
# Print column dtypes, non-null counts and memory usage of the training table.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134911 entries, 0 to 134910
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    134911 non-null  int64  
 1   ultraviolet_filter    134911 non-null  float64
 2   green_filter          134911 non-null  float64
 3   red_filter            134911 non-null  float64
 4   near_infrared_filter  134911 non-null  float64
 5   alpha                 134911 non-null  float64
 6   delta                 134911 non-null  float64
 7   redshift              134911 non-null  float64
 8   stellar               134911 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 9.3 MB


In [None]:
# Number of distinct values in each column, laid out as a single wide row.
train_dataset.nunique().to_frame().T

Unnamed: 0,id,ultraviolet_filter,green_filter,red_filter,near_infrared_filter,alpha,delta,redshift,stellar
0,134911,124479,123148,121505,133528,134888,134898,134869,3


In [None]:
# Count how many training rows belong to each `stellar` class.
# The column is named through `to_frame(name=...)` so the result does not
# depend on the name `value_counts()` gives its output, which differs between
# pandas releases ('stellar' before 2.0, 'count' from 2.0 on). Renaming the
# 'stellar' column afterwards silently does nothing on pandas >= 2.0.
occurrences = (
    train_dataset['stellar']
    .value_counts()
    .to_frame(name='Occurrences')
    .rename(index=str)  # class labels in the index become strings
)
occurrences

Unnamed: 0,Occurrences
1,86701
2,33121
3,15089


# **Building the Random Forest Classifier**


In [None]:
# Split the training table into the label (`stellar`) and the predictors
# (every column except `id` and `stellar`).
y = train_dataset['stellar']
X = train_dataset.drop(columns=['id', 'stellar'])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,ultraviolet_filter,green_filter,red_filter,near_infrared_filter,alpha,delta,redshift
0,17.44385,15.71196,16.14848,15.647619,158.167937,29.746275,0.094857
1,22.02806,24.01481,21.16334,20.214615,145.916931,38.083063,0.361631
2,23.07242,21.79252,20.51945,18.159421,245.684677,49.908866,-6.5e-05
3,23.45985,23.41583,20.36645,20.220636,204.81275,33.137303,0.643375
4,23.89627,23.18005,21.12911,19.81847,209.254795,55.296589,0.486448


In [None]:
# (rows, columns) of the feature matrix.
X.shape

(134911, 7)

## Training and Test Set Split

In [None]:
# Disabled: uncomment both lines to keep only the first 1349 rows of X and y.
#X = X.iloc[ : 1349]
#y = y.iloc[ : 1349]

In [None]:
# Hold out 20% of the labelled rows for validation. The classes are imbalanced,
# so stratify on y to keep the class proportions the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Seed the forest as well; without a seed, repeated runs build different trees
# and report different scores.
clf = RandomForestClassifier(random_state=1)

In [None]:
# Fit a min-max scaler (target range -1..1) on the training features only,
# then rescale both parts of the split with those training-set ranges.
# NOTE(review): the cells shown in this notebook fit and predict with the
# unscaled x_train / x_test, so these scaled arrays appear unused — confirm.
scaling = MinMaxScaler(feature_range=(-1, 1))
x_train_scaled = scaling.fit_transform(x_train)
x_test_scaled = scaling.transform(x_test)

## Hyperparameter Optimisation

In [None]:
# Pretty-print the classifier's current hyperparameters.
from pprint import pprint
pprint(clf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


## Tuning Using Random Search


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomised search.
# 'auto' is deliberately absent from max_features: for a classifier it meant
# the same as 'sqrt', so it only duplicated a candidate, and scikit-learn
# deprecated it in 1.1 and rejects it from 1.3 on.
hyperparameters = {'max_features':[None, 'sqrt', 'log2'],
                   'max_depth':[None, 1, 5, 10, 15, 20],
                   'min_samples_leaf': [1, 2, 4],
                   'min_samples_split': [2, 5, 10],
                   # 10, 20, ..., 100 trees
                   'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
                   'criterion': ['gini', 'entropy']}
# Sample 100 combinations and score each with 10-fold cross-validation,
# using all available cores.
rf_random = RandomizedSearchCV(clf, hyperparameters, n_iter = 100, cv = 10, verbose=2, random_state=123, n_jobs = -1)
rf_random.fit(x_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [None]:
# Best hyperparameter combination found by the randomised search.
rf_random.best_params_

## Tuning Using Grid Search

In [None]:
# Exhaustive search over a narrow grid: 81 combinations, each scored with
# 10-fold cross-validation on all available cores.
# NOTE(review): the grid is presumably centred on the random-search result — confirm.
hyperparameters = dict(
    max_features=[None],
    max_depth=[14, 15, 16],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[4, 5, 6],
    n_estimators=[90, 100, 110],
    criterion=['entropy'],
)
rf_grid = GridSearchCV(
    estimator=clf,
    param_grid=hyperparameters,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
rf_grid.fit(x_train, y_train)

In [None]:
# Best hyperparameter combination found by the grid search.
rf_grid.best_params_

## Training the Classifier


In [None]:
# Configure the classifier with the final hyperparameters.
# NOTE(review): these values are written out by hand rather than read from
# rf_grid.best_params_ — confirm they match the search result.
clf.set_params(
    n_estimators=100,
    criterion='entropy',
    max_depth=14,
    max_features=None,
    min_samples_split=5,
    min_samples_leaf=2,
)

In [None]:
# Fit the forest on the training part of the split and predict the held-out part.
y_pred = clf.fit(x_train, y_train).predict(x_test)

## Accuracy Score and F1 Score on the held-out split (20% of the training dataset)



In [None]:
# Score the held-out predictions: overall accuracy and one F1 value per class
# (average=None), both printed as percentages.
sortedlabels = clf.classes_
accscore = accuracy_score(y_true=y_test, y_pred=y_pred)
f1score = f1_score(y_true=y_test, y_pred=y_pred, average=None)
print(accscore * 100, f1score * 100, sep='\n')

## Predictions for the test dataset (it has no labels, so no accuracy or F1 score is computed)



In [None]:
# Predict a class for every row of the test features.
y_pred_2 = clf.predict(test_X)

In [None]:
# Shape of the test-set prediction array.
y_pred_2.shape

In [None]:
# Wrap the predictions in a one-column frame. The column is named after the
# training target (`stellar`); without an explicit name it would be the
# integer 0 and the exported file's header would read "id,0".
test_result_df = pd.DataFrame(y_pred_2, columns=['stellar'])
test_result_df.shape

In [None]:
# Inspect the id column of the test table.
test_dataset['id']

In [None]:
# Put the test ids and the prediction column side by side, keeping only index
# labels present in both (inner join on the row index), then show the table.
frames = [test_dataset['id'], test_result_df]
result = pd.concat(objs=frames, axis='columns', join='inner')
display(result)

In [None]:
# Write the id/prediction table as CSV to a file named 'resultss' in the
# working directory, without the row index.
# NOTE(review): the file name has no .csv extension — confirm this is intended.
result.to_csv('resultss', index = False)

In [None]:
# Display the raw test-set prediction array.
y_pred_2