# Using xgboost to classify data

Classification of galaxies, stars and quasars based on Data Release 14 (DR14) of the Sloan Digital Sky Survey (SDSS)

# Setting up data

In [1]:
import pandas as pd

# Load the CSV export (path is relative to the notebook's working directory)
# into a DataFrame that every later cell reads from.
data = pd.read_csv(filepath_or_buffer='./Skyserver_SQL2_27_2018 6_51_39 PM.csv')


In [2]:
# Preview the first five rows to check the columns that were loaded.
data.head(n=5)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.72237e+18,STAR,-0.000111,3306,54922,510
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.72237e+18,STAR,0.00059,3306,54922,512


In [2]:
# Split the table into features (X) and target (y).
# 'class' is the value to predict, so it is removed from the features together
# with the 'objid' and 'specobjid' columns; any object-dtype column that is
# still present afterwards is filtered out as well.
X = (
    data
    .drop(columns=['class', 'objid', 'specobjid'])
    .select_dtypes(exclude=['object'])
)
y = data['class']

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)


## Fitting the model
### Calculating accuracy
### Confusion matrix
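A confusion matrix tabulates how often each class was predicted for each actual class. In scikit-learn's `confusion_matrix(y_true, y_pred)`, each row represents the instances of an actual class and each column represents the instances of a predicted class (some references, including parts of the article linked below, use the transposed convention). Rows and columns follow the sorted order of the labels. The name stems from the fact that it makes it easy to see whether the model is confusing two classes (i.e. commonly mislabeling one as another).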
https://en.wikipedia.org/wiki/Confusion_matrix

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

# XGBClassifier in recent XGBoost releases rejects string class labels and
# expects integer labels 0..n_classes-1. Encode the labels for training and
# decode the predictions afterwards so that the evaluation below still works
# on the original class names.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

model = XGBClassifier()
model.fit(X_train, y_train_encoded, verbose=False)

# make predictions for test data and map them back to the class names
y_pred = label_encoder.inverse_transform(model.predict(X_test))

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# rows: actual class, columns: predicted class (labels in sorted order)
confusion_matrix(y_test, y_pred)


Accuracy: 99.12%


array([[1613,    9,    4],
       [  16,  274,    0],
       [   0,    0, 1384]])

## More literature

https://www.datacamp.com/community/tutorials/xgboost-in-python

https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/