In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# **Get Data** #

In [3]:
# Read Data
# Two CSV files are loaded: `data` (described by the author as already pre-processed) and
# `dataRaw` (the raw table that the following cells clean and label).
# NOTE: `data` is only used for the histogram cell below before it is reassigned from `dataRaw`.
data = pd.read_csv('../input/star-categorization-giants-and-dwarfs/Star39552_balanced.csv') # pre-processed data
dataRaw = pd.read_csv('/kaggle/input/star-categorization-giants-and-dwarfs/Star99999_raw.csv') # initial (raw) data
dataRaw.info()  # review column dtypes and non-null counts

#  **EDA** #

In [4]:
# Coerce the measurement columns to the smallest fitting float dtype.
# Entries that cannot be parsed as numbers become NaN (errors='coerce').
for column in ('Vmag', 'Plx', 'e_Plx', 'B-V'):
    dataRaw[column] = pd.to_numeric(dataRaw[column], downcast='float', errors='coerce')
dataRaw.info()

### Missing Values ###

In [5]:
# Count missing cells per column, then report the grand total.
missingValueCount = dataRaw.isnull().sum()
print('Num_null: ' , missingValueCount.sum() )

# Two views of the raw table: only the complete rows, and all rows with gaps filled by 0.
dataLabel = dataRaw.dropna()
dataUnlabel = dataRaw.fillna(0)

print( dataLabel.shape , dataRaw.shape , dataUnlabel.shape )

# Total number of cells (rows * columns). `DataFrame.size` is used instead of `np.product`,
# which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
totalData = dataRaw.size
percentMissing=missingValueCount.sum()/totalData

print('Percentage Missing:', '{: .3%}'.format(percentMissing))
# Continue the analysis with the complete rows only.
dataRaw = dataLabel

In [6]:
# Remove the 'Unnamed: 0' column, then show summary statistics of the remaining numeric columns.
dataRaw = dataRaw.drop(columns=['Unnamed: 0'])
dataRaw.describe()

In [7]:
data.hist(bins=50 , figsize=(20,15) ) # one 50-bin histogram per numeric column of `data`

## Pre-Processing ##

In [8]:
# Derived columns.
# Rows with zero parallax are removed first: Plx is used as a divisor and inside a logarithm.
data = dataRaw.loc[dataRaw['Plx'] != 0].reset_index(drop=True)

data = data.assign(
    # Magnitude derived from Vmag and |Plx|; np.log10 is the base-10 logarithm.
    Amag=lambda d: d['Vmag'] + 5 * (np.log10(np.abs(d['Plx'])) + 1),
    # Error propagation of e_Plx through the formula above (np.log is the natural logarithm).
    e_Amag=lambda d: 5 * d['e_Plx'] / (d['Plx'] * np.log(10)),
    # Relative errors of the derived magnitude and of the parallax.
    **{
        'eA/A': lambda d: d['e_Amag'] / d['Amag'],
        'eP/P': lambda d: d['e_Plx'] / d['Plx'],
    },
)

dataRaw = data
dataRaw.head()

In [9]:
#dataRaw = dataRaw.drop('Unnamed: 0', axis=1)
# Start the target column as a copy of the 'SpType' strings; a later cell recodes it to numbers.
dataRaw['TargetClass'] = dataRaw['SpType']

dataRaw = dataRaw.reset_index(drop=True) # re-index because rows were dropped earlier
dataRaw.head() 

In [10]:
# Check column dtypes and non-null counts now that the derived columns and TargetClass exist.
dataRaw.info()

## Target Class: Giants or Dwarfs ##

In [11]:
# Target processing: recode each 'SpType' string as giant (1), dwarf (0) or unclassified (9).


def _classify_sp_type(sp_type):
    """Return the numeric target class for one spectral-type string.

    Rules, applied in order:
      * contains "VI" (which also covers "VII")        -> 0 (dwarf)
      * contains "I" (I, II, III, IV)                  -> 1 (giant)
      * contains "V" and one of the letters G/K/M/L/T  -> 0 (dwarf)
      * anything else                                  -> 9 (unclassified)

    The "VI"/"VII" test must run before the "I" test, otherwise it can never match.
    Each letter is tested individually: an expression such as ('G' or 'K') in s only
    ever tests 'G', because `or` returns its first truthy operand.
    """
    sp_type = str(sp_type)
    if 'VI' in sp_type:
        return 0
    if 'I' in sp_type:
        return 1
    if 'V' in sp_type and any(letter in sp_type for letter in 'GKMLT'):
        return 0
    return 9


# Vectorised replacement for a row-by-row .loc loop.
dataRaw['TargetClass'] = dataRaw['TargetClass'].map(_classify_sp_type)

dataRaw.head()

In [12]:
# Bar chart of the number of rows per target class.
# The Series is passed as `x=` explicitly: seaborn 0.12+ treats the first positional
# argument as `data`, so a positional Series no longer produces per-class counts.
sns.countplot(x=dataRaw['TargetClass'])

In [13]:
# Number of rows in each target class (1 = giant, 0 = dwarf, 9 = unclassified).
dataRaw['TargetClass'].value_counts()

### Select Only Giants and Dwarfs ###

In [14]:
# Keep only the rows labelled giant (1) or dwarf (0); the placeholder class 9 is filtered out
# with a boolean mask.
df = dataRaw.loc[dataRaw['TargetClass'] != 9]

## Balance ##

In [15]:
# Split the set by class and down-sample the larger class to the size of the smaller one.

dfGiants = df[df.TargetClass == 1]
dfDwarfs = df[df.TargetClass == 0]
num_of_giant = dfGiants.shape[0]
num_of_dwarf = dfDwarfs.shape[0]

from sklearn.utils import resample

# Sampling without replacement fails when n_samples exceeds the available rows, so the class
# to shrink is chosen from the actual counts instead of assuming giants are the majority.
if num_of_giant >= num_of_dwarf:
    dfGiantsDownSample = resample(dfGiants,
                                  replace=False,  # sample without replacement
                                  n_samples=num_of_dwarf,
                                  random_state=1
                                  )
    dfDownSampled = pd.concat([dfGiantsDownSample, dfDwarfs])
else:
    dfGiantsDownSample = dfGiants  # giants are the minority: keep all of them
    dfDwarfsDownSample = resample(dfDwarfs,
                                  replace=False,  # sample without replacement
                                  n_samples=num_of_giant,
                                  random_state=1
                                  )
    dfDownSampled = pd.concat([dfGiantsDownSample, dfDwarfsDownSample])

In [16]:
# Verify the balance visually and numerically.
# `x=` is explicit because seaborn 0.12+ treats the first positional argument as `data`.
sns.countplot(x=dfDownSampled['TargetClass'])
print( dfDownSampled['TargetClass'].value_counts())

In [17]:
# Give the concatenated, balanced frame a fresh 0..n-1 index and inspect its columns.
dfBalanced= dfDownSampled.reset_index(drop=True)
dfBalanced.info()

In [18]:
# Shuffle the rows so the two classes are interleaved. A fixed seed makes the shuffle, and
# therefore the later train/test split and model scores, reproducible across runs.
dataStar = dfBalanced.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle
dfCopyStar = dataStar.copy()


## Visualization ##

In [19]:
%matplotlib inline

sns.set()
# Parallax against Amag, coloured by target class.
# NOTE: the original author flagged an unspecified problem with colouring by TargetClass.
ax = dataStar.plot.scatter(x="Plx", y="Amag", cmap='viridis', c='TargetClass', s=5)
ax.set_xlabel("Plx")
# No plt.figure() before show(): it would only open a second, empty figure.
plt.show()


In [20]:
%matplotlib inline
sns.set()
# B-V against Amag, coloured by target class.
# NOTE: the original author flagged an unspecified problem with colouring by TargetClass.
ax = dataStar.plot.scatter(x="B-V", y="Amag", cmap='viridis', c='TargetClass', s=5)
ax.set_xlabel("B-V")  # label matches the plotted column
# No plt.figure() before show(): it would only open a second, empty figure.
plt.show()



In [21]:
# Pairwise scatter plots of the target and the main attributes.

from pandas.plotting import scatter_matrix

attributes = ['TargetClass','Amag' , 'Vmag' , 'Plx' , 'B-V']

# Convert TargetClass to a numeric dtype; values that cannot be parsed become NaN.
dataStar['TargetClass'] = pd.to_numeric(dataStar['TargetClass'] , downcast='float' , errors='coerce')

scatter_matrix(dataStar[attributes] , figsize=(12, 8))
# Cell output: NaN count per column, which also reveals any NaN introduced by the coercion above.
dataStar.isnull().sum()


## Linear Correlation ##

In [22]:
# Pairwise correlation between the numeric columns, then each column's correlation with the
# target, strongest positive first. Non-numeric columns (e.g. the 'SpType' strings) are
# excluded explicitly because DataFrame.corr() raises on them in pandas 2.0+.
corr_matrix = dataStar.select_dtypes(include='number').corr()
corr_matrix["TargetClass"].sort_values(ascending=False)

## Data Cleaning ##

In [23]:
#dataStar = dataStar.drop(['Unnamed: 0'], axis =1)
# Drop 'SpType': TargetClass was initialised as a copy of this column, so it is removed
# before the remaining columns are used as features.
dataStar = dataStar.drop(['SpType'], axis =1)



# **Data Processing** #

In [24]:
# Check the columns and dtypes that go into the train/test split.
dataStar.info()

### Train and test Set ###

In [25]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing (fixed seed), then give each part a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(dataStar, test_size=0.2, random_state=42)
)

In [26]:
## Stratified ##
#from sklearn.model_selection import StratifiedShuffleSplit
#split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
#for train_index, test_index in split.split(data, data["variableStratificada"]):
#strat_train_set = housing.loc[train_index]
#strat_test_set = housing.loc[test_index]

#from  sklearn.impute import SimpleImputer
#imputer = SimpleImputer(strategy='median')

#from  sklearn.preprocessing import OneHotEncoder
#catEncoder = OneHotEncoder()

## numerical ##
#from  sklearn.pipeline import Pipeline
#from  sklearn.preprocessing import StandardScaler
#numerical_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('attribs_adder', CombinedAttributesAdder()),])
# new_variable_transformed_numerical = num_pipeline.fit_transform(varible_numerica)

## numerical and categorial ##
#from sklearn import ColumnTransformer
#numerical_attribs = list(numerical_columns)
#categorical_attribs = list(categorial_columns)
#full_pipeline = ColumnTransformer([("num", numerical_pipeline, numerical_attribs),("cat", OneHotEncoder(), categorical_attribs),])
#dataTotal_prepared = full_pipeline.fit_transform(dataTotal) 

## Target ## 

In [27]:
# Separate the feature columns from the label column for both splits.
trainX, trainY = train.drop(columns='TargetClass'), train['TargetClass'].copy()
testX, testY = test.drop(columns='TargetClass'), test['TargetClass'].copy()
trainX.info()

## Feature Engineering ##

## Pipeline and Feature Scaling ##

In [28]:
from  sklearn.impute import SimpleImputer
from  sklearn.preprocessing import OneHotEncoder

from  sklearn.pipeline import Pipeline
from  sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, then standardise.
numerical_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),
                              ('std_scaler', StandardScaler())] )

from sklearn.compose import ColumnTransformer

# Every column of trainX is routed through the numeric pipeline.
num_attribs = list(trainX)
#cat_attribs = ["ocean_proximity"] ---build the list of categorical attributes---
full_pipeline = ColumnTransformer([("num", numerical_pipeline, num_attribs),
#                                   ("cat", OneHotEncoder(), cat_attribs),
                                  ])

# Fit the preprocessing on the training features and transform them in one step.
Xtrain = full_pipeline.fit_transform(trainX)  # array

print('Done')

 # **MODEL** # 

#### Models ####

In [29]:
# Models ==> ['logisRegre' , 'SVM'   , 'SGDClf' ,  'randomForestClassifier' , 'naiveBayes']

# Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


In [30]:

# Candidate classifiers. Only SGDClf is fitted here; the others are created unfitted.
SGDClf = SGDClassifier(random_state=42)
SGDClf.fit(Xtrain, trainY) # fit == train on the preprocessed training data

LRClf = LogisticRegression(random_state=0)
RFClf = RandomForestClassifier(random_state=0)
# NOTE: SVClf is created but is not in the list passed to compareModels in the next cell.
SVClf = LinearSVC(random_state=0)


#SGDClf.fit(Xtrain, trainY)

## Cross-Validation ##

In [31]:
from sklearn.model_selection import cross_val_score

# Automate the comparison of several models of the same kind.
def compareModels(models, x, y, numCV, scoringT="neg_mean_squared_error"):
    """Cross-validate every model and return all scores plus the best-scoring model.

    Parameters
    ----------
    models : iterable of estimators accepted by ``cross_val_score``.
    x, y : features and labels passed straight through to ``cross_val_score``.
    numCV : value forwarded as ``cv``.
    scoringT : value forwarded as ``scoring``. The scores are negated and square-rooted,
        so a negated-error scorer such as the default is expected.

    Returns
    -------
    (dict, estimator)
        A dict mapping each model's mean ``sqrt(-score)`` to the model, and the model
        stored under the smallest key.

    Notes
    -----
    The dict is keyed by score, so two models with exactly the same mean score share one
    entry and the later one wins. An empty ``models`` raises ``ValueError`` (from ``min``).
    """
    modelsCV = {}
    for estimator in models:
        fold_scores = cross_val_score(estimator, x, y, scoring=scoringT, cv=numCV)
        modelsCV[np.sqrt(-fold_scores).mean()] = estimator

    return modelsCV, modelsCV[min(modelsCV)]
    
# Compare three of the candidate classifiers with 6-fold cross-validation and report the outcome.
result, model = compareModels([SGDClf, LRClf, RFClf], Xtrain, trainY, 6)
print(result)
print('\n')
print(model)
print('DONE')

In [32]:
# Refit the selected model on the full preprocessed training matrix.
model.fit(Xtrain, trainY)

# Reuse the imputer and scaler already fitted on the training features. Calling fit_transform
# here would re-estimate the medians, means and variances from the test set, so the test data
# would be scaled differently from the data the model was trained on.
Xtest = full_pipeline.transform(testX)
Ypredicts = model.predict(Xtest)


In [33]:
# Metric
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the true and predicted test labels.
mse = mean_squared_error(testY , Ypredicts) 
rmse = np.sqrt(mse)
rmse

In [60]:
# Side-by-side table of the true test labels ('Star') and the model's predictions ('StarPred').
output = pd.DataFrame({'Star': testY, 'StarPred': Ypredicts})

def precision(a, b):
    """Return the share of positions where `a` and `b` hold equal values, as a percentage string.

    Despite its name, this is the fraction of matching entries (accuracy), not the
    precision metric of a classifier.

    Parameters
    ----------
    a, b : array-likes of equal length (lists, NumPy arrays, pandas Series, ...).
        They are compared position by position, so a Series whose index is not 0..n-1 works.

    Returns
    -------
    str
        The matching fraction formatted with two decimals, e.g. ``'87.50%'``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    expected = np.asarray(a)
    predicted = np.asarray(b)
    if expected.shape != predicted.shape:
        raise ValueError(
            'inputs must have the same shape, got {} and {}'.format(expected.shape, predicted.shape)
        )
    if expected.size == 0:
        raise ValueError('inputs must not be empty')

    # '.2%' sets the number of decimals; a bare '2%' would only set a minimum field width.
    return format(float(np.mean(expected == predicted)), '.2%')

# Percentage of test rows whose predicted label equals the true label.
precision(testY , Ypredicts)

In [36]:
### stratified CV ### see pag 136  Geron's book
#from sklearn.model_selection import StratifiedKFold
#from sklearn.base import clone
#skfolds = StratifiedKFold(n_splits=3, random_state=42)

# **Refinement** #

# **Deploy** #