# Kaggle Dataset

Download data from the following source, and add it to the data folder:
https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17

In [13]:
# Load Libraries
from pathlib import Path
import sys
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [25]:
# Set working directory
os.chdir(os.path.join(sys.path[0], ".."))

# Create processed data folder
data_folder = os.path.join(".", "data")
processed_folder = os.path.join(".", "data", "processed")

if not os.path.exists(processed_folder):
    os.makedirs(processed_folder)

In [50]:
# Create folder for star data
star_folder = os.path.join(".", "data", "processed", "stars")

if not os.path.exists(star_folder):
    os.makedirs(star_folder)
    
# Read in star data
star_raw_path = os.path.join(data_folder, "raw", "star_classification.csv")
star_data = pd.read_csv(star_raw_path).query('`class` == "STAR"')

# Separate stars
star_data.to_csv(os.path.join(star_folder, "stars.csv"))

In [51]:
# Create folder for qso data
qso_folder = os.path.join(".", "data", "processed", "qso")

if not os.path.exists(qso_folder):
    os.makedirs(qso_folder)
    
# Read in qso data
qso_raw_path = os.path.join(data_folder, "raw", "star_classification.csv")
qso_data = pd.read_csv(qso_raw_path).query('`class` == "QSO"')

# Separate qso
qso_data.to_csv(os.path.join(qso_folder, "qso.csv"))

In [52]:
# Create folder for galaxy data
galaxy_folder = os.path.join(".", "data", "processed", "galaxy")

if not os.path.exists(galaxy_folder):
    os.makedirs(galaxy_folder)
    
# Read in galaxy data
galaxy_raw_path = os.path.join(data_folder, "raw", "star_classification.csv")
galaxy_data = pd.read_csv(galaxy_raw_path).query('`class` == "GALAXY"')

# Separate galaxy
galaxy_data.to_csv(os.path.join(galaxy_folder, "galaxy.csv"))

In [54]:
# Create folder for all data
total_folder = os.path.join(".", "data", "processed", "total")

if not os.path.exists(total_folder):
    os.makedirs(total_folder)
    
# Read in galaxy data
total_raw_path = os.path.join(data_folder, "raw", "star_classification.csv")
total_data = pd.read_csv(total_raw_path)
                         
# Separate into train test
X = total_data.drop('class', axis=1)
y = total_data['class']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

training_data = X_train
training_data['class'] = y_train

test_data = X_valid
test_data['class'] = y_valid

# Separate train and test
training_data.to_csv(os.path.join(total_folder, "train.csv"))
test_data.to_csv(os.path.join(total_folder, "test.csv"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['class'] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['class'] = y_valid


# Galaxy Zoo Data

Go to the [SDSS website](http://skyserver.sdss.org/dr17/SearchTools/SQL/#) and run the following SQL query to download community classified galaxies.

```sql
SELECT
objid, nvote,
p_el as elliptical,
p_cw as spiralclock, p_acw as spiralanticlock, 
p_edge as edgeon,
p_dk as dontknow, p_mg as merger
FROM ZooNoSpec
```

The data contains a galaxy identifier, the number of votes, and the percentage for each classification.

In [83]:
# Read in galaxy data
gz = pd.read_csv(os.path.join(data_folder, "raw", "galaxy_zoo_classifications.csv"), skiprows=1)

# Define classification types
classifications = ['elliptical', 'spiralclock', 'spiralanticlock', 'edgeon', 'dontknow', 'merger']

# Get value classification certainty
gz = gz.assign(certainty=gz[classifications].max(axis=1))

# Select max classification certainty
gz = gz.assign(classification= np.select([gz.certainty == gz.elliptical,
                                       gz.certainty == gz.spiralclock,
                                       gz.certainty == gz.spiralanticlock,
                                       gz.certainty == gz.edgeon,
                                       gz.certainty == gz.dontknow,
                                       gz.certainty == gz.merger], 
                                      ['elliptical',
                                       'spiralclock',
                                       'spiralanticlock',
                                       'edgeon',
                                       'dontknow',
                                       'merger'], 'other'))

# Select columns
gz = gz[['objid', 'certainty', 'classification']]

# Save data
gz.to_csv(os.path.join(galaxy_folder, "galaxy_zoo_classifications.csv"))

In [88]:
# Separate into train test
X = gz.drop('classification', axis=1)
y = gz['classification']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

training_data = X_train
training_data['class'] = y_train

test_data = X_valid
test_data['class'] = y_valid

# Separate train and test
training_data.to_csv(os.path.join(galaxy_folder, "galaxy_train.csv"))
test_data.to_csv(os.path.join(galaxy_folder, "galaxy_test.csv"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['class'] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['class'] = y_valid
