In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import RandomOverSampler

## Star Dataset from: 
https://www.kaggle.com/datasets/deepu1109/star-dataset

Brown Dwarf -> Star Type = 0

Red Dwarf -> Star Type = 1

White Dwarf -> Star Type = 2

Main Sequence -> Star Type = 3

Supergiant -> Star Type = 4

Hypergiant -> Star Type = 5

In [21]:
# Read the star dataset from the working directory and preview its first rows.
stars_csv_path = "stars.csv"
df = pd.read_csv(stars_csv_path)
df.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


In [22]:
import re

# df.isnull().sum()
# df["Star type"].unique()

# print(df[df['Star color'] == "Blue-white"]["Star type"])
# print(df[df['Star color'] == "Blue White"]["Star type"])

def preprocess_color(color_name):
  """Normalise a star-colour label to a lowercase, hyphen-separated form.

  Variants that differ only in case, in space-vs-hyphen separators, or in
  surrounding whitespace collapse to one spelling (e.g. "Blue White",
  "Blue-white" and "Blue white " all become "blue-white"). "yellow-white" is
  additionally mapped to "white-yellow" so both word orders share one label.

  Args:
    color_name: Raw colour string.

  Returns:
    The normalised colour string; an empty or separator-only input yields "".
  """
  # Collapse every run of whitespace and/or hyphens into a single hyphen, then
  # drop any separator left at either end. Unlike indexing the last character,
  # this is safe for empty strings and handles leading or repeated separators.
  new_name = re.sub(r"[\s-]+", "-", color_name.lower()).strip("-")
  if new_name == "yellow-white":
    new_name = "white-yellow"
  return new_name

# print(df['Star color'].value_counts())

# Normalise the inconsistent spellings found in the colour column.
normalized_colors = df['Star color'].apply(preprocess_color)
df['Star color'] = normalized_colors

# Put the target column ('Star type') last, keeping the other columns in order.
feature_columns = [name for name in df.columns if name != 'Star type']
df = df[feature_columns + ['Star type']]
df.head()


# data = [8, 8, 9, 12, 13, 13, 14, 14, 15, 18, 22, 23, 24, 25, 30]
# fig, ax = plt.subplots()
# ax.hist(df["Star color"], edgecolor="black")

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star color,Spectral Class,Star type
0,3068,0.0024,0.17,16.12,red,M,0
1,3042,0.0005,0.1542,16.6,red,M,0
2,2600,0.0003,0.102,18.7,red,M,0
3,2800,0.0002,0.16,16.65,red,M,0
4,1939,0.000138,0.103,20.06,red,M,0


In [None]:
# Histogram colour and legend label for each star type; list position == "Star type" code.
colors = ["red", "blue", "green", "orange", "yellow", "purple"]
labels = ["brown dwarf", "red dwarf", "white dwarf", "main sequence", "supergiant", "hypergiant"] # from repo https://github.com/ACM-Research/coding-challenge-2023-spring

# One figure per feature column.
for label in df.columns:
  if label == "Star type":
    continue  # the target is what the other columns are compared against
  if label in ("Star color", "Spectral Class"):
    # Categorical column: a simple bar chart of category counts (computed once).
    counts = df[label].value_counts()
    plt.bar(counts.index, counts.values, edgecolor="black")
  else:
    # Numeric column: overlay one semi-transparent histogram per star type.
    for star_type, (color, type_label) in enumerate(zip(colors, labels)):
      plt.hist(df.loc[df["Star type"] == star_type, label], color=color, label=type_label, alpha=0.7, density=False)
    # A single legend call after all histograms are drawn is sufficient.
    plt.legend()
  plt.ylabel("Frequency")
  plt.title(label)
  plt.xlabel(label)
  plt.show()


# Train, validation, test datasets

In [69]:
# Shuffle with a fixed seed so the split, and everything derived from it, is
# reproducible on Restart & Run All.
shuffled = df.sample(frac=1, random_state=42)

# 60-20-20 split by position. Plain iloc slicing is used instead of np.split,
# which routes DataFrames through DataFrame.swapaxes (deprecated in newer pandas).
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Number of training rows per star type, to check the classes are roughly balanced.
for i in range(6):
    print(len(train[train["Star type"] == i]))

21
26
23
22
26
26


In [None]:
from pandas import DataFrame


def preprocess(dataframe: DataFrame, oversample=False, scaler=None, ohe=None, fit=True):
  """Convert one split of the star DataFrame into model-ready arrays.

  The four numeric columns are standardised with a StandardScaler and the two
  categorical columns are one-hot encoded; both results are concatenated
  column-wise into the feature matrix.

  Args:
    dataframe: Frame holding the numeric columns, the categorical columns and
      the "Star type" target column.
    oversample: When True, X and y are resampled with RandomOverSampler.
    scaler: Optional StandardScaler to use. A new one is created when None.
    ohe: Optional OneHotEncoder to use; it must produce dense output. A new
      one is created when None.
    fit: When True (default) the scaler and encoder are fitted on `dataframe`
      before transforming, which also fits any objects passed in. Pass False
      with already-fitted `scaler` and `ohe` to reuse the training-split
      transformation, so every split gets the same columns and scaling.

  Returns:
    A tuple `(data, X, y)`: `X` is the feature matrix, `y` the "Star type"
    values, and `data` is `X` with `y` appended as the last column.

  Raises:
    ValueError: If `fit` is False but `scaler` or `ohe` was not supplied.
  """
  columns_to_encode = ["Star color", "Spectral Class"]
  columns_to_scale = ["Temperature (K)", "Luminosity(L/Lo)", "Radius(R/Ro)", "Absolute magnitude(Mv)"]

  if not fit and (scaler is None or ohe is None):
    raise ValueError("fit=False requires already-fitted `scaler` and `ohe` arguments")

  if scaler is None:
    scaler = StandardScaler()
  if ohe is None:
    ohe = OneHotEncoder(sparse_output=False)

  if fit:
    scaled = scaler.fit_transform(dataframe[columns_to_scale])
    encoded = ohe.fit_transform(dataframe[columns_to_encode])
  else:
    # Reuse the statistics/categories learned earlier instead of re-fitting.
    scaled = scaler.transform(dataframe[columns_to_scale])
    encoded = ohe.transform(dataframe[columns_to_encode])

  X = np.concatenate([scaled, encoded], axis=1)
  y = dataframe["Star type"].values

  if oversample: # Likely not necessary here
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  return np.hstack((X, np.reshape(y, (-1, 1)))), X, y



# Fit the transformers on the training split only and reuse them for the
# validation and test splits, so all three share one feature layout and scaling.
feature_scaler = StandardScaler()
# handle_unknown="ignore": a category seen only in valid/test is encoded as all
# zeros instead of raising an error.
category_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# NOTE: train/valid/test are rebound from DataFrames to arrays here, so this
# cell must be run after a fresh run of the split cell.
train, X_train, y_train = preprocess(train, scaler=feature_scaler, ohe=category_encoder)
valid, X_valid, y_valid = preprocess(valid, scaler=feature_scaler, ohe=category_encoder, fit=False)
test, X_test, y_test = preprocess(test, scaler=feature_scaler, ohe=category_encoder, fit=False)
# for i in range(6):
#     print(sum(y_train == i))

# KNN