In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv("data/train_star.csv")
test = pd.read_csv("data/test_star.csv")
train.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass
0,9.99,7.92,1.61,0.646,G5/G6V,19.483625,Giant
1,10.86,3.26,2.12,1.84,M1V:,18.426088,Giant
2,8.83,7.57,1.05,0.461,F3V,18.22548,Giant
3,7.72,24.8,0.89,0.613,G3IV,19.692257,Giant
4,8.81,3.17,1.03,0.872,G6III,16.315296,Dwarf


In [3]:
def parse_sptype(s):
    if pd.isna(s):
        return np.nan, np.nan, np.nan
    TEMP_MAP = {"O": 0, "B": 1, "A": 2, "F": 3, "G": 4, "K": 5, "M": 6}
    LUM_MAP = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
    s = str(s).strip().replace(" ", "").replace(":", "")
    temp_char = s[0]
    temp_code = TEMP_MAP.get(temp_char, np.nan)
    nums = re.findall(r"\d+", s)
    if len(nums) == 0:
        subclass = np.nan
    else:
        nums_float = [float(n) for n in nums]
        subclass = float(np.mean(nums_float))
    lum_code_num = np.nan
    for lum_str in ["III", "II", "IV", "V", "I"]:
        if lum_str in s:
            lum_code_num = LUM_MAP[lum_str]
            break

    return temp_code, subclass, lum_code_num

def temperature(bv):
    if pd.isna(bv):
        return np.nan
    return 4600 * (1/(0.92*bv + 1.7) + 1/(0.92*bv + 0.62))

def distance(plx):
    if pd.isna(plx) or plx <= 0:
        return np.nan
    return 1000.0 / plx

def absolute_mag(Vmag, dist_pc):
    if pd.isna(dist_pc) or dist_pc <= 0:
        return np.nan
    return Vmag - 5 * np.log10(dist_pc) + 5




In [4]:
train[["Sp_temp", "Sp_subclass", "Sp_lum"]] = \
    train["SpType"].apply(lambda x: pd.Series(parse_sptype(x)))

test[["Sp_temp", "Sp_subclass", "Sp_lum"]] = \
    test["SpType"].apply(lambda x: pd.Series(parse_sptype(x)))

train["T_eff"] = train["B-V"].apply(temperature)
test["T_eff"]  = test["B-V"].apply(temperature)

train["Dist_pc"] = train["Plx"].apply(distance)
test["Dist_pc"]  = test["Plx"].apply(distance)

train["M_V"] = train.apply(lambda r: absolute_mag(r.Vmag, r.Dist_pc), axis=1)
test["M_V"]  = test.apply(lambda r: absolute_mag(r.Vmag, r.Dist_pc), axis=1)

for col in ["T_eff", "Dist_pc"]:
    train[f"log_{col}"] = np.log10(train[col].clip(lower=1e-6))
    test[f"log_{col}"]  = np.log10(test[col].clip(lower=1e-6))


train = train.drop("SpType", axis=1)
test = test.drop("SpType", axis=1)

train.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,Amag,TargetClass,Sp_temp,Sp_subclass,Sp_lum,T_eff,Dist_pc,M_V,log_T_eff,log_Dist_pc
0,9.99,7.92,1.61,0.646,19.483625,Giant,4.0,5.5,5.0,5793.079693,126.262626,4.483626,3.76291,2.101275
1,10.86,3.26,2.12,1.84,18.426088,Giant,6.0,1.0,5.0,3344.743474,306.748466,3.426088,3.524363,2.486782
2,8.83,7.57,1.05,0.461,18.22548,Giant,3.0,3.0,5.0,6571.226571,132.100396,3.225479,3.817646,2.120904
3,7.72,24.8,0.89,0.613,19.692257,Giant,4.0,3.0,4.0,5917.104412,40.322581,4.692258,3.772109,1.605548
4,8.81,3.17,1.03,0.872,16.315296,Dwarf,4.0,6.0,3.0,5072.687407,315.457413,1.315296,3.705238,2.498941


In [5]:
for col in ["Sp_temp", "Sp_subclass", "Sp_lum", "Dist_pc", "M_V", "log_Dist_pc"]:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())

train.isna().sum()

Vmag           0
Plx            0
e_Plx          0
B-V            0
Amag           0
TargetClass    0
Sp_temp        0
Sp_subclass    0
Sp_lum         0
T_eff          0
Dist_pc        0
M_V            0
log_T_eff      0
log_Dist_pc    0
dtype: int64

In [6]:
train["TargetClass"] = 0 + (train["TargetClass"] == "Giant")
train.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,Amag,TargetClass,Sp_temp,Sp_subclass,Sp_lum,T_eff,Dist_pc,M_V,log_T_eff,log_Dist_pc
0,9.99,7.92,1.61,0.646,19.483625,1,4.0,5.5,5.0,5793.079693,126.262626,4.483626,3.76291,2.101275
1,10.86,3.26,2.12,1.84,18.426088,1,6.0,1.0,5.0,3344.743474,306.748466,3.426088,3.524363,2.486782
2,8.83,7.57,1.05,0.461,18.22548,1,3.0,3.0,5.0,6571.226571,132.100396,3.225479,3.817646,2.120904
3,7.72,24.8,0.89,0.613,19.692257,1,4.0,3.0,4.0,5917.104412,40.322581,4.692258,3.772109,1.605548
4,8.81,3.17,1.03,0.872,16.315296,0,4.0,6.0,3.0,5072.687407,315.457413,1.315296,3.705238,2.498941


In [7]:
X = train.drop(columns="TargetClass")
y = train["TargetClass"]

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
model = LogisticRegression(max_iter=500, multi_class='multinomial')
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(accuracy_score(y_val, y_pred))

0.9676874849288642


In [10]:
X_test = test.copy()
X_test_scaled = scaler.transform(X_test)

test_pred = model.predict(X_test_scaled)

res = pd.DataFrame({
    "index": range(len(test_pred)),
    "TargetClass": test_pred
})

res.to_csv("solutions/solutions.csv", index=False)
res.head()

Unnamed: 0,index,TargetClass
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
