In [17]:
"""Script for plotting a bar chart with types of variable stars in
the current version of the General Catalog of Variable Stars (GCVS).
Data sources:
http://www.sai.msu.su/gcvs/gcvs/gcvs5/gcvs5.txt
https://cdsarc.cds.unistra.fr/viz-bin/cat/B/vsx

According to GCVS Variability Types description,
http://www.sai.msu.su/gcvs/gcvs/vartype.htm
if a variable belongs to several types of variability, the types are joined
in the data field by a "+" sign, e.g., E+UG, UV+BY.
Multiple classifications for object types are separated by a solidus ("/").
We collect them separately in additional data arrays.
Uncertainty on type of variability marked with a colon (:) is discarded for simplicity.

According to variable star type designations in vsx,
https://www.aavso.org/vsx/index.php?view=about.vartypes
A colon (:) after the variability type -or any other field- means
the value/classification is uncertain.
A pipe character (|) between two different types signifies a logical OR;
the classification is uncertain and all possible types are indicated.
An example of this is ELL|DSCT, where the star may be an ellipsoidal binary system
or a DSCT-type pulsating variable with half the given period.
A plus character (+) signifies a logical AND; two different variability types
are seen in the same star or system. An example of this would be ELL+DSCT, where
one of the components of an ellipsoidal binary system is a DSCT-type pulsating variable.
A slash character (/) indicates a subtype. In the case of binary systems (eclipsing,
ellipsoidal or reflection variables) it is used to help describe either the physical
properties of the system (E/PN or EA/RS), the luminosity class of the components (EA/DM),
or the degree of filling of their inner Roche lobes (EA/SD).
This is the GCVS classification system. In cataclysmic variables, slash characters
are used to indicate some properties of the system, as in the degree of polarization
(NA/DQ) or the nature of their components (UG/IBWD).
"""


import os

from scour import scour
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import pandas as pd


def optimize_svg(tmp_path, path):
    """Run scour over the SVG file at ``tmp_path`` and write the result to ``path``.

    Both files are opened in binary mode. The scour options start from
    ``scour.generateDefaultOptions()`` and are then overridden with the
    attribute values listed in ``overrides`` below.
    """
    overrides = {
        "enable_viewboxing": True,
        "strip_comments": True,
        "strip_ids": True,
        "remove_metadata": True,
        "shorten_ids": True,
        "indent_type": "none",
        "newlines": False,
    }
    with open(tmp_path, "rb") as svg_in, open(path, "wb") as svg_out:
        scour_options = scour.generateDefaultOptions()
        for attribute, value in overrides.items():
            setattr(scour_options, attribute, value)
        scour.start(scour_options, svg_in, svg_out)


def fill_dct(dct, typ):
    """Count one more occurrence of ``typ`` in ``dct``.

    A key that is not yet present is created with the value 1; an existing
    key has its value increased by 1. The dictionary is modified in place
    and nothing is returned.
    """
    dct[typ] = dct.get(typ, 0) + 1


types_dct = {}
types_plus = {}
types_slash = {}
STRIP = True

# Read the GCVS file, take the variability type field of every line and count
# the types in types_dct. If STRIP is True, leading/trailing ":" (uncertainty
# marks) are removed so uncertain types are merged with the certain ones.
# Components of "+"-joined and "/"-joined types are additionally counted in
# types_plus and types_slash; those two dictionaries are not consumed in the
# visible part of the notebook.
# NOTE(review): str.strip(":") removes only leading/trailing colons, so a
# combined type of the form "A:+B" keeps its inner colon as a types_dct key --
# confirm this is intended.
with open("../../../data/gcvs/gcvs5.txt", encoding="ascii") as gcvs_file:
    # Iterate over the file lazily instead of loading every line at once.
    for line in gcvs_file:
        typ = line[41:51].strip()
        if STRIP:
            typ = typ.strip(":")
        for separator, counts in (("+", types_plus), ("/", types_slash)):
            if separator in typ:
                for typsplit in typ.split(separator):
                    if STRIP:
                        typsplit = typsplit.strip(":")
                    fill_dct(counts, typsplit)
        fill_dct(types_dct, typ)

# Type counts of the GCVS, most frequent type first.
gcvs = pd.Series(types_dct).sort_values(ascending=False)

# Type counts of the VSX; only the "Type" column of the CSV file is read.
vsx_data_raw = pd.read_csv("../../../data/vsx/vsx_csv.dat", usecols=["Type"])
vsx_types = vsx_data_raw["Type"].str.strip(":")
# vsx_types is already a Series; squeezing it would turn a one-row Series
# into a scalar, which has no value_counts().
vsx = vsx_types.value_counts()


In [30]:
# Factor the GCVS counts are multiplied by before being put next to the VSX
# counts (the legend labels it "x39").
# NOTE(review): presumably total VSX objects / total GCVS objects -- confirm.
K = 38.97  # 2268196/58202
df = pd.DataFrame({"vsx": vsx,
                   "gcvs": gcvs * K}).fillna(0).astype(int)

# Negative slice start: keep the last 39 rows of each ascending sort.
NUM = -39
df_gcvs = df.sort_values(by="gcvs")[NUM:]
df_vsx = df.sort_values(by="vsx")[NUM:]
print("GCVS sorted:")
print(df_gcvs[-10:])
print("VSX sorted:")
print(df_vsx[-10:])

ax = df_gcvs.plot.bar(figsize=(16, 9), width=0.88, rot=45)
# ax = df_vsx.plot.bar(figsize=(16, 9), width=0.88, rot=45)
ax.legend([
    "Типы переменных звезд VSX",
    "Типы переменных звезд ОКПЗ x39",
    ], fontsize=12, loc="upper left")

plt.subplots_adjust(left=0.051, bottom=0.102, right=0.985, top=0.955)
plt.xlabel("Типы переменных звезд", fontsize=14)
plt.ylabel("Количество переменных звезд", fontsize=14, labelpad=0)
plt.title("Распределение по типам переменных звезд в VSX и ОКПЗ, "
    + f"{vsx.sum()} и {gcvs.sum()} объектов. Июнь 2023 года",
    fontsize=15)

FILE_EXT = "png"
PLT_PTH = "../../../plots/stars/var_types_distribution-gcvs-sorted"
# PLT_PTH = "../../../plots/stars/var_types_distribution-vsx-sorted"
tmp_pth = f"{PLT_PTH}_.{FILE_EXT}"
pth = f"{PLT_PTH}.{FILE_EXT}"
if FILE_EXT == "svg":
    # SVG goes through a temporary file that optimize_svg rewrites into pth.
    plt.savefig(tmp_pth, dpi=120)
    optimize_svg(tmp_pth, pth)
    os.remove(tmp_pth)
else:
    # Other formats need no post-processing: write the final path directly
    # instead of leaving the figure under the temporary "_." name.
    plt.savefig(pth, dpi=120)

GCVS sorted:
         vsx    gcvs
EB     29270   55142
RR      1888   63560
RRC    39838   65235
SRB     4007  110635
SR    327214  123963
EW    404075  134953
LB      3641  153931
EA    101690  157984
RRAB   99254  278401
M      82446  356185
VSX sorted:
         vsx    gcvs
BY     86947   48790
EC     94350      77
ROT    98141       0
RRAB   99254  278401
EA    101690  157984
SRS   119308    4364
VAR   147885      77
SR    327214  123963
E     338648   29071
EW    404075  134953


