In [3]:
import pandas as pd
import numpy as np
import logisticregression as lr

In [4]:
class Feature:
    def __init__(self, name, dataset):
        self.name = name
        self.sorted_dataset = sorted(dataset)
        self.count = len(dataset)
        if self.count != 0:
            self.mean = sum(dataset) / self.count
            self.std = self.calc_std(dataset)
            self.min = self.sorted_dataset[0]
            self.p_25 = self.calc_percentiles(25)
            self.p_50 = self.calc_percentiles(50)
            self.p_75 = self.calc_percentiles(75)
            self.max = self.sorted_dataset[-1]
            self.infos = [self.count, self.mean, self.std, self.min, self.p_25, self.p_50, self.p_75, self.max]
        else:
            self.infos = [self.count]
            self.infos.extend([np.nan] * 7)
    
    def calc_std(self, dataset):
        # sum_squares = 0
        dataset = dataset.reshape(-1,1).T
        # print(f"{dataset.reshape(-1,1).T.shape = }")
        sum_squares = (dataset - self.mean) ** 2
        # for i in range(len(dataset)):
        #     sum_squares += (dataset[i] - self.mean) ** 2
        std = sum_squares / (self.count - 1)
        std = std ** 0.5
        return std
    
    def calc_percentiles(self, quartile):
        position_floaty = (float(quartile) / 100) * (self.count - 1)
        min_position = int(position_floaty)
        max_position = min_position + 1
        max_coef = position_floaty - min_position
        if max_coef == 0.0:
            return self.sorted_dataset[min_position]
        min_coef = 1 - max_coef
        result_min = (self.sorted_dataset[min_position] * min_coef)
        result_max = (self.sorted_dataset[max_position] * max_coef)
        return result_min + result_max 

In [5]:

def parse_arguments(args):
    ### TODO : parse arguments
    dataset_name = "datasets/dataset_train.csv"
    list_params = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    return(dataset_name, list_params)


class Dataset:
    def __init__(self, datafile, list_params):
        if datafile:
            self.read_csv(datafile)
        self.list_params = list_params
    
    def read_csv(self, dataset_name):
        try:
            df = pd.read_csv(dataset_name)
            self.features = list(df.select_dtypes(exclude=['object']).columns)
            self.X = df[self.features].to_numpy()
            self.X = self.X.T
            one_hot_encoding = pd.get_dummies(df["Hogwarts House"], drop_first = False)
            self.houses = list(one_hot_encoding.columns)
            self.y = one_hot_encoding.to_numpy()
            self.input_df = df
        except FileNotFoundError:
            print(f"No such file or directory: '{dataset_name}'")
        except pd.errors.EmptyDataError:
            print(f"No columns to parse from file: '{dataset_name}'")
        return (None)
    
    def init_describe_df(self):
        self.output_df = pd.DataFrame(data = None, index = self.list_params)

    
    def clear_empty_values(self, X):
        X = X[~np.isnan(X)]
        return (X)

    def fill_output_df(self):
        for index, feature_name in enumerate(self.features):
            self.clear_empty_values(self.X[index])
            feature = Feature(feature_name, self.X[index])
            self.output_df[feature_name] = feature.infos

In [6]:
dataset_name, list_params = parse_arguments(None)
data = Dataset(dataset_name, list_params)
print(len(data.features))
print(data.houses)
print(data.X.shape)
data.init_describe_df()
data.fill_output_df()
#     display(output_df)

14
['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
(14, 1600)


In [7]:
data.output_df

Unnamed: 0,Index,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600
mean,799.5,,,,,,,,,,,,-243.374409,21.958012
std,"[[19.993749023132207, 19.968741206968186, 19.9...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[0.2645916731339085, 0.22031488384962863, 0.4...","[[1.2215821165776415, 3.3862586837350195, 0.21..."
min,0.0,-2464.0,-834.475335,-10.295663,-7.413409,-7.964,-926.892512,298.21896,-7.322486,907.190274,-1.404884,-2.002451,-261.04892,-181.47
25%,399.75,53711.75,,4.939898,-6.24599,5.8715,,600.838521,,963.123324,7.230894,,-250.6526,-41.87
50%,799.5,69534.5,-625.394544,6.370291,,1.75,-906.934757,353.044548,,930.770252,2.669934,-2.680301,-244.867765,-2.515
75%,1199.25,54775.5,487.167177,5.165495,,-8.55325,-721.768614,343.807317,,,,0.263624,-232.552305,50.56
max,1599.0,82713.0,723.816839,8.46534,8.17917,8.445,,665.216678,7.208415,1087.949205,12.3826,,-225.42814,279.07


In [63]:
test = pd.read_csv("datasets/test.csv", index_col=None)

In [65]:
features = list(test.columns[6:]) ### TODO: rajouter option features
X = test[features].to_numpy()

In [64]:
test.head()

Unnamed: 0,Index,Hogwarts House,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Unnamed: 14,Potions,Care of Magical Creatures,Charms,Flying
0,0,Ravenclaw,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,,3.790369,0.715939,-232.79405,-26.89
1,1,Slytherin,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,,7.248742,0.091674,-252.18425,-113.45
2,2,Ravenclaw,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,,8.728531,-0.515327,-227.34265,30.42
3,3,Gryffindor,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,,0.821911,-0.01404,-256.84675,200.64
4,4,Gryffindor,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,,4.311066,-0.26407,-256.3873,157.98


In [69]:
X[8]

array([ 496.39494499,   -5.21589115,   -4.96394945,    5.855     ,
       -626.55204113,  567.84240159,   -6.19866123,  925.25550039,
                 nan,    1.08651782,    1.14703153, -252.27561   ,
        244.11      ])

In [68]:
for i, f in enumerate(features):
    print(i, f)

0 Astronomy
1 Herbology
2 Defense Against the Dark Arts
3 Divination
4 Muggle Studies
5 Ancient Runes
6 History of Magic
7 Transfiguration
8 Unnamed: 14
9 Potions
10 Care of Magical Creatures
11 Charms
12 Flying
