In [1]:
import pandas as pd
import numpy as np
import logisticregression as lr

In [2]:
class Feature:
    def __init__(self, name, dataset):
        self.name = name
        self.sorted_dataset = sorted(dataset)
        self.count = len(dataset)
        if self.count != 0:
            self.mean = sum(dataset) / self.count
            self.std = self.calc_std(dataset)
            self.min = self.sorted_dataset[0]
            self.p_25 = self.calc_percentiles(25)
            self.p_50 = self.calc_percentiles(50)
            self.p_75 = self.calc_percentiles(75)
            self.max = self.sorted_dataset[-1]
            self.infos = [self.count, self.mean, self.std, self.min, self.p_25, self.p_50, self.p_75, self.max]
        else:
            self.infos = [self.count]
            self.infos.extend([np.nan] * 7)
    
    def calc_std(self, dataset):
        sum_squares = 0
        for i in range(len(dataset)):
            sum_squares += (dataset[i] - self.mean) ** 2
        std = sum_squares / (self.count - 1)
        std = std ** 0.5
        return std
    
    def calc_percentiles(self, quartile):
        position_floaty = (float(quartile) / 100) * (self.count - 1)
        min_position = int(position_floaty)
        max_position = min_position + 1
        max_coef = position_floaty - min_position
        if max_coef == 0.0:
            return self.sorted_dataset[min_position]
        min_coef = 1 - max_coef
        result_min = (self.sorted_dataset[min_position] * min_coef)
        result_max = (self.sorted_dataset[max_position] * max_coef)
        return result_min + result_max 

In [22]:

def parse_arguments(args):
    ### TODO : parse arguments
    dataset_name = "datasets/dataset_train.csv"
    list_params = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    return(dataset_name, list_params)


class Dataset:
    def __init__(self, datafile, list_params):
        if datafile:
            self.read_csv(datafile)
        self.list_params = list_params
    
    def read_csv(self, dataset_name):
        try:
            df = pd.read_csv(dataset_name)
            self.features = list(df.select_dtypes(exclude=['object']).columns)
            self.X = df[self.features].to_numpy()
            one_hot_encoding = pd.get_dummies(df["Hogwarts House"], drop_first = False)
            self.houses = list(one_hot_encoding.columns)
            self.y = one_hot_encoding.to_numpy()
            self.input_df = df
        except FileNotFoundError:
            print(f"No such file or directory: '{dataset_name}'")
        except pd.errors.EmptyDataError:
            print(f"No columns to parse from file: '{dataset_name}'")
        return (None)
    
    def init_describe_df(self):
        self.output_df = pd.DataFrame(data = None, index = self.list_params)

    
    def clear_empty_values(X):
        X = X[~np.isnan(X)]
        return (X)

    def fill_output_df(self):
        for index, feature_name in self.features:
            self.clear_empty_values(self.X[index])
            feature = Feature(feature_name, X[index])
            output_df[feature_name] = feature.infos

In [23]:
dataset_name, list_params = parse_arguments(None)
data = Dataset(dataset_name, list_params)
print(len(data.features))
print(data.houses)
print(data.X.shape)
data.init_describe_df()
fill_output_df(input_df, output_df, list_params)
#     display(output_df)

14
['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
(1600, 14)


NameError: name 'fill_output_df' is not defined

In [3]:
in

In [67]:
test = pd.read_csv(dataset_name, index_col=None)

In [68]:
features = list(test.select_dtypes(exclude=['object']).columns)
print(features)

['Index', 'Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts', 'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 'Potions', 'Care of Magical Creatures', 'Charms', 'Flying']


In [60]:
test.head()

Unnamed: 0,Index,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,1,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,2,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,3,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64
4,4,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98


In [46]:
test.describe()

Unnamed: 0,Index,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1600.0,1566.0,1568.0,1567.0,1569.0,1561.0,1565.0,1565.0,1557.0,1566.0,1570.0,1560.0,1600.0,1600.0
mean,799.5,49634.570243,39.797131,1.14102,-0.387863,3.15391,-224.589915,495.74797,2.963095,1030.096946,5.950373,-0.053427,-243.374409,21.958012
std,462.02453,16679.806036,520.298268,5.219682,5.212794,4.155301,486.34484,106.285165,4.425775,44.125116,3.147854,0.971457,8.78364,97.631602
min,0.0,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,399.75,38511.5,-489.551387,-4.308182,-5.259095,3.099,-577.580096,397.511047,2.218653,1026.209993,3.646785,-0.671606,-250.6526,-41.87
50%,799.5,49013.5,260.289446,3.469012,-2.589342,4.624,-419.164294,463.918305,4.378176,1045.506996,5.874837,-0.044811,-244.867765,-2.515
75%,1199.25,60811.25,524.771949,5.419183,4.90468,5.667,254.994857,597.49223,5.825242,1058.43641,8.248173,0.589919,-232.552305,50.56
max,1599.0,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1098.958201,13.536762,3.056546,-225.42814,279.07
