In [1]:
#!pipenv install blackcellmagic --dev
#!pipenv install seaborn

In [2]:
%load_ext blackcellmagic

In [3]:
import pandas
df = pandas.read_csv("train.csv")

In [4]:
import seaborn
import matplotlib.pyplot as plt
import numpy as np
import re
from scipy.stats import shapiro
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import Imputer

In [24]:
class CrawtoML:
    """
    """

    def __init__(self, data, target, features="infer", problem="infer",imputer='most_frequent'):
        self.data = data
        self.target = target
        if features == "infer":
            self.features = list(self.data.columns)
            self.features.remove(self.target)
        # TODO elif
        if problem in ["classification", "regression"]:
            self.problem = problem
        else:
            # TODO infer
            raise Exception("problem=infer not implemented")
        self.imputer = imputer
        self.numeric_columns = self.numerics()
        #self.categorical_features = self.categorical_features()
        self.problematic_columns = self.column_parser()
        self.nan_columns()
        self.imputation()

    def numerics(self):
        numerics = []
        for i in self.features:
            if self.data[i].dtypes in ["int64", "float64"]:
                numerics.append(i)
        self.numeric_columns = numerics
        return self.numeric_columns
    
    def nan_columns(self):
        self.nan_columns = []
        df = pandas.DataFrame(self.data.isna().sum())
        for i in self.features:
            if df.loc[i][0] > 0:
                self.nan_columns.append(i)
                self.features.remove(i)
        
    
    def imputation(self):
        #http://www.stat.columbia.edu/~gelman/arm/missing.pandasf
        #http://www.feat.engineering/imputation-methods.html#fnref74
        self.imputed_columns = []
        if self.imputer in ['mean','median', 'most_frequent']:
            imputer = Imputer(strategy=self.imputer)
            self.imputed_columns = imputer.fit_transform(self.data[self.nan_columns])
        
            
    def categorical_features(self,threshold=10):
        categorical_features = []
        for i in self.features:
            if len(unique_labels(self.data[i])) < threshold:
                categorical_features.append(i)
        return categorical_features

    def other_types(self):
        others = [i for i in self.features if i not in self.numeric_columns]
        self.other_types = others
        return self.other_types

    def __repr__(self):
        return "Target Column: %s \n \
        Feature columns: %s\n \
        Numeric Columns: %s"(
            self.target, self.features, self.numeric_columns
        )

    def column_parser(self):
        problematic_columns = []
        for i in self.features:
            if "Id" in i:
                problematic_columns.append(i)
            elif "ID" in i:
                problematic_columns.append(i)

    def correlation_report(self):
        seaborn.heatmap(self.data[self.numeric_columns].corr())

    def target_distribution_report(self):
        if self.problem == "regression":
            print(seaborn.distplot(self.data[self.target]))
        elif self.problem == "classification":
            print(seaborn.countplot(self.data[self.target]))

    def numeric_columns_distribution_report(self):
        self.distribution_r()
#         print(
#             seaborn.PairGrid(
#                 self.data, x_vars=self.numeric_columns, y_vars=self.target
#             ).map(seaborn.distplot)
#         )

    def distribution_r(self):
            display(pandas.DataFrame(
                [
                    self.distribution_fit(self.data, i)
                    for i in self.numeric_columns + [self.target]
                ],
                index=self.numeric_columns + [self.target],
            ))

    def distribution_fit(self, data, numeric_column):
        """
        x is a column_name
        """
        shapiro_values = shapiro(data[numeric_column])
        test_indication = True if shapiro_values[1] > 0.05 else False

        distribution_types = ["norm", "expon", "logistic", "gumbel"]
        # anderson_values = anderson(automl.data[numeric_column], dist=i)

        return {
            "Shapiro-Wilks_Test_Statistic": shapiro_values[0],
            "Shapiro-Wilks_p_Value": shapiro_values[1],
            "Normal distribution ?": test_indication
            # "Anderson_Darling_Test_Statistic_Normal": anderson_values[0][0],
        }
    def nan_report(self):
        display(pandas.DataFrame(round((self.data.isna().sum()/self.data.shape[0])*100,2),columns=["Percent of data encoded NAN"]))

    def correlation_report(self,threshold=0.95):
        corr_matrix=self.data[[self.target]+self.numeric_columns].corr()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        highly_correlated_features = [column for column in upper.columns if any(upper[column] > threshold)]
        seaborn.heatmap(corr_matrix)
        if len(highly_correlated_features)>0:
            print(f'Highly Correlated features are {highly_correlated_features}')
        else:
            print("No Features are correlated above the threshold")
# """Saved patterns"""
#  print(seaborn.PairGrid(self.data, x_vars=self.numeric_columns, y_vars=self.target).map(
#             seaborn.boxplot
#         ))

In [25]:
automl = CrawtoML(
    data=df,
    target="Survived",
    problem='classification'
)



ValueError: could not convert string to float: 'C85'

In [None]:
df = pd.DataFrame(self.data.isna().sum())

for i in self.features:
    if df.loc[i][0] > 0:
        print(i)

In [None]:
automl.nan_report()

In [None]:
automl.t()

In [None]:
automl.target_distribution_report()

In [None]:
automl.numeric_columns_distribution_report()

In [None]:
automl.nan_report()

In [None]:
automl.correlation_report()

# Experiments

In [None]:
from scipy.stats import anderson

In [None]:
distribution_types = ['norm','expon','logistic','gumbel']

In [None]:
l = []
for i in distribution_types:
    l.append(anderson(automl.data[automl.numeric_columns[0]], dist=i))

In [None]:
l

In [None]:
l[0][0]

In [None]:
def distribution_fit(data, numeric_column):
        from scipy.stats import shapiro

        """
        x is a column_name
        """
        shapiro_values = shapiro(data[numeric_column])
        test_indication = True if shapiro_values[1] > 0.05 else False

        distribution_types = ["norm", "expon", "logistic", "gumbel"]
        # anderson_values = anderson(automl.data[numeric_column], dist=i)

        return {
            "Shapiro_Wilks_Test_Statistic": shapiro_values[0],
            "Shapiro_Wilks_p_Value": shapiro_values[1],
            "Does the test indicate the data is normally distributed": test_indication
            # "Anderson_Darling_Test_Statistic_Normal": anderson_values[0][0],
        }

In [None]:
distribution_fit(automl.data,automl.numeric_columns[0])

In [None]:
from scipy.stats import probplot
import matplotlib.pyplot as plt
probplot(automl.data[automl.numeric_columns[3]], plot=plt)
plt.show();

In [None]:
automl.numeric_columns[0]