In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import random
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import pearsonr


- for each column that we check we transform it to dummies and call it X

- We train X on target y (the original target in the users df)

- we take the beta coefficients and run a LM that Y are the coef and X is the ORDER in the original X (we get a new coefficient a slope from the coefficient and call it B)

- in the bootsramp experiment:
    - for i in 1000: 
          sample a random order for the ORDER of the original X
          run a regression of that x on the coefficients from the prev step.
          save beta_i to a list
      
    - see where B falls in the beta distribution.


- in the pearson corr experiment:
    -check the spearman corr between the ORDER and the coefficients


- ROC curve between the models.

In [11]:
def find_numeric_columns_to_check(df, nunique_th):
    numeric_df = df.select_dtypes(include=[np.number])
    under_th_df = numeric_df.loc[:, numeric_df.nunique() < nunique_th]
    only_int_df = under_th_df.loc[:, (under_th_df.fillna(-9999) % 1 == 0).all()]
    more_than_two_values = only_int_df.loc[:, only_int_df.nunique() > 2]
    return list(more_than_two_values.columns)


In [12]:
def get_dummies(columns):
    dummies = {}
    for column in columns:
        dummy_variable = pd.get_dummies(data[column])
        prefix = f"{column}_"
        dummy_variable = dummy_variable.add_prefix(prefix)
        dummies[column] = dummy_variable
        data[dummy_variable.columns] = dummy_variable
    return dummies

In [45]:
data_folder = "/home/dani/Desktop/school/tabular_data_science_project_gil/tabular_data_science_project/datasets/converted_datasets"


# adult  

In [64]:
data = pd.read_csv(f'{data_folder}/adult_converted.csv')
target = 'income'

# spotify

In [69]:
data = pd.read_csv(f'{data_folder}/spotify_converted.csv')
target = 'mode'

  exec(code_obj, self.user_global_ns, self.user_ns)


# Titanic

In [73]:
data = pd.read_csv(f'{data_folder}/video_games_sales_converted.csv')
target = 'Global_Sales'

# Video games

In [77]:
data = pd.read_csv(f'{data_folder}/titanic_converted.csv')
target = 'Survived'

# get dummies

In [78]:
columns_to_check = find_numeric_columns_to_check(data, 50)
data = data.select_dtypes(exclude=['object'])
data = data.dropna()
y = data[target]
dummies = get_dummies(columns_to_check)

# bootstramp experiment

In [79]:
for col_name in dummies:
    X = dummies[col_name]
    original_column = data[col_name]
    model = LinearRegression().fit(X, y)
    coefs = model.coef_
    order = sorted(original_column.unique().reshape(-1, 1))
    B = LinearRegression().fit(order, coefs).coef_
    hist = []
    for i in range(1000):
        random.shuffle(order)
        beta_i = LinearRegression().fit(order, coefs).coef_
        hist.append(beta_i)
    hist = [float(arr) for arr in hist]
    ecdf = ECDF(hist)
    prob = 1 - ecdf(B)
    print(f"The probability of sampling a value greater than or equal to {float(B)} is {float(prob)}")
    if prob < 0.05:
        print(f"{col_name} is Ordinal")
    else:
        print(f"{col_name} is Nominal")

The probability of sampling a value greater than or equal to -0.19531249999999992 is 0.841
Pclass is Nominal
The probability of sampling a value greater than or equal to -0.08314732142857142 is 0.981
SibSp is Nominal
The probability of sampling a value greater than or equal to -0.08175223214285712 is 0.951
Parch is Nominal
The probability of sampling a value greater than or equal to -0.22231445312499998 is 0.929
Embarked is Nominal


# pearson experiment

In [80]:
for col_name in dummies:
    X = dummies[col_name]
    original_column = data[col_name]
    model = LinearRegression().fit(X, y)
    coefs = model.coef_
    order = sorted(original_column.unique())
    corr, _ = pearsonr(order, coefs)
    if abs(corr) >= 0.5:
        print(f"spearman r: {corr}, {col_name} is Ordinal")
    else:
        print(f"spearman r: {corr}, {col_name} is Nominal")
print()

spearman r: -0.9867313369649982, Pclass is Ordinal
spearman r: -0.8114091663843725, SibSp is Ordinal
spearman r: -0.6908533033732353, Parch is Ordinal
spearman r: -0.888457306197265, Embarked is Ordinal



