# Linear Regression Alternative Solution 

In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import random
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings("ignore")

### Solution Details:
- We detected columns to check by searching numeric columns with less than 50 unique INT values (same as in the final solution).

- For each column that we check we transform it to dummies and call it X.

- We train X on target y (the original target in the users df).

- We then extract the beta coefficients and check the spearman correlation between the order of the coefficients and the order of the original values.

- We determine each column type by the columns spearman correlation, columns with score of 0.4 and above will be ordinal and columns with score under 0.4 will be nominal. 

In [84]:
def find_numeric_columns_to_check(df, nunique_th):
    numeric_df = df.select_dtypes(include=[np.number])
    under_th_df = numeric_df.loc[:, numeric_df.nunique() < nunique_th]
    only_int_df = under_th_df.loc[:, (under_th_df.fillna(-9999) % 1 == 0).all()]
    more_than_two_values = only_int_df.loc[:, only_int_df.nunique() > 2]
    return list(more_than_two_values.columns)

def get_dummies(columns, data):
    dummies = {}
    for column in columns:
        dummy_variable = pd.get_dummies(data[column])
        prefix = f"{column}_"
        dummy_variable = dummy_variable.add_prefix(prefix)
        dummies[column] = dummy_variable
        data[dummy_variable.columns] = dummy_variable
    return dummies

def train_lr_model(X, y):
    model = LinearRegression().fit(X, y)
    coefs = model.coef_
    return coefs

In [85]:
def main(data, target):
    columns_to_check = find_numeric_columns_to_check(data, 50)
    data = data.dropna()
    y = data[target]
    dummies = get_dummies(columns_to_check, data)
    
    for col_name in dummies:
        coefs = train_lr_model(dummies[col_name], y)
        original_column = data[col_name]
        order = sorted(original_column.unique())
        corr, _ = pearsonr(order, coefs)
        if abs(corr) >= 0.4:
            print(f"spearman r: {round(corr,2)}, {col_name} is Ordinal")
        else:
            print(f"spearman r: {round(corr,2)}, {col_name} is Nominal")

In [86]:
data_folder = "../datasets/converted_datasets"

# Adult  

In [87]:
data = pd.read_csv(f'{data_folder}/adult_converted.csv')
target = 'income'
main(data, target)

spearman r: 0.77, education is Ordinal
spearman r: 0.89, educational-num is Ordinal
spearman r: 0.04, workclass is Nominal
spearman r: -0.49, marital-status is Ordinal
spearman r: 0.16, occupation is Nominal
spearman r: -0.15, native-country is Nominal
spearman r: 0.34, relationship is Nominal


# Spotify

In [88]:
data = pd.read_csv(f'{data_folder}/spotify_converted.csv', low_memory=False)
target = 'mode'
main(data, target)

spearman r: -0.43, key is Ordinal
spearman r: -0.98, time_signature is Ordinal
spearman r: 0.1, genre is Nominal


# Titanic

In [89]:
data = pd.read_csv(f'{data_folder}/video_games_sales_converted.csv')
target = 'Global_Sales'
main(data, target)

spearman r: -0.4, Year_of_Release is Ordinal
spearman r: 0.2, Platform is Nominal
spearman r: 0.06, Genre is Nominal


# Video Games

In [90]:
data = pd.read_csv(f'{data_folder}/titanic_converted.csv')
target = 'Survived'
main(data, target)

spearman r: -0.34, Pclass is Nominal
spearman r: 0.07, SibSp is Nominal
spearman r: -0.78, Parch is Ordinal
spearman r: -0.81, Embarked is Ordinal


# Final Results

Out of 17 columns, this model successfully calssidfied 13 columns. \
Therefore receives an <b> accuracy score of 12/17 = 0.7 </b>.