In [2]:
import numpy as np
import pandas as pd
from math import isnan
from sklearn import preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
import json
def importFile(file_path):
    """Load a data file into pandas, picking the reader from the file extension.

    Supported inputs:
      * ``.csv`` / ``.txt`` -- the delimiter is guessed from the first line
        (tab, then semicolon, then comma); when none of them occurs the file
        is read as fixed-width text.
      * ``.json`` -- parsed with ``json.load`` and handed to ``pd.DataFrame``.
      * ``.xlsx`` -- a single DataFrame for a one-sheet workbook, otherwise a
        list with one DataFrame per sheet, in workbook order.

    Parameters
    ----------
    file_path : str
        Path of the file to load.

    Returns
    -------
    pandas.DataFrame, list of pandas.DataFrame, or None
        ``None`` is returned when the extension is not recognised.

    Raises
    ------
    NotImplementedError
        For ``.sql`` paths: ``pd.read_sql`` needs a query and a live database
        connection, neither of which can be derived from a file path alone.
    """

    #TODO : ability to set other parameters when loading data through constructor

    if file_path.endswith(('.csv', '.txt')):
        with open(file_path, 'r') as f:
            first_line = f.readline()
        # Candidate delimiters in priority order; the first one present in the
        # header line is the one passed to read_csv.
        for delimiter in ('\t', ';', ','):
            if delimiter in first_line:
                return pd.read_csv(file_path, sep=delimiter)
        # No known delimiter: fall back to fixed-width parsing.
        return pd.read_fwf(file_path)

    elif file_path.endswith('.json'):
        with open(file_path) as json_file:
            data = json.load(json_file)
        return pd.DataFrame(data)

    elif file_path.endswith('.xlsx'):
        # Open the workbook once and release the handle when done; the sheets
        # are read from the already-open ExcelFile instead of re-opening it.
        with pd.ExcelFile(file_path) as excel_file:
            sheets = excel_file.sheet_names
            number_of_sheets = len(sheets)
            print(f'found {number_of_sheets} sheets ')
            if number_of_sheets == 1:
                return pd.read_excel(excel_file, sheet_name=sheets[0])
            return [pd.read_excel(excel_file, sheet_name=sheet) for sheet in sheets]

    # TODO connecting to database and load sql tables in dataframe
    elif file_path.endswith('.sql'):
        # pd.read_sql(file_path) can never succeed (the required `con` argument
        # is missing), so fail with an explicit message instead.
        raise NotImplementedError(
            "Loading .sql files is not supported yet: pd.read_sql requires "
            "a query and a database connection."
        )

    else:
        # TODO return a message such that it doesn't abruptly stop the program (error handling)
        return None


In [14]:
# Load the market-segmentation training set and preview its first five rows
# (five is the default row count of DataFrame.head).
df = importFile('./datasets/MarketSegmentation_Train.csv')
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


## Encoding

In [29]:
#TODO ask if want to convert categorical to numerical, else find a way to impute categorical
# Categorical -> numerical: store the integer code of each Gender category in a
# new column, then keep the original column itself as a pandas categorical.
df["Gender_encoded"] = df["Gender"].astype("category").cat.codes
df["Gender"] = df["Gender"].astype("category")

In [30]:
df.head(5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Gender_encoded
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D,1
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A,0
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B,0
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B,1
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A,0


## Handling missing values

In [15]:
# Missing values: number of null entries in each column.
df.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [19]:
# mean imputation
# The frame mixes numeric and text columns, so restrict the mean to the numeric
# ones explicitly: without numeric_only=True older pandas emits a warning and
# pandas 2.x raises a TypeError on the text columns.
df.mean(numeric_only=True)

  df.mean()


ID                 463479.214551
Age                    43.466906
Work_Experience         2.641663
Family_Size             2.850123
dtype: float64

In [20]:
# Median of every numeric column; numeric_only=True skips the text columns,
# which otherwise trigger a warning (older pandas) or a TypeError (pandas 2.x).
df.median(numeric_only=True)

  df.median()


ID                 463472.5
Age                    40.0
Work_Experience         1.0
Family_Size             3.0
dtype: float64

In [22]:
# Most frequent value per column: DataFrame.mode can return several rows when
# there are ties, so only its first row is shown.
df.mode().iloc[0, :]

ID                 458982
Gender               Male
Ever_Married          Yes
Age                  35.0
Graduated             Yes
Profession         Artist
Work_Experience       1.0
Spending_Score        Low
Family_Size           2.0
Var_1               Cat_6
Segmentation            D
Name: 0, dtype: object

In [None]:
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
most_frequent_imputer = SimpleImputer(strategy='most_frequent')

# The 'mean' and 'median' strategies only accept numeric input, and this frame
# also holds text columns, so those two imputers are fitted on the numeric
# columns only.
numeric_columns = df.select_dtypes(include='number').columns


def _impute_numeric_columns(frame, imputer, columns):
    """Return a copy of `frame` in which `columns` have been filled by `imputer`.

    All other columns are carried over unchanged, so the result has the same
    columns and index as `frame`.
    """
    imputed = frame.copy()
    imputed[columns] = imputer.fit_transform(frame[columns])
    return imputed


# Fit the imputer to the data and transform it
df_mean_imputed = _impute_numeric_columns(df, mean_imputer, numeric_columns)
df_median_imputed = _impute_numeric_columns(df, median_imputer, numeric_columns)
# 'most_frequent' also supports non-numeric data, so it is applied to the whole frame.
df_most_frequent_imputed = pd.DataFrame(most_frequent_imputer.fit_transform(df), columns=df.columns)

In [None]:
knnimputer = KNNImputer(n_neighbors=5)

# KNNImputer needs purely numeric input, and this frame also holds text
# columns, so only the numeric columns are imputed; the remaining columns are
# copied over unchanged.
# NOTE(review): neighbours are found with a distance over the raw values, so a
# large-valued identifier such as ID presumably dominates it -- consider
# dropping ID and scaling the features before imputing; confirm before relying
# on these values.
numeric_columns = df.select_dtypes(include='number').columns
df_imputed = df.copy()
df_imputed[numeric_columns] = knnimputer.fit_transform(df[numeric_columns])

## Scaling

In [None]:
# import pandas as pd
# from sklearn.preprocessing import MinMaxScaler

# # Load the data set
# df = pd.read_csv("data.csv")

# # Create the rescaler object
# scaler = MinMaxScaler()

# # Fit the rescaler to the data and transform it
# df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
