In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Load the raw dataset from 'adult.csv' (path is relative to the working directory).
df=pd.read_csv('adult.csv')

In [3]:
# Rename the hyphenated "native-country" column to "native_country".
df=df.rename(columns={"native-country": "native_country"})

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


პირველ რიგში განვსაზღვროთ ამოცანის ტიპი და საპროგნოზო ცვლადი

In [5]:
# Task type: any value other than 'Regression' enables label encoding of the
# target and a stratified train/test split further down.
problem_type = 'Classification'
# problem_type = 'Regression'
# Name of the column to predict.
target='income'

გადავყაროთ უსახელო სვეტები, ამოვიღოთ დაშორებები სვეტების სახელებს შორის, გადავყაროთ დუბლიკატები, გადავაგდოთ id სვეტი. ამისთვის შემოვიღოთ შემდეგი ფუნქციები:

In [6]:
#1
def drop_unnamed_column(X):
    """Drop every column whose label starts with ``'Unnamed: 0'``.

    Labels are compared through ``str()`` so frames that also carry
    non-string labels (for example integer column names) are handled
    instead of raising ``AttributeError``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the matching columns (``X`` itself when nothing matched).
    """
    for column in X.columns:
        if str(column).startswith('Unnamed: 0'):
            X = X.drop(columns=column)
            print('Dropped with Unnamed - ', column)
    return X

In [7]:
#2
def remove_spaces(X):
    """Strip leading/trailing whitespace from string values in object columns.

    Only values that are actual ``str`` instances are stripped. Missing
    values (``NaN``/``None``) and other non-string objects are passed through
    unchanged, so they are not turned into the literal text ``'nan'`` /
    ``'None'`` and remain detectable with ``isnull()`` afterwards.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with the object columns cleaned.
    """
    X = X.copy()
    for column in X.select_dtypes(include=['object']).columns:
        X[column] = X[column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    return X

In [8]:
#3
def drop_dublicates(X):
    """Remove duplicate rows, then rows and columns that are entirely missing.

    All three operations are applied in place, so the frame passed in is
    modified and the very same object is returned.
    """
    # Keep the first occurrence of each repeated row.
    X.drop_duplicates(keep='first', inplace=True)
    # Rows first, then columns: drop those where every value is missing.
    for axis_name in ('index', 'columns'):
        X.dropna(axis=axis_name, how='all', inplace=True)
    return X

In [9]:
#4
def drop_id_column(X):
    """Drop identifier-like columns that hold a distinct value on every row.

    A column is dropped when its label (case-insensitive) is ``'id'`` or ends
    with ``' id'``, ``'_id'`` or ``'-id'`` AND its number of unique values
    equals the number of rows. Labels are compared through ``str()`` so
    non-string labels no longer raise ``AttributeError``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the identifier columns (``X`` itself when none matched).
    """
    n_rows = X.shape[0]
    for column in X.columns:
        label = str(column).lower()
        looks_like_id = label == 'id' or label.endswith((' id', '_id', '-id'))
        if looks_like_id and X[column].nunique() == n_rows:
            X = X.drop(columns=column)
            print('Droped with ID', column)
    return X

In [10]:
# Apply cleaning steps 1-4 in order, passing each result on to the next step.
df = (
    df
    .pipe(drop_unnamed_column)  # 1
    .pipe(remove_spaces)        # 2
    .pipe(drop_dublicates)      # 3
    .pipe(drop_id_column)       # 4
)

საპროგნოზო ცვლადის გარდაქმნა მოვახდინოთ

In [11]:
#5
def target_preprocessing(df, target, problem_type):
    """Drop rows with a missing target and label-encode it for non-regression tasks.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``target`` column; it is not modified.
    target : str
        Name of the target column.
    problem_type : str
        When equal to ``'Regression'`` the target is left as is; any other
        value makes the target be cast to ``str`` and label-encoded.

    Returns
    -------
    (pandas.DataFrame, LabelEncoder or str)
        The filtered frame and the fitted encoder. For regression the second
        element is the string ``'None'`` (kept for backward compatibility).
    """
    # Explicit copy: the assignment below must write to an independent frame,
    # not to a filtered slice of the caller's data (SettingWithCopyWarning).
    df = df[df[target].notnull()].copy()
    le = 'None'
    if problem_type != 'Regression':
        le = LabelEncoder()
        df[target] = le.fit_transform(df[target].astype(str))
    return df, le

In [12]:
# Keep only rows with a known target; `le` is the fitted LabelEncoder, or the
# string 'None' when problem_type is 'Regression'.
df, le = target_preprocessing(df, target, problem_type)

შევცვალოთ ძირითადი ცარიელი მნიშვნელობები NaN-ით

In [13]:
# 6)

def replace_most_pop_miss(X, missing_tokens=None):
    """Replace common missing-value placeholders and blank strings with NaN.

    Nothing is replaced when the frame has no object-dtype columns; in that
    case ``X`` itself is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    missing_tokens : iterable of str, optional
        Exact cell values to treat as missing. Defaults to
        ``'?', 'n/a', 'nan', 'NaN', ',', 'N/a', 'N/A', 'none', '*', '.'``.

    Returns
    -------
    pandas.DataFrame
        Frame with placeholders and empty/whitespace-only strings set to NaN.
    """
    if missing_tokens is None:
        missing_tokens = ['?', 'n/a', 'nan', 'NaN', ',', 'N/a', 'N/A', 'none', '*', '.']

    if len(X.select_dtypes(include=['object']).columns) > 0:
        # Both replacements return new frames, so the caller's data is never
        # half-modified in place.
        X = X.replace(dict.fromkeys(missing_tokens, np.nan))
        # Empty or whitespace-only strings also count as missing.
        X = X.replace(r'^\s*$', np.nan, regex=True)

    return X

In [14]:
# Turn placeholder values such as '?' and blank strings into NaN.
df=replace_most_pop_miss(df)

განვსაზღვროთ სვეტების ტიპები

In [15]:
def check_columns_type_with_Nan_replace(X, column):
    """Fill NaN with 0 in ``X[column]`` and cast it to int when every value is whole.

    An integer column that contains NaN is stored as float; filling the gaps
    and testing ``value % 1 == 0`` reveals whether it is integer-valued.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the column. ``X[column]`` is overwritten in place
        (NaN filled with 0 and, when applicable, cast to int).
    column : hashable
        Label of the column to inspect.

    Returns
    -------
    pandas.Series
        The updated ``X[column]``.
    """
    X[column] = X[column].fillna(0)
    values = X[column]

    try:
        # One vectorised pass instead of a Python-level loop over the values.
        all_integral = bool((values % 1 == 0).all())
    except (TypeError, ValueError):
        # At least one value does not support "% 1" (e.g. a string), so the
        # column cannot be integer-valued.
        all_integral = False

    if all_integral:
        X[column] = values.astype(int)
    return X[column]

In [16]:
def define_columns_type(X, max_categories=60):
    """Split column labels into high-cardinality, categorical and numeric lists.

    Rules, applied per column on a private copy of ``X``:

    * numeric (non-boolean) dtype: NaN is filled and the column is cast to int
      when possible via ``check_columns_type_with_Nan_replace``; an integer
      column with at most ``max_categories`` distinct values is categorical,
      every other numeric column is numeric;
    * any other dtype: more than ``max_categories`` distinct values means
      high-cardinality, otherwise categorical.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is not modified.
    max_categories : int, default 60
        Largest number of distinct values a column may have and still be
        treated as categorical.

    Returns
    -------
    (list, list, list)
        ``high_card_features_ls, cat_features_ls, num_features_ls``.
    """
    high_card_features_ls = []
    cat_features_ls = []
    num_features_ls = []

    X = X.copy()
    for column in X.columns:
        dtype = X[column].dtype
        # dtype helpers recognise every int/float width on every platform,
        # unlike a comparison against the builtin int/float types.
        is_number = (pd.api.types.is_numeric_dtype(dtype)
                     and not pd.api.types.is_bool_dtype(dtype))

        if is_number:
            # Fill NaN and cast to int if all values are whole numbers.
            X[column] = check_columns_type_with_Nan_replace(X, column)
            few_levels = X[column].nunique() <= max_categories
            if pd.api.types.is_integer_dtype(X[column].dtype) and few_levels:
                cat_features_ls.append(column)
            else:
                num_features_ls.append(column)
        elif X[column].nunique() > max_categories:
            # Non-numeric column with many distinct values: it must not be
            # reported as numeric (medians/correlations would fail on it).
            high_card_features_ls.append(column)
        else:
            cat_features_ls.append(column)

    return high_card_features_ls, cat_features_ls, num_features_ls

In [17]:
# Classify the feature columns (target excluded) into the three lists.
high_card_features_ls, cat_features_ls, num_features_ls =  define_columns_type(df.drop(columns=target))

In [18]:
num_features_ls

['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

In [19]:
cat_features_ls

['workclass',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'native_country']

კატეგორიული ცვლადების ენკოდირება

In [20]:
# One-hot encode the categorical columns listed in cat_features_ls.
df = pd.get_dummies(df, columns = cat_features_ls)

In [21]:
df.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,25,226802,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,0,0,50,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,0,0,40,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,7688,0,40,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,0,0,30,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


გავხლიჩოთ მონაცემები საწვრთნელ და სატესტო ნაწილებად

In [22]:
from sklearn.model_selection import train_test_split, StratifiedKFold

In [97]:
# Separate features from the target.
x = df.drop(columns=target)
y = df[target]

# Share of rows held out for testing. It is defined here so that both task
# types use it; the regression branch previously referenced this name while
# it was never assigned.
test_size_in = 0.2

# Stratify on the target only for classification-like tasks.
stratify_labels = y if problem_type != 'Regression' else None

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size_in, shuffle=True,
    random_state=42, stratify=stratify_labels)

In [98]:
x_train.isna().sum().sum()

0

გადავყაროთ ისეთი სვეტები, სადაც მნიშვნელობების 70%-ზე მეტი ცარიელია

In [99]:
def more_than_seventy_percent_of_nan(X, threshold=0.7):
    """Drop columns whose share of missing values is strictly above ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.7
        Maximum tolerated fraction of missing values per column.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame without the sparse columns, and the labels that were dropped
        (so the same columns can be removed from other splits).
    """
    # Fraction of missing values per column, computed in one pass.
    missing_share = X.isnull().mean()
    droped_columns = missing_share[missing_share > threshold].index.tolist()
    X = X.drop(columns=droped_columns)
    return X, droped_columns

In [100]:
# Decide on the training split which columns are too sparse, then remove the
# same columns from the test split.
x_train, droped_columns = more_than_seventy_percent_of_nan(x_train)
# drop() without inplace returns the new frame; with inplace=True it returns
# None, which would have overwritten x_test. An empty list is a no-op.
x_test = x_test.drop(columns=droped_columns)

ვარიაციის ზღვარი შემოვიტანოთ. თუ ვარიაციის ზღვარი არის 0, მაშინ გადაიყრება ისეთი სვეტები, სადაც ყველა მნიშვნელობა არის ერთი და იგივე

In [101]:
from sklearn.feature_selection import VarianceThreshold

In [102]:
def variance(df, threshold=0):
    """Drop numeric columns whose variance does not exceed ``threshold``.

    With the default ``threshold=0`` this removes columns that hold a single
    constant value. Only numeric and boolean columns are examined; all other
    columns are kept untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0
        Passed to ``VarianceThreshold``.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame without the low-variance columns, and the dropped labels.
    """
    # Public replacement for the private DataFrame._get_numeric_data().
    numeric = df.select_dtypes(include=['number', 'bool'])
    if numeric.shape[1] == 0:
        # Nothing to test; fitting the selector on zero features would fail.
        return df, []

    var_thres = VarianceThreshold(threshold=threshold)
    var_thres.fit(numeric)
    # get_support() is True for kept features, so invert it.
    constant_columns = numeric.columns[~var_thres.get_support()].tolist()
    df = df.drop(constant_columns, axis=1)
    return df, constant_columns

In [103]:
# Find constant columns on the training split and remove them from both splits.
x_train, constant_columns = variance(x_train)
# drop() without inplace returns the new frame; with inplace=True it returns
# None, which would have overwritten x_test. An empty list is a no-op.
x_test = x_test.drop(columns=constant_columns)

განვსაზღვროთ სვეტების ტიპები

In [104]:
x_train.shape

(39032, 120)

In [105]:
x_test.shape

(9758, 120)

შევავსოთ ცარიელი მნიშვნელობები 

In [106]:
# Fill missing numeric values with the median.
# The medians are learned on the training split only and reused for the test
# split, so both are imputed consistently and no test statistics are used.
numeric_present = [col for col in num_features_ls if col in x_train.columns]
train_medians = x_train[numeric_present].median()

# Explicit copies: the splits come from indexing another frame, and assigning
# into them directly triggers SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[numeric_present] = x_train[numeric_present].fillna(value=train_medians)
x_test[numeric_present] = x_test[numeric_present].fillna(value=train_medians)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [107]:
# x_train[cat_features_ls] = x_train[cat_features_ls].fillna('unknown')
# x_test[cat_features_ls] = x_test[cat_features_ls].fillna('unknown')

In [108]:
def correlation(df, threshold):
    """Drop columns that are highly (positively) correlated with an earlier kept column.

    Columns are scanned in order. A column is dropped when its correlation
    with any earlier column that has not itself been dropped is greater than
    or equal to ``threshold``. Only positive correlation is considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float
        Correlation at or above which the later column is dropped.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame without the redundant columns, and the dropped labels
        (each listed once, in column order).
    """
    corr_matrix = df.corr()
    labels = corr_matrix.columns
    corr_values = corr_matrix.to_numpy()

    droped_columns = []
    droped_lookup = set()  # O(1) membership test for already dropped labels
    for i in range(len(labels)):
        for j in range(i):
            if corr_values[i, j] >= threshold and labels[j] not in droped_lookup:
                droped_columns.append(labels[i])
                droped_lookup.add(labels[i])
                # One match is enough; continuing would only add duplicates.
                break
    df = df.drop(columns=droped_columns)
    return df, droped_columns

In [109]:
# Pick redundant (highly correlated) columns on the training split ...
x_train, droped_columns = correlation(x_train, 0.8)
# ... and remove the same columns from the test split when there are any.
if droped_columns:
    x_test = x_test.drop(columns=droped_columns, axis=1)

In [111]:
x_train.shape

(39032, 103)

In [None]:
def encode(X, numeric_columns=None):
    """Label-encode every column of ``X`` that is not listed as numeric.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    numeric_columns : iterable, optional
        Labels to leave untouched. Defaults to the notebook-level
        ``num_features_ls``. Labels that are not present in ``X`` are ignored.

    Returns
    -------
    pandas.DataFrame
        ``X`` with each non-numeric column replaced by integer codes.
    """
    if numeric_columns is None:
        # Read the notebook-level list. Assigning to a local of the same name
        # would make it a local variable and raise UnboundLocalError.
        numeric_columns = num_features_ls
    skipped = set(numeric_columns)

    for column in [label for label in X.columns if label not in skipped]:
        # A fresh encoder per column, fitted and applied in one step.
        X[column] = LabelEncoder().fit_transform(X[column])

    return X

In [None]:
# Label-encode the columns of df that are not in num_features_ls.
# NOTE(review): this cell has no execution count and df was already one-hot
# encoded with get_dummies earlier - confirm it is still needed.
encode(df)

In [None]:
def get_chars(df):
    """Collect the characters in the column labels that fall outside the allowed set.

    The allowed set is lowercase ``a-z``, digits, comma, space and underscore;
    every other character found in any column label is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected. Non-string labels are
        converted with ``str()`` first.

    Returns
    -------
    set of str
        The distinct disallowed characters (empty when there are none).
    """
    concated_string = ''.join(map(str, df.columns))
    return set(re.findall(r'[^a-z, 0-9,  ,_]', concated_string))

In [112]:
# Transformed data: put features and target back together for each split and
# renumber the rows from 0.
train_data, test_data = (
    pd.concat([features, labels], axis=1).reset_index(drop=True)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)