# Sampling

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import warnings
# Suppress all warnings raised after this point so they do not clutter cell outputs.
# NOTE(review): this also hides deprecation notices from pandas / imblearn.
warnings.filterwarnings('ignore')

In [2]:
from imblearn.under_sampling import RandomUnderSampler

In [3]:
from sklearn.datasets import make_classification

In [4]:
# Load cleanDummy.csv from the project's Data folder into a DataFrame.
cleanDummy = pd.read_csv(filepath_or_buffer='../Data/cleanDummy.csv')

In [5]:
# Preview the first five rows of the freshly loaded table.
cleanDummy.head(n=5)

Unnamed: 0.1,Unnamed: 0,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,genderR,ownsCarR,ownsRealtyR,eduLvlR,incomeTypeR,housingTypeR,famStatusR
0,0,427500,2,33,12,1,1,1,1,1,0,0,0,0
1,31,112500,2,59,3,1,1,1,1,1,1,0,1,1
2,61,270000,1,53,8,1,1,0,0,1,1,1,1,2
3,137,283500,1,62,0,0,0,0,0,1,0,2,1,3
4,188,270000,2,47,2,1,0,1,1,1,0,0,1,1


In [6]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call -- TODO confirm).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
cleanDummy = cleanDummy.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Preview the first five rows again to confirm the column was removed.
cleanDummy.head(n=5)

Unnamed: 0,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,genderR,ownsCarR,ownsRealtyR,eduLvlR,incomeTypeR,housingTypeR,famStatusR
0,427500,2,33,12,1,1,1,1,1,0,0,0,0
1,112500,2,59,3,1,1,1,1,1,1,0,1,1
2,270000,1,53,8,1,1,0,0,1,1,1,1,2
3,283500,1,62,0,0,0,0,0,1,0,2,1,3
4,270000,2,47,2,1,0,1,1,1,0,0,1,1


In [8]:
# Row count per gender code (0 = F, 1 = M) before any resampling.
cleanDummy.loc[:, 'genderR'].value_counts()

0    8556
1    4750
Name: genderR, dtype: int64

In [9]:
# Prepare the inputs for undersampling the female rows.
# y is the genderR column (the class label to balance on);
# X is every other column of cleanDummy.
y = cleanDummy['genderR']
X = cleanDummy.drop(columns='genderR')


In [10]:
# Display the label series (the genderR column selected above).
y

0        1
1        1
2        0
3        0
4        1
        ..
13301    0
13302    1
13303    0
13304    0
13305    1
Name: genderR, Length: 13306, dtype: int64

In [11]:
# Undersample the majority class down to the size of the minority class
# ('auto' resamples every class except the minority one).
# A fixed random_state makes the random row selection reproducible, so a
# Restart & Run All regenerates exactly the same balanced dataset.
sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)


In [12]:
# Run the undersampler; it returns the resampled features and labels as a pair.
X_res, y_res = sampler.fit_resample(X=X, y=y)


In [13]:
# Show the fit_resample documentation. help() is plain Python, so this cell
# also runs outside IPython, where the trailing-? syntax is a SyntaxError.
help(sampler.fit_resample)

[1;31mSignature:[0m [0msampler[0m[1;33m.[0m[0mfit_resample[0m[1;33m([0m[0mX[0m[1;33m,[0m [0my[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Resample the dataset.

Parameters
----------
X : {array-like, dataframe, sparse matrix} of shape                 (n_samples, n_features)
    Matrix containing the data which have to be sampled.

y : array-like of shape (n_samples,)
    Corresponding label for each sample in X.

Returns
-------
X_resampled : {array-like, dataframe, sparse matrix} of shape                 (n_samples_new, n_features)
    The array containing the resampled data.

y_resampled : array-like of shape (n_samples_new,)
    The corresponding label of `X_resampled`.
[1;31mFile:[0m      c:\python3\lib\site-packages\imblearn\base.py
[1;31mType:[0m      method


In [14]:
# Report how many rows each gender code has after resampling.
# The Counter is wrapped in a 1-tuple so %-formatting treats it as a single value.
print('Resampled dataset shape %s' % (Counter(y_res),))

Resampled dataset shape Counter({0: 4750, 1: 4750})


In [15]:
# Tally the resampled labels and print the per-class totals.
counter = Counter()
counter.update(y_res)
print(counter)

Counter({0: 4750, 1: 4750})


In [16]:
# Display the resampled features and labels together as a tuple.
X_res, y_res

(      totalIncome  famSize  ageYrs  yrsEmpl  UNEMPLOYED  ApprStatus  ownsCarR  \
 0          225000        2      40        7           1           1         1   
 1          270000        2      41        1           1           1         1   
 2          360000        2      36        6           1           1         1   
 3          225000        3      37       10           1           1         0   
 4          135000        2      32        2           1           1         1   
 ...           ...      ...     ...      ...         ...         ...       ...   
 9495       112500        1      26        3           1           0         0   
 9496       157500        4      40       12           1           1         1   
 9497       112500        1      28        4           1           0         1   
 9498       202500        2      30        3           1           0         1   
 9499       202500        2      54        5           1           1         1   
 
       ownsRea

In [17]:
# Keep the resampled features and labels together as a (features, labels) tuple.
balancedMF = (X_res, y_res)

In [18]:
# Check the Python type of the resampled feature table.
type(X_res)


pandas.core.frame.DataFrame

In [19]:
# Turn the resampled label Series into a one-column DataFrame so it can be
# concatenated with the feature table.
genderRcol = y_res.to_frame()

In [20]:
# Check the Python type of the converted label column.
type(genderRcol)

pandas.core.frame.DataFrame

In [21]:
# Re-attach the label column to the right of the resampled features
# (column-wise concatenation aligned on the row index).
combinedResample = pd.concat(objs=[X_res, genderRcol], axis='columns')

In [22]:
# Display the recombined, balanced table.
combinedResample

Unnamed: 0,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,ownsCarR,ownsRealtyR,eduLvlR,incomeTypeR,housingTypeR,famStatusR,genderR
0,225000,2,40,7,1,1,1,1,1,1,1,1,0
1,270000,2,41,1,1,1,1,0,1,1,1,1,0
2,360000,2,36,6,1,1,1,0,0,0,1,1,0
3,225000,3,37,10,1,1,0,1,1,0,1,1,0
4,135000,2,32,2,1,1,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9495,112500,1,26,3,1,0,0,1,1,0,0,2,1
9496,157500,4,40,12,1,1,1,1,1,0,1,1,1
9497,112500,1,28,4,1,0,1,1,1,0,1,2,1
9498,202500,2,30,3,1,0,1,1,1,0,1,0,1


In [23]:
# Check the Python type of balancedMF.
type(balancedMF)

tuple

In [25]:
# Persist the balanced table. With the default index=True the row index is
# written as an unnamed first column, which reads back as 'Unnamed: 0'.
combinedResample.to_csv(path_or_buf="../Data/balancedMF.csv")

In [26]:
# Second pass: balance the dataset that includes the occupation type, for
# comparison analysis in Tableau.
mergedCat = pd.read_csv(filepath_or_buffer='../Data/mergedOneRecord.csv')

In [27]:
# Preview the first five rows of the categorical table.
mergedCat.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus
0,0,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1
1,31,M,Y,Y,0,112500,Working,Secondary / secondary special,Married,House / apartment,Security staff,2,59,3,1,1
2,61,F,N,Y,0,270000,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,1,53,8,1,1
3,137,F,N,Y,0,283500,Pensioner,Higher education,Separated,House / apartment,Not identified,1,62,0,0,0
4,188,M,Y,Y,0,270000,Working,Higher education,Married,House / apartment,Accountants,2,47,2,1,0


In [28]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call -- TODO confirm).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
mergedCat = mergedCat.drop(columns='Unnamed: 0', errors='ignore')

In [29]:
# Prepare the inputs for undersampling the female rows of the categorical table.
# y is the 'gender' column (the class label to balance on);
# X is every other column of mergedCat.
# Note: this rebinds the X and y names used for cleanDummy earlier in the notebook.
y = mergedCat['gender']
X = mergedCat.drop(columns='gender')

In [30]:
# Undersample the majority class down to the size of the minority class
# ('auto' resamples every class except the minority one).
# A fixed random_state makes the random row selection reproducible, so a
# Restart & Run All regenerates exactly the same balanced dataset.
sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

In [31]:
# Run the undersampler on the categorical table; it returns the resampled
# features and labels as a pair.
X_res, y_res = sampler.fit_resample(X=X, y=y)


In [32]:
# Tally the resampled gender labels and print the per-class totals.
counter = Counter()
counter.update(y_res)
print(counter)

Counter({'F': 4750, 'M': 4750})


In [33]:
# Turn the resampled label Series into a one-column DataFrame so it can be
# concatenated with the feature table.
genderCol = y_res.to_frame()

In [34]:
# Check the Python type of the converted label column.
type(genderCol)

pandas.core.frame.DataFrame

In [35]:
# Re-attach the gender column to the right of the resampled features
# (column-wise concatenation aligned on the row index).
ResampleCategorical = pd.concat(objs=[X_res, genderCol], axis='columns')

In [36]:
# Preview the first five rows of the balanced categorical table.
ResampleCategorical.head(n=5)

Unnamed: 0,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,gender
0,Y,Y,0,211500,Working,Secondary / secondary special,Married,House / apartment,High skill tech staff,2,43,14,1,0,F
1,N,Y,0,94500,Pensioner,Secondary / secondary special,Separated,House / apartment,Not identified,1,58,0,0,0,F
2,N,Y,0,225000,Working,Secondary / secondary special,Single / not married,House / apartment,Not identified,1,58,13,1,1,F
3,Y,Y,0,315000,Commercial associate,Incomplete higher,Single / not married,House / apartment,Core staff,1,25,3,1,0,F
4,N,N,0,90000,Pensioner,Secondary / secondary special,Widow,House / apartment,Not identified,1,60,0,0,1,F


In [39]:
# List the distinct occupation labels that remain after resampling.
ResampleCategorical.loc[:, 'occupation'].unique()

array(['High skill tech staff', 'Not identified', 'Core staff',
       'Sales staff', 'Laborers', 'Cooking staff', 'Accountants',
       'Managers', 'Secretaries', 'Drivers', 'Private service staff',
       'Cleaning staff', 'Medicine staff', 'Realty agents', 'HR staff',
       'Security staff', 'Low-skill Laborers', 'Waiters/barmen staff',
       'IT staff'], dtype=object)

In [37]:
# Persist the balanced categorical table. With the default index=True the row
# index is written as an unnamed first column, which reads back as 'Unnamed: 0'.
ResampleCategorical.to_csv(path_or_buf="../Data/ResampledCategorical.csv")