# Sampling

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

In [39]:
from imblearn.under_sampling import RandomUnderSampler

In [40]:
from sklearn.datasets import make_classification

In [41]:
# Load cleanDummy.csv into a DataFrame; the path is relative to the notebook's working directory.
cleanDummy = pd.read_csv('../Data/cleanDummy.csv')

In [5]:
cleanDummy.head()

Unnamed: 0.1,Unnamed: 0,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,genderR,ownsCarR,ownsRealtyR,eduLvlR,incomeTypeR,housingTypeR,famStatusR
0,0,427500.0,2.0,33.0,12.435574,1,1,1,1,1,0,0,0,0
1,31,112500.0,2.0,59.0,3.104787,1,1,1,1,1,1,0,1,1
2,61,270000.0,1.0,53.0,8.353354,1,1,0,0,1,1,1,1,2
3,137,283500.0,1.0,62.0,0.0,0,0,0,0,1,0,2,1,3
4,188,270000.0,2.0,47.0,2.10545,1,0,1,1,1,0,0,1,1


In [6]:
# Drop the leftover 'Unnamed: 0' column (presumably a row index saved by an
# earlier to_csv call -- TODO confirm against the notebook that wrote the file).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
cleanDummy = cleanDummy.drop(columns='Unnamed: 0', errors='ignore')

In [44]:
cleanDummy.head()

Unnamed: 0.1,Unnamed: 0,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,genderR,ownsCarR,ownsRealtyR,eduLvlR,incomeTypeR,housingTypeR,famStatusR
0,0,427500.0,2.0,33.0,12.435574,1,1,1,1,1,0,0,0,0
1,31,112500.0,2.0,59.0,3.104787,1,1,1,1,1,1,0,1,1
2,61,270000.0,1.0,53.0,8.353354,1,1,0,0,1,1,1,1,2
3,137,283500.0,1.0,62.0,0.0,0,0,0,0,1,0,2,1,3
4,188,270000.0,2.0,47.0,2.10545,1,0,1,1,1,0,0,1,1


In [8]:
cleanDummy['genderR'].value_counts()  # 0=F, 1=M

0    8561
1    4750
Name: genderR, dtype: int64

In [25]:
# Goal: undersample the female rows (genderR == 0), which outnumber the male rows.
# To let the sampler balance on gender, genderR plays the role of the label:
# y is the genderR column alone, X is every remaining column.
y = cleanDummy['genderR']
X = cleanDummy.drop(columns='genderR')


In [54]:
y

0        1
1        1
2        0
3        0
4        1
        ..
13306    0
13307    1
13308    0
13309    0
13310    1
Name: genderR, Length: 13311, dtype: int64

In [26]:
# Shrink the majority class down to the size of the minority class.
# For under-samplers, sampling_strategy='auto' resamples every class except the
# minority one. random_state is pinned so the same rows are drawn on every
# Restart & Run All; without it the saved balanced dataset changes between runs.
sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)


In [27]:
# Run the undersampler: X_res is the resampled feature data, y_res the matching genderR labels.
X_res, y_res = sampler.fit_resample(X, y)


In [55]:
sampler.fit_resample?

Signature: sampler.fit_resample(X, y)
Docstring:
Resample the dataset.

Parameters
----------
X : {array-like, dataframe, sparse matrix} of shape                 (n_samples, n_features)
    Matrix containing the data which have to be sampled.

y : array-like of shape (n_samples,)
    Corresponding label for each sample in X.

Returns
-------
X_resampled : {array-like, dataframe, sparse matrix} of shape                 (n_samples_new, n_features)
    The array containing the resampled data.

y_resampled : array-like of shape (n_samples_new,)
    The corresponding label of `X_resampled`.
File:      c:\python3\lib\site-packages\imblearn\base.py
Type:      method


In [28]:
# Report how many rows of each gender class remain after resampling.
# The Counter is wrapped in a 1-tuple so %-formatting always sees exactly one argument.
print('Resampled dataset shape %s' % (Counter(y_res),))

Resampled dataset shape Counter({0: 4750, 1: 4750})


In [29]:
# Keep the per-class tally of the resampled labels in a variable and print it.
counter = Counter(y_res)
print(counter)

Counter({0: 4750, 1: 4750})


In [50]:
X_res, y_res

(      totalIncome  famSize  ageYrs    yrsEmpl  UNEMPLOYED  ApprStatus  \
 0        238500.0      2.0    47.0  13.599184           1           1   
 1        135000.0      3.0    41.0   4.799551           1           0   
 2        157500.0      2.0    40.0  12.506759           1           0   
 3        225000.0      2.0    24.0   0.257363           1           1   
 4         67500.0      2.0    37.0   3.263585           1           1   
 ...           ...      ...     ...        ...         ...         ...   
 9495     112500.0      1.0    26.0   3.266323           1           0   
 9496     157500.0      4.0    40.0  12.265823           1           1   
 9497     112500.0      1.0    28.0   4.517547           1           0   
 9498     202500.0      2.0    30.0   3.600348           1           0   
 9499     202500.0      2.0    54.0   5.930307           1           1   
 
       ownsCarR  ownsRealtyR  eduLvlR  incomeTypeR  housingTypeR  famStatusR  
 0            0            1   

In [51]:
# NOTE: the comma makes this a 2-tuple (X_res, y_res), not a DataFrame.
balancedMF = X_res, y_res

In [61]:
type(X_res)


pandas.core.frame.DataFrame

In [63]:
# Promote the resampled genderR Series to a one-column DataFrame.
genderRcol = y_res.to_frame()

In [64]:
type(genderRcol)

pandas.core.frame.DataFrame

In [67]:
# Stitch the resampled features and the genderR column back together side by side.
combinedResample = pd.concat([X_res, genderRcol], axis='columns')

In [68]:
combinedResample

Unnamed: 0,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,ownsCarR,ownsRealtyR,eduLvlR,incomeTypeR,housingTypeR,famStatusR,genderR
0,238500.0,2.0,47.0,13.599184,1,1,0,1,0,0,1,1,0
1,135000.0,3.0,41.0,4.799551,1,0,1,1,1,0,1,1,0
2,157500.0,2.0,40.0,12.506759,1,0,0,1,1,0,1,1,0
3,225000.0,2.0,24.0,0.257363,1,1,0,1,2,1,1,0,0
4,67500.0,2.0,37.0,3.263585,1,1,0,1,1,0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9495,112500.0,1.0,26.0,3.266323,1,0,0,1,1,0,0,2,1
9496,157500.0,4.0,40.0,12.265823,1,1,1,1,1,0,1,1,1
9497,112500.0,1.0,28.0,4.517547,1,0,1,1,1,0,1,2,1
9498,202500.0,2.0,30.0,3.600348,1,0,1,1,1,0,1,0,1


In [57]:
type(balancedMF)

tuple

In [None]:
# TODO: convert the (X_res, y_res) tuple into a DataFrame


In [69]:
# Persist the balanced dataset. index=False keeps the row index out of the file,
# so reading it back with pd.read_csv does not produce a spurious 'Unnamed: 0' column.
combinedResample.to_csv("../Data/balancedMF.csv", index=False)