In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import check_random_state
from collections import Counter
from scipy.sparse import hstack,vstack
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder

# import warnings
# warnings.filterwarnings('ignore')

In [2]:
jdta = pd.read_csv('./Data/jdata_train.csv',encoding='UTF-8',parse_dates=['time','dt','user_reg_dt'])

In [3]:
jdta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467969 entries, 0 to 467968
Data columns (total 26 columns):
sku_id              467969 non-null int64
attr1               467969 non-null float64
attr2               467969 non-null float64
attr3               467969 non-null float64
cate                467969 non-null int64
brand               467969 non-null int64
dt                  467969 non-null datetime64[ns]
comment_num         467969 non-null float64
has_bad_comment     467969 non-null float64
bad_comment_rate    467969 non-null float64
user_id             467969 non-null int64
type_1              467969 non-null float64
type_2              467969 non-null float64
type_3              467969 non-null float64
type_4              467969 non-null float64
type_5              467969 non-null float64
type_6              467969 non-null float64
time                467969 non-null datetime64[ns]
model_id            467969 non-null float64
day                 467969 non-null object
age 

In [4]:
jdta['duration'].head()

0    11.0
1    11.0
2    12.0
3    14.0
4    11.0
Name: duration, dtype: float64

In [5]:
def label(data):
    le=LabelEncoder()
    for col in data.columns.values:
        # Encoding only categorical variables
        if data[col].dtypes=='object':
        # Using whole data to form an exhaustive list of levels
            le.fit(data[col].values)
            data[col]=le.transform(data[col])

In [6]:
label(jdta)

In [8]:
jdta.columns

Index(['sku_id', 'attr1', 'attr2', 'attr3', 'cate', 'brand', 'dt',
       'comment_num', 'has_bad_comment', 'bad_comment_rate', 'user_id',
       'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6', 'time',
       'model_id', 'day', 'age', 'sex', 'user_lv_cd', 'user_reg_dt', 'label',
       'duration'],
      dtype='object')

In [9]:
jdta['label'].value_counts()

0.0    467518
1.0       451
Name: label, dtype: int64

In [20]:
var = ['sku_id', 'attr1', 'attr2', 'attr3', 'cate', 'brand',\
       'comment_num', 'has_bad_comment', 'bad_comment_rate', 'user_id',\
       'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6',\
       'model_id', 'day', 'age', 'sex', 'user_lv_cd', 'label',\
       'duration']

In [21]:
def label(data):
    le=LabelEncoder()
    for col in data.columns.values:
        # Encoding only categorical variables
        if data[col].dtypes=='object':
        # Using whole data to form an exhaustive list of levels
            le.fit(data[col].values)
            data[col]=le.transform(data[col])

In [22]:
label(jdta)

In [23]:
pred_var = var[:-1]

In [24]:
X = jdta[pred_var]
y = jdta['label']
print('Dataset shape {}'.format(Counter(y)))

ros = RandomUnderSampler(random_state=0,ratio=0.1)
X_res, y_res = ros.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))

Dataset shape Counter({0.0: 467518, 1.0: 451})
Resampled dataset shape Counter({0.0: 4510, 1.0: 451})


In [25]:
y_res = y_res[:,None]

In [26]:
jdta_balanced = pd.DataFrame(np.concatenate([X_res,y_res],axis=1))
jdta_balanced.columns = var

In [27]:
jdta_balanced.head()

Unnamed: 0,sku_id,attr1,attr2,attr3,cate,brand,comment_num,has_bad_comment,bad_comment_rate,user_id,...,type_4,type_5,type_6,model_id,day,age,sex,user_lv_cd,label,duration
0,124507.0,3.0,1.0,2.0,8.0,214.0,3.0,1.0,0.15,36977.0,...,1.0,1.0,44.0,94.0,8.0,3.0,1.0,5.0,1.0,1.0
1,146704.0,1.0,1.0,1.0,8.0,800.0,2.0,1.0,0.1111,35914.0,...,1.0,1.0,9.0,22.0,18.0,3.0,0.0,5.0,1.0,1.0
2,146704.0,1.0,1.0,1.0,8.0,800.0,2.0,1.0,0.1111,78975.0,...,1.0,0.0,27.0,90.0,4.0,3.0,0.0,4.0,1.0,1.0
3,24771.0,-1.0,-1.0,1.0,8.0,857.0,4.0,1.0,0.0906,83152.0,...,1.0,0.0,8.0,34.0,14.0,3.0,2.0,3.0,1.0,1.0
4,28250.0,1.0,2.0,2.0,8.0,677.0,3.0,1.0,0.087,24842.0,...,1.0,0.0,6.0,17.0,14.0,3.0,0.0,5.0,1.0,1.0


In [28]:
jdta_balanced['label'].value_counts()

0.0    4510
1.0     451
Name: label, dtype: int64

In [29]:
jdta_balanced.head()

Unnamed: 0,sku_id,attr1,attr2,attr3,cate,brand,comment_num,has_bad_comment,bad_comment_rate,user_id,...,type_4,type_5,type_6,model_id,day,age,sex,user_lv_cd,label,duration
0,124507.0,3.0,1.0,2.0,8.0,214.0,3.0,1.0,0.15,36977.0,...,1.0,1.0,44.0,94.0,8.0,3.0,1.0,5.0,1.0,1.0
1,146704.0,1.0,1.0,1.0,8.0,800.0,2.0,1.0,0.1111,35914.0,...,1.0,1.0,9.0,22.0,18.0,3.0,0.0,5.0,1.0,1.0
2,146704.0,1.0,1.0,1.0,8.0,800.0,2.0,1.0,0.1111,78975.0,...,1.0,0.0,27.0,90.0,4.0,3.0,0.0,4.0,1.0,1.0
3,24771.0,-1.0,-1.0,1.0,8.0,857.0,4.0,1.0,0.0906,83152.0,...,1.0,0.0,8.0,34.0,14.0,3.0,2.0,3.0,1.0,1.0
4,28250.0,1.0,2.0,2.0,8.0,677.0,3.0,1.0,0.087,24842.0,...,1.0,0.0,6.0,17.0,14.0,3.0,0.0,5.0,1.0,1.0


In [31]:
jdta_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4961 entries, 0 to 4960
Data columns (total 23 columns):
sku_id              4961 non-null float64
attr1               4961 non-null float64
attr2               4961 non-null float64
attr3               4961 non-null float64
cate                4961 non-null float64
brand               4961 non-null float64
comment_num         4961 non-null float64
has_bad_comment     4961 non-null float64
bad_comment_rate    4961 non-null float64
user_id             4961 non-null float64
type_1              4961 non-null float64
type_2              4961 non-null float64
type_3              4961 non-null float64
type_4              4961 non-null float64
type_5              4961 non-null float64
type_6              4961 non-null float64
model_id            4961 non-null float64
day                 4961 non-null float64
age                 4961 non-null float64
sex                 4961 non-null float64
user_lv_cd          4961 non-null float64
label      

In [30]:
jdta_balanced.to_csv('./Data/train_balanced.csv',index=False,na_rep=-1)