In [1]:
import pandas as pd

In [2]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
# NOTE(review): the file name suggests missing values were left unfilled - confirm upstream.
df = pd.read_csv('./files/dataset_clean_without_fill.csv')

In [3]:
# Collect every categorical-like column (object or category dtype).
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# Among those, find the columns whose number of missing values exceeds 99% of
# the row count, then remove them from `df` with a single in-place drop.
missing_counts = df[categorical_columns].isnull().sum()
mostly_missing_columns = missing_counts[missing_counts > 0.99 * len(df)].index
df.drop(columns=mostly_missing_columns, inplace=True)

In [4]:
# Drop every column (of any dtype) that takes fewer than MIN_DISTINCT_VALUES
# distinct values. NaN is counted as a value of its own (dropna=False), which
# matches what `len(Series.unique())` would report.
MIN_DISTINCT_VALUES = 6

# Decide first, drop once: removing columns one at a time in place rebuilds
# the frame for every removed column, which is needlessly slow on a frame
# with thousands of columns.
low_cardinality_columns = [
    col for col in df.columns
    if df[col].nunique(dropna=False) < MIN_DISTINCT_VALUES
]
df.drop(columns=low_cardinality_columns, inplace=True)

In [5]:
# Summarize the frame after the two column filters: index range, column count,
# dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4241 entries, 0 to 4240
Columns: 4139 entries, Var7 to Var14993
dtypes: float64(818), int64(3303), object(18)
memory usage: 133.9+ MB


In [6]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,Var7,Var11,Var13,Var17,Var20,Var21,Var22,Var25,Var27,Var28,...,Var14822,Var14893,Var14903,Var14904,Var14910,Var14913,Var14923,Var14965,Var14990,Var14993
0,0.0,0,0,6,0,47974,319480.0,0,0,1251252.0,...,4bZ_,VicHEoo,_lnU,uUxXC7veal,py8m,JKc4,eUNaPPAA49Enq,StWh,0200,IPyde4c
1,0.0,0,0,10,0,116530,0.0,0,0,0.0,...,yFwt,J7IUB91,_lnU,toseir_,py8m,2e81,ILESZvp,e5M2,X7HG,tQAVcMq
2,0.0,0,0,0,0,0,1745640.0,0,0,1321164.0,...,GMbg,NqRDYUf,_lnU,QMUlWPt,py8m,z0H8,ILESZvp,hgOy,77a1,mnKkkvG
3,0.0,0,0,0,0,0,63192.0,0,0,270450.0,...,WhKH,0eyRkLO,MgxIGSo,Qle7bvR,py8m,mZkv,2PEs,MT6F,ABkU,ZF0bafC
4,0.0,0,0,0,0,0,12200.0,0,0,2592000.0,...,GrEx,_tp5CAA,xfMf,Qle7bvR,py8m,1zA6,ED3wDbnc4Y,DX_r,0200,z1miBkG


In [7]:
def build_dummy_vals(df_arg, nb_dummy_max_arg, ratio_max_arg):
    """Pick, for each object/category column, the labels worth one-hot encoding.

    For every column whose dtype name is 'category' or 'object', the value
    counts of the column are turned into shares of the counted total. Labels
    whose share is strictly greater than ``ratio_max_arg`` are kept, in
    value-count order, and the list is cut to ``nb_dummy_max_arg`` entries.

    Args:
        df_arg: DataFrame to inspect; it is not modified.
        nb_dummy_max_arg: maximum number of labels kept per column.
        ratio_max_arg: share a label must strictly exceed to be kept.

    Returns:
        dict mapping each inspected column name to its list of kept labels
        (possibly empty).
    """
    categorical_names = [
        name
        for name, dtype in df_arg.dtypes.items()
        if dtype.name in ('category', 'object')
    ]

    labels_by_column = {}
    for name in categorical_names:
        counts = df_arg[name].value_counts()
        # Boolean Series indexed by label: True where the label is frequent enough.
        is_frequent = counts / counts.sum() > ratio_max_arg
        frequent_labels = is_frequent[is_frequent].index.tolist()
        labels_by_column[name] = frequent_labels[:nb_dummy_max_arg]
    return labels_by_column

def apply_dummy_vals(df_arg, dummy_dict):
    """Append one-hot indicator columns for the selected labels of each column.

    For every ``column -> labels`` entry of ``dummy_dict``, values of the
    column that are not in ``labels`` are replaced by the literal ``'other'``,
    and the result is one-hot encoded with ``pd.get_dummies`` using the prefix
    ``'dumm_' + column``. The indicator columns are appended to the right of
    the original columns, in ``dummy_dict`` order; the source columns are kept.

    Args:
        df_arg: DataFrame holding the columns named in ``dummy_dict``; it is
            not modified.
        dummy_dict: mapping of column name to the list of labels that get
            their own indicator column.

    Returns:
        A new DataFrame with the indicator columns appended, or ``df_arg``
        itself when ``dummy_dict`` is empty.

    Raises:
        KeyError: if a key of ``dummy_dict`` is not a column of ``df_arg``.
    """
    if not dummy_dict:
        # Nothing to encode: hand back the input object unchanged.
        return df_arg

    # Build all indicator blocks first and concatenate once. Concatenating
    # inside the loop would copy the whole (potentially very wide) frame on
    # every iteration.
    indicator_blocks = []
    for c, dummy_labels in dummy_dict.items():
        tmp_enc = df_arg[c].map(lambda v: v if v in dummy_labels else 'other')
        indicator_blocks.append(pd.get_dummies(tmp_enc, prefix='dumm_' + c))
    return pd.concat([df_arg, *indicator_blocks], axis=1)

In [8]:
# Upper bound on how many labels get their own indicator column, per source column.
nb_dummy_max = 5
# Despite its name, this acts as a lower bound: build_dummy_vals keeps a label
# only when its share of the column's counted values is strictly greater than it.
ratio_max = .01

# First choose the labels to encode for each object/category column of `df`,
# then build a new frame with the corresponding indicator columns appended.
dummy_dict = build_dummy_vals(df, nb_dummy_max, ratio_max)
dummy_df = apply_dummy_vals(df, dummy_dict)

In [9]:
# Summarize the encoded frame; the column count and dtype breakdown show how
# many indicator columns were appended.
dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4241 entries, 0 to 4240
Columns: 4230 entries, Var7 to dumm_Var14993_z1miBkG
dtypes: bool(91), float64(818), int64(3303), object(18)
memory usage: 134.3+ MB


In [10]:
# Persist the encoded frame as CSV; index=False leaves the row index out of the file.
dummy_df.to_csv('./files/dummy_dataset.csv', index=False)