In [1]:
import json

import arff
import pandas as pd

In [2]:
# Location of the raw ARFF file and the number of leading rows kept.
file_path = './files/dataset.arff'
max_rows = 29000

# Parse the ARFF file, then build the DataFrame from the first `max_rows`
# records only (no column names or dtypes are applied at this point).
# NOTE(review): the recorded run of this cell ended in a MemoryError while
# allocating the frame -- confirm the row cap fits the available RAM.
with open(file_path, 'r') as f:
    data = arff.load(f)
records = data['data'][:max_rows]
df = pd.DataFrame(records)

MemoryError: Unable to allocate 2.90 GiB for an array with shape (13414, 29000) and data type int64

In [None]:
# Read the column-name -> ARFF-type mapping. The file uses single quotes,
# so they are swapped for double quotes before the text is parsed as JSON.
with open("./files/dtype_using_linux", 'r') as f:
    cols_with_types = json.loads(f.read().replace("'", '"'))

# Translate the ARFF type names into dtype names understood by pandas.
# Any type not listed here is left exactly as it was read.
type_aliases = (
    ("INTEGER", "int"),
    ("REAL", "float"),
    ("STRING", "str"),
)
for col, dtype in cols_with_types.items():
    for arff_name, pandas_name in type_aliases:
        if dtype == arff_name:
            cols_with_types[col] = pandas_name
            break

# The target column is always stored as an integer.
cols_with_types['upselling'] = "int"

cols = list(cols_with_types)
dtypes = dict(cols_with_types)

# Attach the column names to the frame, then cast every column to its dtype.
df.columns = cols
df = df.astype(dtypes)

In [None]:
# Drop every column in which more than 10% of the values are missing.
missing_percentage = (df.isnull().sum() / len(df)) * 100
df = df.drop(columns=missing_percentage[missing_percentage > 10].index)

# Fill the remaining gaps with a constant sentinel: "Inconnu" for object
# (string) columns, -1 for every other dtype.
# The mapping is built from df's own columns: `data` is the object returned
# by arff.load (indexed with ['data'] when the frame was built), not the
# DataFrame, so it has no `.columns` to iterate over.
# One DataFrame.fillna call with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` form, which works on a temporary Series
# and is not guaranteed to write the result back into df.
fill_values = {
    col: "Inconnu" if col_dtype == 'object' else -1
    for col, col_dtype in df.dtypes.items()
}
df = df.fillna(value=fill_values)

# Safety net: remove any row that still contains a missing value.
df = df.dropna()

In [2]:
# Reload the cleaned dataset from CSV (saved without missing-value filling,
# judging by the file name).
clean_csv_path = './files/dataset_clean_without_fill.csv'
df = pd.read_csv(clean_csv_path)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var14732,Var14733,Var14734,Var14735,Var14736,Var14737,Var14738,Var14739,Var14740,upselling
count,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,...,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0,4241.0
mean,0.003773,0.004716,0.043386,0.043386,0.447064,0.016034,0.99452,0.010611,0.016977,0.003773,...,20970280.0,0.006602,0.008489,0.0,0.161283,0.825277,0.012969,0.053761,0.026409,-0.934449
std,0.245689,0.18803,2.591351,2.705341,1.575787,0.252772,11.354106,0.308879,0.234308,0.122801,...,13769290.0,0.162393,0.184093,0.0,0.970519,5.05385,0.254343,1.73285,0.32398,0.356138
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6595230.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23873850.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34977000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
max,16.0,10.0,168.0,176.0,6.0,4.0,362.56,9.0,6.0,4.0,...,39056400.0,4.0,4.0,0.0,6.0,196.0,5.0,69.0,4.0,1.0


In [4]:
# Number of missing values in each column.
df.isna().sum()

Var1         0
Var2         0
Var3         0
Var4         0
Var5         0
            ..
Var14989     0
Var14990     0
Var14993     0
Var14995     0
upselling    0
Length: 14397, dtype: int64

In [6]:
# Build a class-balanced dataset: both upselling classes (1 and -1)
# contribute the same number of rows.
upselling_df = df[df['upselling'] == 1]
no_upselling_df = df[df['upselling'] == -1]

# Undersample to the size of the smaller class. Taking the minimum keeps the
# cell working whichever class is the larger one, and the fixed seed makes
# the selected rows (and therefore the saved CSV) identical on every run.
n_per_class = min(len(upselling_df), len(no_upselling_df))
upselling_df = upselling_df.sample(n=n_per_class, replace=False, random_state=42)
no_upselling_df = no_upselling_df.sample(n=n_per_class, replace=False, random_state=42)

# Stack the two classes and shuffle the rows so they are interleaved.
df = pd.concat([upselling_df, no_upselling_df])
df = df.sample(frac=1, random_state=42)

# Persist the balanced dataset without the index column.
df.to_csv('./files/dataset_full_balanced.csv', index=False)


In [7]:
# Row count per upselling class, to check that the two classes are balanced.
df["upselling"].value_counts()

upselling
 1    139
-1    139
Name: count, dtype: int64

In [8]:
# Peek at the first rows of the upselling column.
df["upselling"].head()

739     1
3841    1
1286   -1
3906   -1
4234   -1
Name: upselling, dtype: int64