# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from  sklearn  import  set_config
set_config(display='diagram')

# Custom functions

In [3]:
def save_dataset(nome_file, file):
    """Serialise an object to disk with pickle.

    Parameters
    ----------
    nome_file : str
        Destination path without extension; ``'.pkl'`` is appended to it.
    file : object
        The picklable object to write.

    Returns
    -------
    None
    """
    destination = nome_file + '.pkl'
    with open(destination, mode='wb') as sink:
        pickle.dump(file, sink)
    return None

In [4]:
def dataset_parameters(df,target):
    """Group the non-target columns of ``df`` by the kind of data they hold.

    Columns are classified with ``pandas.api.types`` predicates instead of
    comparing dtypes against name strings: a ``datetime64[ns]`` dtype never
    equals the unit-less string ``'datetime64'``, and numeric widths other
    than 64 bit (``int32``, ``float32``, ...) would otherwise be dropped
    from every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both feature and target columns.
    target : str or list of str
        Column label(s) to exclude from the feature groups.

    Returns
    -------
    tuple
        ``(categorical_features, numerical_features, date_features,
        all_features, target)``. The first four are lists of column names in
        the frame's column order; ``all_features`` is numerical + categorical
        + date; ``target`` is returned unchanged.
    """
    df_features = df.drop(columns=target)
    dtype_checks = pd.api.types

    categorical_features = []
    numerical_features = []
    date_features = []
    for name, dtype in df_features.dtypes.items():
        if dtype_checks.is_datetime64_any_dtype(dtype):
            date_features.append(name)
        elif (
            # bool must be tested before numeric: pandas treats bool as numeric.
            dtype_checks.is_bool_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
            or dtype_checks.is_object_dtype(dtype)
            or dtype_checks.is_string_dtype(dtype)
        ):
            categorical_features.append(name)
        elif dtype_checks.is_numeric_dtype(dtype):
            numerical_features.append(name)
        # Any other dtype (e.g. timedelta) belongs to no group, as before.

    all_features = numerical_features + categorical_features + date_features
    return categorical_features, numerical_features, date_features, all_features, target

In [5]:
def numeric_summary_parameters(df, column_names):
    """Collect minimum, maximum and mean for the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    column_names : iterable of str
        Columns to summarise.

    Returns
    -------
    dict
        Maps each column name to the list ``[min, max, mean]``.
    """
    return {
        name: [df[name].min(), df[name].max(), df[name].mean()]
        for name in column_names
    }

In [6]:
def dataframe_with_null(df):
    """Print the percentage of rows that contain at least one NaN value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. An empty frame is reported as 0.0% instead of
        raising ``ZeroDivisionError``.

    Returns
    -------
    None
        The result is only printed.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # 0 / 0 would raise; a frame without rows has no rows with NaN.
        percentuale = 0.0
    else:
        # Count flagged rows directly instead of materialising a filtered copy.
        rows_with_nan = int(df.isnull().any(axis=1).sum())
        percentuale = round(rows_with_nan / total_rows * 100, 2)
    print('The dataset has {}% of records with at least one NaN value'.format(percentuale))

In [7]:
def unique_values_dataframe(df, categorical_features):
    """Tabulate the distinct values of each requested column side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    categorical_features : iterable of str
        Columns whose distinct values are listed.

    Returns
    -------
    pandas.DataFrame
        One column per requested feature holding its distinct values in
        order of first appearance; shorter columns are padded with missing
        values.
    """
    distinct_by_column = {
        name: df[name].unique().tolist() for name in categorical_features
    }
    # Build row-wise and flip, so lists of unequal length are padded.
    return pd.DataFrame.from_dict(distinct_by_column, orient='index').transpose()

# Import dataset and visualize properties

In [8]:
# Load the raw dataset from 'reuters.csv' (path is relative to the working directory).
df = pd.read_csv('reuters.csv')

In [9]:
df.sample(10, random_state=13)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature241,feature242,feature243,label1,label2,label3,label4,label5,label6,label7
681,1.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,False,True,False,False,False,False,False
1601,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,True,False,False,False,False,False,False
819,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,False,False,True,False,False,True,False
910,1.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,11.0,...,0.0,0.0,0.0,False,False,False,True,False,False,False
330,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,False,True,False,False,False,False,False
1886,3.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,False,False,False,False,False,True,False
364,1.0,1.0,2.0,2.0,8.0,8.0,0.0,0.0,4.0,4.0,...,0.0,0.0,0.0,True,False,False,False,False,False,False
1603,1.0,1.0,1.0,1.0,6.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,True,False,False,False,False,False,False
1909,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,0.0,0.0,...,0.0,0.0,1.0,True,False,False,False,False,False,False
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,True,False,False,False,False,False,False


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 250 entries, feature1 to label7
dtypes: bool(7), float64(243)
memory usage: 3.7 MB


# Remove unnecessary records

In [11]:
# Names of the seven target columns; they are dropped from the frame before
# the remaining columns are grouped as features by dataset_parameters.
labels = ['label1', 'label2', 'label3', 'label4', 'label5', 'label6', 'label7']

In [12]:
# Group the feature columns by dtype; `target` is the `labels` list handed back unchanged.
categorical_features, numerical_features, date_features, all_features, target = dataset_parameters(df,labels)

In [13]:
dataframe_with_null(df)

The dataset has 0.0% of records with at least one NaN value


In [14]:
df[df.isnull().any(axis=1)]

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature241,feature242,feature243,label1,label2,label3,label4,label5,label6,label7


In [15]:
# df = df.dropna()  # not needed: no record contains NaN values (see the check above)

In [16]:
unique_values_dataframe(df, categorical_features).fillna('')

In [17]:
df.describe()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature234,feature235,feature236,feature237,feature238,feature239,feature240,feature241,feature242,feature243
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1.03,0.511,0.625,0.6935,1.0025,0.891,1.0655,0.809,1.2965,1.239,...,0.0725,0.119,0.077,0.072,0.075,0.107,0.0475,0.222,0.0795,0.097
std,0.572943,0.506959,0.592917,0.72029,1.726553,1.712479,2.467439,1.612395,2.611321,2.039598,...,0.339561,0.620509,0.395157,0.354794,0.370734,0.599776,0.255492,1.090555,0.428102,0.45243
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,3.0,3.0,7.0,14.0,14.0,28.0,15.0,43.0,17.0,...,5.0,9.0,9.0,6.0,4.0,15.0,3.0,11.0,5.0,6.0


In [18]:
target

['label1', 'label2', 'label3', 'label4', 'label5', 'label6', 'label7']

# Save DataFrame

In [19]:
# Persist the DataFrame as loaded (labels still boolean) to 'reuters_df.pkl'.
save_dataset('reuters_df', df)

# Transform categorical and numerical features

In [20]:
# Single step named 'scale': apply StandardScaler to every numerical feature.
transformers = [('scale', StandardScaler(), numerical_features)]

In [21]:
# Columns not named in `transformers` are dropped from the output; this is the
# ColumnTransformer default, spelled out here for the reader.
ct = ColumnTransformer(transformers=transformers, remainder='drop')

In [22]:
ct

In [23]:
# Fit the column transformer on the whole DataFrame and keep the transformed feature matrix.
df_transformed = ct.fit_transform(df)

In [24]:
df_transformed

array([[-1.79818392, -1.008223  , -1.05437376, ..., -0.20361695,
        -0.18574982, -0.2144512 ],
       [-0.05237429, -1.008223  , -1.05437376, ..., -0.20361695,
        -0.18574982, -0.2144512 ],
       [-0.05237429, -1.008223  , -1.05437376, ..., -0.20361695,
        -0.18574982,  4.20722294],
       ...,
       [-0.05237429,  0.96481614,  0.63262425, ..., -0.20361695,
        -0.18574982, -0.2144512 ],
       [-0.05237429, -1.008223  ,  0.63262425, ..., -0.20361695,
        -0.18574982, -0.2144512 ],
       [-0.05237429,  0.96481614,  0.63262425, ..., -0.20361695,
        -0.18574982, -0.2144512 ]])

# Encode target labels as integers

In [25]:
df[target]

Unnamed: 0,label1,label2,label3,label4,label5,label6,label7
0,False,False,False,True,False,False,False
1,False,False,False,True,False,False,True
2,False,True,False,False,False,False,False
3,False,False,True,False,False,False,False
4,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...
1995,False,False,False,False,True,False,False
1996,False,True,False,False,False,False,False
1997,True,False,False,False,False,False,False
1998,False,False,False,True,False,False,False


In [26]:
# One encoder instance, shared by every target column below and read again when
# the class dictionary is built at the end of the notebook.
le = LabelEncoder()

In [27]:
# Fit the encoder once on both boolean classes so every label column gets the
# same mapping (False -> 0, True -> 1). Refitting per column makes the mapping
# depend on the values observed in that column: an all-True column would be
# encoded as 0, and `le` would keep only the classes of the last column.
le.fit([False, True])
for column in target:
    df[column] = le.transform(df[column])

In [28]:
df[target]

Unnamed: 0,label1,label2,label3,label4,label5,label6,label7
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,1
2,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...
1995,0,0,0,0,1,0,0
1996,0,1,0,0,0,0,0
1997,1,0,0,0,0,0,0
1998,0,0,0,1,0,0,0


# Save X and y dataset

In [29]:
# Pair the scaled feature matrix (X) with the encoded label matrix (y).
dataset_X_y = (df_transformed, df[target].values)

In [30]:
# Persist the (X, y) tuple to 'reuters_dataset.pkl'.
save_dataset('reuters_dataset', dataset_X_y)

In [31]:
dataset_X_y

(array([[-1.79818392, -1.008223  , -1.05437376, ..., -0.20361695,
         -0.18574982, -0.2144512 ],
        [-0.05237429, -1.008223  , -1.05437376, ..., -0.20361695,
         -0.18574982, -0.2144512 ],
        [-0.05237429, -1.008223  , -1.05437376, ..., -0.20361695,
         -0.18574982,  4.20722294],
        ...,
        [-0.05237429,  0.96481614,  0.63262425, ..., -0.20361695,
         -0.18574982, -0.2144512 ],
        [-0.05237429, -1.008223  ,  0.63262425, ..., -0.20361695,
         -0.18574982, -0.2144512 ],
        [-0.05237429,  0.96481614,  0.63262425, ..., -0.20361695,
         -0.18574982, -0.2144512 ]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]]))

# Save processed DataFrame

In [32]:
# Reuse the matrix produced by the earlier fit instead of refitting the
# transformer, and carry over df's index so the later column-wise concat with
# df[target] lines up row for row even if rows are ever dropped from df.
df_processed = pd.DataFrame(
    df_transformed,
    columns=ct.get_feature_names_out(),
    index=df.index,
)

In [33]:
# Strip the '<transformer>__' prefix. Splitting only on the first '__' keeps
# feature names that themselves contain '__' intact, and taking the last piece
# leaves already-stripped names untouched if this cell is run again.
df_processed.columns = [col.split('__', 1)[-1] for col in df_processed.columns]

In [34]:
# Append the encoded label columns. Any label columns already present are
# dropped first, so re-running this cell does not duplicate them.
df_processed = pd.concat(
    [df_processed.drop(columns=target, errors='ignore'), df[target]],
    axis=1,
)

In [35]:
# Persist the scaled features plus encoded labels to 'reuters_df_processed.pkl'.
save_dataset('reuters_df_processed', df_processed)

In [36]:
df_processed.sample(10)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature241,feature242,feature243,label1,label2,label3,label4,label5,label6,label7
1805,-0.052374,-1.008223,0.632624,8.757695,2.315886,0.647761,0.378828,0.738837,-0.496616,0.373206,...,-0.203617,-0.18575,-0.214451,0,1,0,0,0,0,0
1993,-0.052374,0.964816,2.319622,1.814307,-0.001448,0.063666,0.378828,0.738837,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,1,0,0,0,0,0
346,-1.798184,-1.008223,-1.054374,-0.963048,-0.580782,-0.520428,-0.431932,-0.501863,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,0,0,0,1,0,0
965,-0.052374,-1.008223,-1.054374,-0.963048,-0.580782,-0.520428,-0.431932,-0.501863,-0.113572,-0.607625,...,-0.203617,-0.18575,-0.214451,0,0,0,0,0,1,0
656,-0.052374,-1.008223,-1.054374,0.42563,-0.580782,-0.520428,-0.431932,-0.501863,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,0,0,0,0,1,0
1368,-0.052374,-1.008223,-1.054374,0.42563,1.157219,1.231856,-0.431932,-0.501863,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,1,0,0,0,0,0
747,-0.052374,-1.008223,-1.054374,-0.963048,-0.580782,0.063666,-0.431932,-0.501863,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,0,1,0,0,0,0
313,-0.052374,0.964816,0.632624,0.42563,-0.580782,-0.520428,-0.431932,-0.501863,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,1,0,0,0,0,0
1021,-0.052374,0.964816,2.319622,1.814307,-0.580782,-0.520428,-0.026552,0.118487,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,0,1,0,0,0,0,0
839,-0.052374,0.964816,0.632624,0.42563,0.577885,0.647761,0.378828,0.738837,-0.496616,-0.607625,...,-0.203617,-0.18575,-0.214451,1,0,0,0,0,0,0


# Save target classes dictionary

In [37]:
# Map encoded integer -> original class value. `le` reflects whatever classes it
# was last fitted with in the label-encoding cell above, so this single mapping
# is assumed to hold for every label column. NOTE(review): confirm all label
# columns share the same classes.
class_dict = dict(zip(le.transform(le.classes_), le.classes_))

In [38]:
# Persist the code -> class mapping to 'reuters_labels_classes.pkl'.
save_dataset('reuters_labels_classes', class_dict)

In [39]:
class_dict

{0: False, 1: True}