In [3]:
''' Load the DARWIN dataset from the UCI ML repository '''

# Requires the `ucimlrepo` package (pip install ucimlrepo)
from ucimlrepo import fetch_ucirepo

# The DARWIN dataset is fetched by its repository id
darwin = fetch_ucirepo(id=732)

# Features and target(s), both delivered as pandas dataframes
X, y = darwin.data.features, darwin.data.targets

# Dataset-level metadata first, then the per-variable information
print(darwin.metadata, darwin.variables, sep="\n")

{'uci_id': 732, 'name': 'DARWIN', 'repository_url': 'https://archive.ics.uci.edu/dataset/732/darwin', 'data_url': 'https://archive.ics.uci.edu/static/public/732/data.csv', 'abstract': 'The DARWIN dataset includes handwriting data from 174 participants. The classification task consists in distinguishing Alzheimer’s disease patients from healthy people.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 174, 'num_features': 451, 'feature_types': [], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2022, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C55D0K', 'creators': ['Francesco Fontanella'], 'intro_paper': {'title': 'Diagnosing Alzheimer’s disease from on-line handwriting: A novel dataset and performance benchmarking', 'authors': 'Nicole D. Cilia, Giuseppe De Gregorio , Claudio De Stefano, Francesco Fontanella, 

In [47]:
'''
===============================
** HERE BEGINS OUR ANALYSIS **
===============================
'''



In [48]:
''' imports '''
import pandas as pd

In [49]:
'''
—————————————————————————————————
Data exploration & preprocessing
—————————————————————————————————
'''

'\n—————————————————————————————————\nData exploration & preprocessing\n—————————————————————————————————\n'

In [50]:
''' Missing-value audit for the features and the target '''

# Null count for every feature column
missing_values_features = X.isna().sum()
print("Missing values in features:", missing_values_features, sep="\n")

# Number of rows that contain a null in at least one feature
rows_with_missing_values = X.isna().any(axis=1).sum()
print(f"\nNumber of rows with at least one missing value in features: {rows_with_missing_values}")

# Null count for the target (label) column(s)
missing_values_targets = y.isna().sum()
print("\nMissing values in target:", missing_values_targets, sep="\n")

Missing values in features:
ID                  0
air_time1           0
disp_index1         0
gmrt_in_air1        0
gmrt_on_paper1      0
                   ..
num_of_pendown25    0
paper_time25        0
pressure_mean25     0
pressure_var25      0
total_time25        0
Length: 451, dtype: int64

Number of rows with at least one missing value in features: 0

Missing values in target:
class    0
dtype: int64


In [51]:
''' Combine the features (X) and the target (y) into a single pandas DataFrame '''

# pd.concat(axis=1) aligns rows on the index: if X and y were indexed
# differently it would silently pad the result with NaN instead of failing,
# which would invalidate the "no missing values" check done on X and y.
assert X.index.equals(y.index), "X and y must share the same row index"

# Feature columns first, target column(s) last
data_df = pd.concat([X, y], axis=1)

# Last expression of the cell: rendered by the notebook as the cell output
data_df

Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
0,id_1,5160,0.000013,120.804174,86.853334,957,6601,0.361800,0.217459,103.828754,...,0.141434,0.024471,5.596487,3.184589,71,40120,1749.278166,296102.7676,144605,P
1,id_2,51980,0.000016,115.318238,83.448681,1694,6998,0.272513,0.144880,99.383459,...,0.049663,0.018368,1.665973,0.950249,129,126700,1504.768272,278744.2850,298640,P
2,id_3,2600,0.000010,229.933997,172.761858,2333,5802,0.387020,0.181342,201.347928,...,0.178194,0.017174,4.000781,2.392521,74,45480,1431.443492,144411.7055,79025,P
3,id_4,2130,0.000010,369.403342,183.193104,1756,8159,0.556879,0.164502,276.298223,...,0.113905,0.019860,4.206746,1.613522,123,67945,1465.843329,230184.7154,181220,P
4,id_5,2310,0.000007,257.997131,111.275889,987,4732,0.266077,0.145104,184.636510,...,0.121782,0.020872,3.319036,1.680629,92,37285,1841.702561,158290.0255,72575,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,id_170,2930,0.000010,241.736477,176.115957,1839,6439,0.253347,0.174663,208.926217,...,0.119152,0.020909,4.508709,2.233198,96,44545,1798.923336,247448.3108,80335,H
170,id_171,2140,0.000009,274.728964,234.495802,2053,8487,0.225537,0.174920,254.612383,...,0.174495,0.017640,4.685573,2.806888,84,37560,1725.619941,160664.6464,345835,H
171,id_172,3830,0.000008,151.536989,171.104693,1287,7352,0.165480,0.161058,161.320841,...,0.114472,0.017194,3.493815,2.510601,88,51675,1915.573488,128727.1241,83445,H
172,id_173,1760,0.000008,289.518195,196.411138,1674,6946,0.518937,0.202613,242.964666,...,0.114472,0.017194,3.493815,2.510601,88,51675,1915.573488,128727.1241,83445,H


In [52]:
''' Data set exploration: list the column labels of the combined frame '''
print("Column names: " + str(data_df.columns))

Column names: Index(['ID', 'air_time1', 'disp_index1', 'gmrt_in_air1', 'gmrt_on_paper1',
       'max_x_extension1', 'max_y_extension1', 'mean_acc_in_air1',
       'mean_acc_on_paper1', 'mean_gmrt1',
       ...
       'mean_jerk_in_air25', 'mean_jerk_on_paper25', 'mean_speed_in_air25',
       'mean_speed_on_paper25', 'num_of_pendown25', 'paper_time25',
       'pressure_mean25', 'pressure_var25', 'total_time25', 'class'],
      dtype='object', length=452)


In [53]:
'''
    The data set covers 25 tasks, each described by the same 18 features;
    in the dataframe, every feature of task n carries "n" as the suffix of its name.
'''

# Look at the 18 features through one task only (task 25 here):
# keep the column names whose suffix is '25'
columns_ending_with_25 = list(filter(lambda name: name.endswith('25'), data_df.columns))

# For each selected column: dtype, number of distinct values and first rows,
# followed by an empty line as separator
for col in columns_ending_with_25:
    feature = data_df[col]
    print(
        f"Column: {col}",
        f"Type: {feature.dtype}",
        f"Unique values: {feature.nunique()}",
        f"Sample data:\n{feature.head()}",
        "",
        sep="\n",
    )

Column: air_time25
Type: int64
Unique values: 160
Sample data:
0    104485
1    171940
2     33545
3    113275
4     35290
Name: air_time25, dtype: int64

Column: disp_index25
Type: float64
Unique values: 129
Sample data:
0    0.000049
1    0.000070
2    0.000056
3    0.000058
4    0.000043
Name: disp_index25, dtype: float64

Column: gmrt_in_air25
Type: float64
Unique values: 161
Sample data:
0    279.628181
1     86.117902
2    215.379542
3    207.557650
4    167.510556
Name: gmrt_in_air25, dtype: float64

Column: gmrt_on_paper25
Type: float64
Unique values: 161
Sample data:
0    219.829989
1     68.398886
2    171.954494
3    118.573956
4    126.678802
Name: gmrt_on_paper25, dtype: float64

Column: max_x_extension25
Type: int64
Unique values: 159
Sample data:
0    10066
1     7365
2     7688
3     6397
4     4624
Name: max_x_extension25, dtype: int64

Column: max_y_extension25
Type: int64
Unique values: 159
Sample data:
0    13235
1    15282
2    14127
3    14913
4    15532
Name: max

In [54]:
'''
From the reference paper (https://doi.org/10.1016/j.engappai.2022.104822)
- Total Time (TT): Total time spent to perform the entire task.
- Air Time (AT): Time spent to perform in-air movements.
- Paper Time (PT): Time spent to perform on-paper movements.
- Mean Speed on-paper (MSP): Average speed of on-paper movements. Speed is the variation of displacement with respect to time.
- Mean Speed in-air (MSA): Average speed of in-air movements.
- Mean Acceleration on-paper (MAP): Average acceleration of on-paper movements. Acceleration is the variation of speed with respect to time.
- Mean Acceleration in-air (MAA): Average acceleration of in-air movements.
- Mean Jerk on-paper (MJP): Average jerk of on-paper movements. Jerk is the variation of acceleration with respect to time.
- Mean Jerk in-air (MJA): Average jerk of in-air movements.
- Pressure Mean (PM): Average of the pressure levels exerted by the pen tip.
- Pressure Var (PV): Variance of the pressure levels exerted by the pen tip.
- GMRT on-paper (GMRTP): Generalization of the Mean Relative Tremor (MRT). MRT measures the amount of tremor in drawing spirals and meanders. 
- GMRT in-air (GMRTA): Generalization of the Mean Relative Tremor computed on in air movements.
- Mean GMRT (GMRT): Average of GMRTP and GMRTA.
- Pendowns Number (PWN): Counts the total number of pendowns recorded during the execution of the entire task (e.g., a continuous uninterrupted line present a pendowns number equal to 1).
- Max X Extension (XE): Maximum extension recorded along the X axis. The maximum extension of a component along an axis is calculated considering the difference between its farthest/nearest points to the origin on the considered axis.
- Max Y Extension (YE): Maximum extension recorded along the Y axis. Computed in the same way as the XE feature, but taking the Y axis into account.
- Dispersion Index (DI): The Dispersion Index measures how the handwritten trace is ‘‘dispersed’’ on the entire piece of paper; in other words, it measures how much of the sheet is covered.
'''

'\nFrom the reference paper (https://doi.org/10.1016/j.engappai.2022.104822)\n- Total Time (TT): Total time spent to perform the entire task.\n- Air Time (AT): Time spent to perform in-air movements.\n- Paper Time (PT): Time spent to perform on-paper movements.\n- Mean Speed on-paper (MSP): Average speed of on-paper movements. Speed is the variation of displacement with respect to time.\n- Mean Speed in-air (MSA): Average speed of in-air movements.\n- Mean Acceleration on-paper (MAP): Average acceleration of on-paper movements. Acceleration is the variation of speed with respect to time.\n- Mean Acceleration in-air (MAA): Average acceleration of in-air movements.\n- Mean Jerk on-paper (MJP): Average jerk of on-paper movements. Jerk is the variation of acceleration with respect to time.\n- Mean Jerk in-air (MJA): Average jerk of in-air movements.\n- Pressure Mean (PM): Average of the pressure levels exerted by the pen tip.\n- Pressure Var (PV): Variance of the pressure levels exerted by

In [55]:
''' Count the numerical and the categorical features among the task-25 columns '''

# Reminder: the columns ending with 25 are used as a representative sample; any
# other task number between 1 and 25 would do (25 tasks, same attributes each).

# Counters for the two kinds of feature
numeric_count = 0
categorical_count = 0

# Classify each column by its dtype
for col in columns_ending_with_25:
    col_dtype = data_df[col].dtype
    if pd.api.types.is_numeric_dtype(col_dtype):
        numeric_count += 1
    # isinstance(..., pd.CategoricalDtype) replaces pd.api.types.is_categorical_dtype,
    # which is deprecated in pandas 2.x and emits a warning when called.
    # Plain object columns are counted as categorical as well.
    elif isinstance(col_dtype, pd.CategoricalDtype) or col_dtype == 'object':
        categorical_count += 1

# Print the results
print(f"Total numerical features ending with '25': {numeric_count}")
print(f"Total categorical features ending with '25': {categorical_count}")

Total numerical features ending with '25': 18
Total categorical features ending with '25': 0


In [56]:
'''
No categorical feature was found, although the description shown after the
data set import states that disp_index is a categorical variable: let's check.
'''

# 'disp_index1' is looked up in X only; report explicitly if it is not there,
# otherwise print the column (values, length and dtype)
if 'disp_index1' not in X.columns:
    print("\n'disp_index1' column not found in X")
else:
    print("\nAll values in 'disp_index1':", X['disp_index1'], sep="\n")

''' 
we conclude that there is a mistake in the data set description: they wrote that 'disp_index1' is a categorical variable, but it is not 
(they even show the procedure to compute it, in the paper).
Here is the proof that it is actually a float
'''


All values in 'disp_index1':
0      0.000013
1      0.000016
2      0.000010
3      0.000010
4      0.000007
         ...   
169    0.000010
170    0.000009
171    0.000008
172    0.000008
173    0.000008
Name: disp_index1, Length: 174, dtype: float64


" \nwe conclude that there is a mistake in the data set metadata: they wrote that 'disp_index1' is a categorical variable, but it is not \n(they even show the procedure to compute it, in the paper.\nHere is the proof that it is actually a float\n"

In [57]:
'''
We have no missing values.
We can assume that the data set contains neither noise nor outliers, given the way the data has been collected (cf. paper [https://doi.org/10.1016/j.engappai.2022.104822])
There are no inconsistencies, there is no data to integrate.

As far as data reduction is concerned, and dimensionality reduction in particular, we'll dive into it later in the project.

All features are numerical.
'''

"\nWe have no missing values.\nWe can assume to have no noise nor outliers in the data set, due to the way data has been collected (cf. paper [https://doi.org/10.1016/j.engappai.2022.104822])\nThere are no inconsistencies, there is no data to integrate.\n\nAs data reduction is concerned, in particular in terms of dimensionality reduction, we'll dive into it later in the project.\n\nAll features are numerical.\n"