# Touchscreen Dataset Analysis

### Package Import and Path Configuration
This section imports all required packages and defines the file paths needed for the analysis.

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.experimental import enable_iterative_imputer  # IterativeImputer
from sklearn.impute import IterativeImputer
from scipy import stats

# Set the path to the directory containing the raw data files
path = "../main/data/UK_BIOBANK_DATA" 
path_graphs = "../main/doutputs/graphs" 
path_results = "../main/doutputs/results" 

The phenotype dataset is loaded. This dataset is provided with a tab separator format. For that reason, the following code was used.

In [3]:
#Charge dataframes
#Touchscreen
touchscreen_df = pd.read_csv(os.path.join(path, "touchscreen_data.tsv"), sep="\t")

In [4]:
# Display the first few rows of the touchscreen data
touchscreen_df.head()

Unnamed: 0,eid,f_670_0_0,f_670_1_0,f_670_2_0,f_670_3_0,f_680_0_0,f_680_1_0,f_680_2_0,f_680_3_0,f_699_0_0,...,f_22037_2_0,f_22037_3_0,f_22038_0_0,f_22038_1_0,f_22038_2_0,f_22038_3_0,f_22039_0_0,f_22039_1_0,f_22039_2_0,f_22039_3_0
0,1000012,1.0,,,,1.0,,,,4.0,...,,,,,,,,,,
1,1000021,1.0,,,,2.0,,,,24.0,...,,,4320.0,,,,320.0,,,
2,1000035,1.0,,,,3.0,,,,10.0,...,,,0.0,,,,0.0,,,
3,1000044,1.0,1.0,,,2.0,2.0,,,27.0,...,,,0.0,240.0,,,0.0,960.0,,
4,1000053,1.0,,,,1.0,,,,32.0,...,,,240.0,,,,0.0,,,


The chars dataset is loaded, which contains information relevant to the original dataset. This is done to gain more information, better understand the dataset, and make future associations.

In [5]:
# Olink Characters
# Open the original TSV file and skip the first 5 header lines
with open(os.path.join(path, 'touchscreen_chars.tsv'), 'r') as f:
    # Read all lines and discard the first five
    lines = f.readlines()[5:]

# Write the remaining lines to a new TSV file
with open(os.path.join(path, 'touchscreen_chars_table.tsv'), 'w') as f:
    # Output the extracted lines without modification
    f.writelines(lines)

In [6]:
# Load the filtered TSV into a DataFrame, using tab as the separator
touchscreen_chars_df = pd.read_csv(
    os.path.join(path, 'touchscreen_chars_table.tsv'),
    sep="\t"
)

# Create a new column for protein names and reorder the DataFrame
# Get the list of existing column names
cols = touchscreen_chars_df.columns.tolist()

# Insert a 'pname' column just before 'ValueType', extracting
# the protein name from the 'Field' column (text before the first semicolon)
touchscreen_chars_df.insert(
    cols.index('ValueType'),
    'pname',
    touchscreen_chars_df['Field'].str.split(";").str[0]
)

# Display the first few rows of the updated DataFrame
touchscreen_chars_df.head()

Unnamed: 0,Field,FieldID,pname,ValueType,Coding,Category,Participants,Items,Stability,Units,ItemType,Strata,Sexed,Instances,Array,Notes,Link,Path
0,Type of accommodation lived in,670,Type of accommodation lived in,Categorical single,100286.0,100066,501527,597706,Complete,,Data,Primary,Unisex,4,1,"ACE touchscreen question """"""""What type of acco...",http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=670,Assessment centre > Touchscreen > Sociodemogra...
1,Own or rent accommodation lived in,680,Own or rent accommodation lived in,Categorical single,100287.0,100066,500149,596187,Complete,,Data,Primary,Unisex,4,1,"ACE touchscreen question """"""""Do you own or ren...",http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=680,Assessment centre > Touchscreen > Sociodemogra...
2,Length of time at current address,699,Length of time at current address,Integer,100290.0,100066,501523,597702,Complete,years,Data,Primary,Unisex,4,1,"ACE touchscreen question """"""""How many years ha...",http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=699,Assessment centre > Touchscreen > Sociodemogra...
3,Number in household,709,Number in household,Integer,100291.0,100066,500147,596185,Complete,people,Data,Primary,Unisex,4,1,"ACE touchscreen question """"""""Including yoursel...",http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=709,Assessment centre > Touchscreen > Sociodemogra...
4,Number of vehicles in household,728,Number of vehicles in household,Categorical single,100293.0,100066,500147,596185,Complete,,Data,Primary,Unisex,4,1,"ACE touchscreen question """"""""How many cars or ...",http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=728,Assessment centre > Touchscreen > Sociodemogra...


## Preprocessing

### Missing Values

In [7]:
# Extract unique suffixes from column names (excluding 'eid')
col_names = touchscreen_df.columns
suffixes = []

# Loop through each column name
for col in col_names:
    if col!= 'eid':
        # Split the column name by underscores
        parts = col.split('_')
        # If the column name has at least 4 parts, extract the 3rd and 4th parts as the suffix
        if len(parts) >= 4:
            suffixes_p = f"{parts[2]}_{parts[3]}" 
            suffixes.append(suffixes_p)

# Get a sorted list of unique suffixes found in the column names
unique_suffixes = sorted(set(suffixes))
print(unique_suffixes)

['0_0', '0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '1_0', '1_1', '1_2', '1_3', '1_4', '2_0', '2_1', '2_2', '2_3', '2_4', '2_5', '3_0', '3_1', '3_2', '3_3']


In [8]:
for suffix in unique_suffixes:
    # Filter columns by the current suffix
    suffix_cols = touchscreen_df.filter(regex=fr'{suffix}$')
    
    # Calculate total NA values and total cells (rows × columns)
    total_na = suffix_cols.isna().sum().sum()
    total_cells = suffix_cols.size  # Equivalent to: rows × columns
    
    # Avoid division by zero if no columns match the suffix
    if total_cells == 0:
        print(f"No columns found with suffix '{suffix}'")
        continue
    
    # Compute percentage of NA values
    na_percentage = (total_na / total_cells) * 100
    
    # Print results
    print(f"Suffix '{suffix}': {total_na} NA values ({na_percentage:.2f}%)")

Suffix '0_0': 90061049 NA values (47.32%)
Suffix '0_1': 15148149 NA values (81.53%)
Suffix '0_2': 15173220 NA values (94.42%)
Suffix '0_3': 7778577 NA values (96.81%)
Suffix '0_4': 4466925 NA values (98.83%)
Suffix '0_5': 2501892 NA values (99.64%)
Suffix '0_6': 500527 NA values (99.67%)
Suffix '1_0': 154342408 NA values (97.26%)
Suffix '1_1': 11408922 NA values (98.78%)
Suffix '1_2': 5988521 NA values (99.38%)
Suffix '1_3': 2500727 NA values (99.59%)
Suffix '1_4': 501490 NA values (99.86%)
Suffix '2_0': 154733865 NA values (90.89%)
Suffix '2_1': 13632204 NA values (96.95%)
Suffix '2_2': 9409354 NA values (98.62%)
Suffix '2_3': 4480205 NA values (99.13%)
Suffix '2_4': 2503861 NA values (99.72%)
Suffix '2_5': 501677 NA values (99.90%)
Suffix '3_0': 138174108 NA values (98.97%)
Suffix '3_1': 8501108 NA values (99.58%)
Suffix '3_2': 3003144 NA values (99.67%)
Suffix '3_3': 1001419 NA values (99.71%)


According to the previous results, it is evident that the tests were conducted in three (3) stages. Across these stages, only instances 0 and 1 were retained. It can be observed that instance 0 contains fewer NA values from the stage 1. Therefore, instance 0 is selected for further analysis, as it corresponds to the stage with the least amount of missing data.

In [9]:
# Select columns that match the pattern 'f_\d+_0_\d+' and include 'eid'
selected_columns = ['eid'] + [
    col for col in touchscreen_df.columns
    if re.match(r'^f_\d+_0_\d+$', col)
]

# Create a new DataFrame with the selected columns
touchscreen_dr_df = touchscreen_df[selected_columns]

# Display the first few rows of the new DataFrame
touchscreen_dr_df.head()


Unnamed: 0,eid,f_670_0_0,f_680_0_0,f_699_0_0,f_709_0_0,f_728_0_0,f_738_0_0,f_757_0_0,f_767_0_0,f_777_0_0,...,f_20162_0_0,f_21000_0_0,f_22032_0_0,f_22033_0_0,f_22034_0_0,f_22035_0_0,f_22036_0_0,f_22037_0_0,f_22038_0_0,f_22039_0_0
0,1000012,1.0,1.0,4.0,3.0,3.0,3.0,,,,...,,1001.0,,,,,,,,
1,1000021,1.0,2.0,24.0,4.0,2.0,3.0,15.0,50.0,6.0,...,0.7125,1001.0,2.0,16.0,370.0,1.0,1.0,3564.0,4320.0,320.0
2,1000035,1.0,3.0,10.0,3.0,1.0,1.0,,,,...,0.462963,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000044,1.0,2.0,27.0,4.0,3.0,2.0,3.0,35.0,5.0,...,,1001.0,0.0,2.0,15.0,0.0,0.0,99.0,0.0,0.0
4,1000053,1.0,1.0,32.0,2.0,3.0,3.0,,,,...,,1001.0,1.0,10.0,50.0,0.0,1.0,693.0,240.0,0.0


In [10]:
# Size of the DataFrame after filtering
touchscreen_dr_df.shape

(502180, 480)

We verify whether duplicate records exist according to the 'eid' column.

In [11]:
# Check for duplicates in 'eid'
duplicates = touchscreen_dr_df[touchscreen_df["eid"].duplicated(keep=False)]

# Display duplicate rows (if any)
if not duplicates.empty:
    print(f"Found {len(duplicates)} duplicate rows in 'eid':")
    display(duplicates)
else:
    print("No duplicates found in 'eid'.")

No duplicates found in 'eid'.


The type of data contained in the dataset is verified

In [12]:
# Check data types of all columns
data_types = touchscreen_dr_df.dtypes

# Count the number of columns per data type
type_counts = data_types.value_counts()

# Print results
for dtype, count in type_counts.items():
    print(f"Data type: {dtype} | Number of columns: {count}")

print("\nTotal number of columns:", len(touchscreen_dr_df.columns))

Data type: float64 | Number of columns: 479
Data type: int64 | Number of columns: 1

Total number of columns: 480


With the dimensionality reduction previously performed, the amount of null data is looked at again.

In [13]:
missing_percentage = touchscreen_dr_df.isna().mean() * 100
#missing_percentage = missing_percentage[missing_percentage > 0].sort_values(ascending=False)
# Display the percentage of missing values for each column, with a % symbol
print("\nPercentage of missing values in each column:")
for col, pct in missing_percentage.items():
    print(f"{col}: {pct:.2f}%")


Percentage of missing values in each column:
eid: 0.00%
f_670_0_0: 0.17%
f_680_0_0: 0.45%
f_699_0_0: 0.17%
f_709_0_0: 0.45%
f_728_0_0: 0.45%
f_738_0_0: 1.20%
f_757_0_0: 42.87%
f_767_0_0: 42.87%
f_777_0_0: 43.33%
f_796_0_0: 47.15%
f_806_0_0: 42.87%
f_816_0_0: 42.87%
f_826_0_0: 42.87%
f_845_0_0: 32.98%
f_864_0_0: 0.17%
f_874_0_0: 4.90%
f_884_0_0: 0.18%
f_894_0_0: 18.20%
f_904_0_0: 0.18%
f_914_0_0: 41.35%
f_924_0_0: 0.56%
f_943_0_0: 1.31%
f_971_0_0: 30.16%
f_981_0_0: 30.16%
f_991_0_0: 90.04%
f_1001_0_0: 90.04%
f_1011_0_0: 50.54%
f_1021_0_0: 50.54%
f_1031_0_0: 0.93%
f_1050_0_0: 0.93%
f_1060_0_0: 0.93%
f_1070_0_0: 0.18%
f_1080_0_0: 0.93%
f_1090_0_0: 0.22%
f_1100_0_0: 0.22%
f_1110_0_0: 0.18%
f_1120_0_0: 15.76%
f_1130_0_0: 15.76%
f_1140_0_0: 15.76%
f_1150_0_0: 15.76%
f_1160_0_0: 0.18%
f_1170_0_0: 0.93%
f_1180_0_0: 0.93%
f_1190_0_0: 0.18%
f_1200_0_0: 0.18%
f_1210_0_0: 0.18%
f_1220_0_0: 0.18%
f_1239_0_0: 0.18%
f_1249_0_0: 7.99%
f_1259_0_0: 8.66%
f_1269_0_0: 7.91%
f_1279_0_0: 7.91%
f_1289_0_0: 

Attributes with a percentage greater than 90% will be eliminated from the analysis, following the recommendation made by UK Bank (the institution that owns the data).

In [14]:
touchscreen_dr_df = touchscreen_dr_df.loc[:, missing_percentage <= 90]

# Display the shape of the DataFrame after filtering
touchscreen_dr_df.shape

(502180, 292)

Elimination of non-relevant questions integer

| Field ID  | Question |
|-----------|-----------|
| 699       | Length of time at current address   |
| 709       | Number in household |
| 1070      | Time spent watching television (TV) |
| 1080      | Time spent using computer |
| 1845      | Mother's age |
| 2946      | Father's age |
| 3659      | Year immigrated to UK (United Kingdom) |
| 3672      | Number of adopted brothers |
| 3682      | Number of adopted sisters |

Elimination of non-relevant questions categorical single

| Field ID  | Question |
|-----------|-----------|
| 670       | Type of accommodation lived in   |
| 780       | Own or rent accommodation lived in |
| 1011      | Frequency of light DIY in last 4 weeks |
| 1021      | Duration of light DIY |
| 1031      | Frequency of friend/family visits |
| 1707      | Handedness (chirality/laterality) |
| 1767      | Adopted as a child |
| 1797      | Father still alive |
| 1835      | Mother still alive |
| 2237      | Plays computer games |
| 2624      | Frequency of heavy DIY in last 4 weeks |
| 2634      | Duration of heavy DIY |
| 3912      | Adopted father still alive |
| 3942      | Adopted mother still alive |
| 4825      | Noisy workplace |
| 4836      | Loud music exposure frequency |
| 10105     | Internet user (pilot) |
| 10114     | Willing to be contacted by email (pilot) |
| 10740     | Frequency of friend/family visits (pilot) |

Elimination of non-relevant questions categorical multiple

| Field ID  | Question |
|-----------|-----------|
| 6138      | Qualifications  |
| 6139      | Gas or solid-fuel cooking/heating  |
| 6140      | Heating type(s) in home |
| 6143      | Transport type for commuting to job workplace |


In [15]:
# List of irrelevant Field IDs to drop
irrelevant_ids = [
    # Integer
    "699", "709", "1070", "1080", "1845", "2946", "3659", "3672", "3682",

    # Categorical single
    "670", "780", "1011", "1021", "1031", "1707", "1767", "1797", "1835",
    "2237", "2624", "2634", "3912", "3942", "4825", "4836", "10105", "10114", "10740",

    # Categorical multiple
    "6138", "6139", "6140", "6143"
]

# Create a regex pattern to match columns with the specified IDs
pattern = re.compile(rf"^f_({'|'.join(irrelevant_ids)})_\d+_\d+$")

# Find columns that match the pattern
columns_to_drop = [col for col in touchscreen_dr_df.columns if pattern.match(col)]

# Drop the identified columns from the DataFrame
touchscreen_dr_df = touchscreen_dr_df.drop(columns=columns_to_drop)

# Validate the drop by checking the remaining columns
touchscreen_dr_df.head()

Unnamed: 0,eid,f_680_0_0,f_728_0_0,f_738_0_0,f_757_0_0,f_767_0_0,f_777_0_0,f_796_0_0,f_806_0_0,f_816_0_0,...,f_20162_0_0,f_21000_0_0,f_22032_0_0,f_22033_0_0,f_22034_0_0,f_22035_0_0,f_22036_0_0,f_22037_0_0,f_22038_0_0,f_22039_0_0
0,1000012,1.0,3.0,3.0,,,,,,,...,,1001.0,,,,,,,,
1,1000021,2.0,2.0,3.0,15.0,50.0,6.0,2.0,4.0,2.0,...,0.7125,1001.0,2.0,16.0,370.0,1.0,1.0,3564.0,4320.0,320.0
2,1000035,3.0,1.0,1.0,,,,,,,...,0.462963,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000044,2.0,3.0,2.0,3.0,35.0,5.0,12.0,2.0,1.0,...,,1001.0,0.0,2.0,15.0,0.0,0.0,99.0,0.0,0.0
4,1000053,1.0,3.0,3.0,,,,,,,...,,1001.0,1.0,10.0,50.0,0.0,1.0,693.0,240.0,0.0


We ensure that the `FieldID` column in `touchscreen_chars_df` is of integer type. This is important for consistent data handling and merging later on, with the objective to do multiple analysis.

In [1]:
print(touchscreen_chars_df.head())
print(touchscreen_chars_df.shape)
# Ensure the metadata FieldID is integer
touchscreen_chars_df['FieldID'] = touchscreen_chars_df['FieldID'].astype(int)

NameError: name 'touchscreen_chars_df' is not defined