# 0. Data Preprocessing

In [1]:
import pandas as pd
import sys
sys.path.insert(1, '../src/')
from process_data import Preprocessing

In [2]:
# Read the four statistics tables (movement / traffic, fast / slow) from
# ../data/processed, one CSV per table.
mov_fast, mov_slow = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/movement_fast_stat.csv',
        '../data/processed/movement_slow_stat.csv',
    )
)
traffic_fast, traffic_slow = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/traffic_fast_stat.csv',
        '../data/processed/traffic_slow_stat.csv',
    )
)

In [3]:
# The `time_*` columns are removed from both movement tables before any further
# processing; the traffic tables are not touched in this cell.
TIME_STAT_COLUMNS = ["time_25%", "time_75%", "time_max", "time_mean", "time_min", "time_std", "time_50%"]

# errors="ignore" drops whichever of the listed columns are present. The cell is
# therefore safe to re-run and no longer raises KeyError when only part of the
# set exists (the previous guard checked "time_25%" alone before dropping all
# seven columns).
mov_fast = mov_fast.drop(columns=TIME_STAT_COLUMNS, errors="ignore")
mov_slow = mov_slow.drop(columns=TIME_STAT_COLUMNS, errors="ignore")

## 0.1. Fix Features Naming

In [4]:
# Pass every table through Preprocessing.fix_feature_naming and rebind each
# name to the value it returns (same call order as listed).
mov_fast, mov_slow, traffic_fast, traffic_slow = map(
    Preprocessing.fix_feature_naming,
    (mov_fast, mov_slow, traffic_fast, traffic_slow),
)

## 0.2. Columns Variability 

In [5]:
def find_non_varying_variables(df):
    """Report the columns of ``df`` that hold a single distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per constant column, with two columns: ``'Variable'`` (the
        column label) and ``'Variability Percentage'`` (number of distinct
        values as a percentage of the row count, i.e. ``100 / len(df)`` for
        every reported column). The result is empty when no column is
        constant, which includes the case of a frame with zero rows.

    Notes
    -----
    ``Series.nunique()`` ignores missing values by default, so a column that
    contains only NaN has zero distinct values and is not reported.
    """
    non_varying_columns = []
    variability_percentage = []

    for column in df.columns:
        series = df[column]
        unique_count = series.nunique()
        if unique_count != 1:
            continue

        # The ratio is computed only for flagged columns. A flagged column has
        # at least one row, so the division cannot hit 0 / 0 (which previously
        # raised ZeroDivisionError for zero-row frames).
        non_varying_columns.append(column)
        variability_percentage.append(unique_count / len(series) * 100)

    return pd.DataFrame({'Variable': non_varying_columns, 'Variability Percentage': variability_percentage})

In [6]:
# Display the single-valued columns of the fast-movement table, if any.
find_non_varying_variables(mov_fast)

Unnamed: 0,Variable,Variability Percentage


In [7]:
# Display the single-valued columns of the slow-movement table, if any.
find_non_varying_variables(mov_slow)

Unnamed: 0,Variable,Variability Percentage


In [8]:
# Display the single-valued columns of the fast-traffic table, if any.
find_non_varying_variables(traffic_fast)

Unnamed: 0,Variable,Variability Percentage
0,size_min,0.027778
1,size_25%,0.027778
2,size_50%,0.027778
3,size_75%,0.027778


In [9]:
# Display the single-valued columns of the slow-traffic table, if any.
find_non_varying_variables(traffic_slow)

Unnamed: 0,Variable,Variability Percentage
0,size_min,0.027785
1,size_25%,0.027785
2,size_75%,0.027785


For the movement data, the data acquired from sensors 0, 1 and 2, touchbutton, and remotebuttons are constant, so they have to be dropped from our dataset in order to focus only on meaningful variables that can help us make better predictions. For the traffic data, the only constant features are the ones related to packet size; it is important to first understand why those features are constant across all participants/games. (Note: in the outputs recorded above, the movement tables report no constant columns, so the statement about the movement data should be re-checked against the current input files.)

In [10]:
# Apply Preprocessing.drop_non_varying_variables to each table independently and
# rebind every name to the returned value.
# NOTE(review): because each table is processed on its own, the fast and slow
# tables may end up with different columns — verify against
# Preprocessing.match_columns, which is called later in this notebook.
mov_fast, mov_slow, traffic_fast, traffic_slow = [
    Preprocessing.drop_non_varying_variables(table)
    for table in (mov_fast, mov_slow, traffic_fast, traffic_slow)
]

## 0.3. Missing Values

In [11]:
# Display what Preprocessing.missing_columns reports for the fast-movement table.
Preprocessing.missing_columns(mov_fast)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [12]:
# Display what Preprocessing.missing_columns reports for the slow-movement table.
Preprocessing.missing_columns(mov_slow)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [13]:
# Display what Preprocessing.missing_columns reports for the fast-traffic table.
Preprocessing.missing_columns(traffic_fast)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [14]:
# Display what Preprocessing.missing_columns reports for the slow-traffic table.
Preprocessing.missing_columns(traffic_slow)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


## 0.4. Feature Scaling

In [15]:
# Rebind each fast table to the result of Preprocessing.match_columns, called
# with the corresponding slow table as first argument and the fast table as
# second. The slow tables themselves are left as they are.
mov_fast, traffic_fast = (
    Preprocessing.match_columns(slow, fast)
    for slow, fast in ((mov_slow, mov_fast), (traffic_slow, traffic_fast))
)

In [16]:
# Preprocessing.scaling returns a (frame, scaler) pair. For each data type the
# slow table is scaled first and the scaler it returns is passed to the call for
# the fast table, whose own second return value is discarded.
# NOTE(review): presumably the scaler is fitted on the slow table only and merely
# applied to the fast table — confirm in process_data.Preprocessing.scaling.
mov_slow, scaler = Preprocessing.scaling(mov_slow)
mov_fast, _ = Preprocessing.scaling(mov_fast, scaler)
# `scaler` is rebound on the next line: from there on it refers to the value
# returned for the traffic data, not the movement data.
traffic_slow, scaler = Preprocessing.scaling(traffic_slow)
traffic_fast, _ = Preprocessing.scaling(traffic_fast, scaler)

## 0.5. Label Encoding

In [17]:
# Run Preprocessing.encoding over all four tables and rebind each name to the
# value returned for it.
mov_fast, mov_slow, traffic_fast, traffic_slow = map(
    Preprocessing.encoding,
    [mov_fast, mov_slow, traffic_fast, traffic_slow],
)

## 0.6. Matching Columns
The fast-movement and slow-movement datasets must have exactly the same features (and therefore the same number of columns), because we are going to train on the slow-movement dataset and predict on the fast-movement dataset.

In [18]:
# Print the (rows, columns) shape of every table, one line per table.
for caption, frame in (
    ('mov_slow Features shape: ', mov_slow),
    ('mov_fast Features shape: ', mov_fast),
    ('traffic_slow Features shape: ', traffic_slow),
    ('traffic_fast Features shape: ', traffic_fast),
):
    print(caption, frame.shape)

mov_slow Features shape:  (3597, 464)
mov_fast Features shape:  (3600, 464)
traffic_slow Features shape:  (3599, 30)
traffic_fast Features shape:  (3600, 30)


In [19]:
# Call Preprocessing.match_columns again for each (slow, fast) pair and rebind
# the fast table to the result. In the recorded run the shapes printed before
# and after this cell are identical.
mov_fast, traffic_fast = [
    Preprocessing.match_columns(slow_table, fast_table)
    for slow_table, fast_table in [(mov_slow, mov_fast), (traffic_slow, traffic_fast)]
]

In [20]:
# Print the (rows, columns) shape of every table again so it can be compared
# with the shapes printed before the column-matching cell.
shape_report = (
    ('mov_slow Features shape: ', mov_slow),
    ('mov_fast Features shape: ', mov_fast),
    ('traffic_slow Features shape: ', traffic_slow),
    ('traffic_fast Features shape: ', traffic_fast),
)
for heading, table in shape_report:
    print(heading, table.shape)

mov_slow Features shape:  (3597, 464)
mov_fast Features shape:  (3600, 464)
traffic_slow Features shape:  (3599, 30)
traffic_fast Features shape:  (3600, 30)


## 0.7. Preparing Labels

In [21]:
# Hand each table to Preprocessing.prepare_label and keep the returned value
# under the same name.
mov_fast, mov_slow, traffic_fast, traffic_slow = (
    Preprocessing.prepare_label(table)
    for table in (mov_fast, mov_slow, traffic_fast, traffic_slow)
)

In [22]:
# Write each cleaned table to its own CSV under ../data/processed.
# to_csv is called with default arguments, so the frame index is written as the
# first column of every file.
for cleaned, destination in (
    (mov_fast, '../data/processed/movement_fast_stat_cleaned.csv'),
    (mov_slow, '../data/processed/movement_slow_stat_cleaned.csv'),
    (traffic_fast, '../data/processed/traffic_fast_stat_cleaned.csv'),
    (traffic_slow, '../data/processed/traffic_slow_stat_cleaned.csv'),
):
    cleaned.to_csv(destination)