This notebook covers the data processing, feature engineering, and data preparation of the anomaly dataset, based on insights gained from the EDA.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the train and test splits straight from the project's GitHub repository.
# Both CSV files are decoded as windows-1252.
train = pd.read_csv(
    'https://raw.githubusercontent.com/chimaobi-okite/DSML-Projects/main/FraudDetection/ML-MATT-CompetitionQT1920_train.csv',
    encoding='windows-1252',
)
test = pd.read_csv(
    'https://raw.githubusercontent.com/chimaobi-okite/DSML-Projects/main/FraudDetection/ML-MATT-CompetitionQT1920_test.csv',
    encoding='windows-1252',
)

In [3]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,Time,CellName,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,maxUE_UL+DL,Unusual
0,10:45,3BLTE,11.642,1.393,0.37,0.041,15.655,0.644,1.114,1.025,4.0,3.0,7,1
1,9:45,1BLTE,21.791,1.891,0.537,0.268,10.273,1.154,1.353,1.085,6.0,4.0,10,1
2,7:45,9BLTE,0.498,0.398,0.015,0.01,0.262,0.164,0.995,0.995,1.0,1.0,2,1
3,2:45,4ALTE,1.891,1.095,0.94,0.024,60.715,0.825,1.035,0.995,2.0,2.0,4,1
4,3:30,10BLTE,0.303,0.404,0.016,0.013,0.348,0.168,1.011,1.011,2.0,1.0,3,0


#### Data processing and feature engineering steps

1. Fill NaN values in maxUE_UL, maxUE_DL and maxUE_UL+DL with zero
2. Replace the error marker '#¡VALOR!' in maxUE_UL+DL with zero and convert the column to integers
3. Extract the base station letter from CellName (stored as BaseName)
4. Extract hour and minute from the Time column
5. Extract the cell number from CellName (stored as cell_code)
6. Drop the Time and CellName columns
7. Finally, one-hot encode (get dummies for) the BaseName column

In [4]:
def process(data):
  """Clean the raw cell-level frame and derive the modelling features.

  The work is done on a copy, so the frame passed in is left untouched and
  the function can safely be called several times on the same input.

  Steps:
    1. Missing values in maxUE_UL, maxUE_DL and maxUE_UL+DL become 0.
    2. The '#¡VALOR!' marker in maxUE_UL+DL becomes 0 and the column is
       converted to int64.
    3. CellName is split into BaseName (the 4th character from the end)
       and cell_code (everything before that character, as an integer).
    4. Time ('H:MM') is split into integer hour and minute columns.
    5. Time and CellName are dropped and BaseName is one-hot encoded.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns Time, CellName, maxUE_UL, maxUE_DL and
      maxUE_UL+DL. Every other column is passed through unchanged.

  Returns
  -------
  pandas.DataFrame
      A new frame without Time/CellName, with hour, minute and cell_code
      added, followed by one BaseName_<letter> indicator column per base
      station letter present in `data`.

  Raises
  ------
  KeyError
      If one of the required columns is missing.
  ValueError
      If a Time, CellName or maxUE_UL+DL value cannot be parsed as a number.
  """
  # Never write into the caller's frame: re-running a notebook cell must not
  # depend on what an earlier call left behind.
  out = data.copy()

  ue_float_cols = ['maxUE_UL', 'maxUE_DL']
  out[ue_float_cols] = out[ue_float_cols].fillna(0)

  # Treat the error marker as missing, parse the rest as numbers, then fill
  # every missing entry with 0 in a single pass.
  ue_total = out['maxUE_UL+DL']
  ue_total = ue_total.mask(ue_total == '#¡VALOR!')
  out['maxUE_UL+DL'] = pd.to_numeric(ue_total).fillna(0).astype('int64')

  # CellName is sliced from the end: the 4th-from-last character is the base
  # station letter and everything in front of it is the numeric cell code.
  cell_name = out['CellName']
  out['BaseName'] = cell_name.str[-4]

  # Split only on the first ':' so the minute part keeps everything after it.
  time_parts = out['Time'].str.split(':', n=1)
  out['hour'] = time_parts.str[0].astype('int64')
  out['minute'] = time_parts.str[1].astype('int64')

  out['cell_code'] = cell_name.str[:-4].astype('int64')

  out = out.drop(columns=['Time', 'CellName'])

  # Encode BaseName explicitly so an unrelated text column is never expanded
  # into indicator columns by accident.
  return pd.get_dummies(out, columns=['BaseName'])

In [5]:
# Run the processing steps on the training data and preview the result.
processed_train_data = process(train)
processed_train_data.head()

Unnamed: 0,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,...,Unusual,hour,minute,cell_code,BaseName_A,BaseName_B,BaseName_C,BaseName_U,BaseName_V,BaseName_W
0,11.642,1.393,0.37,0.041,15.655,0.644,1.114,1.025,4.0,3.0,...,1,10,45,3,0,1,0,0,0,0
1,21.791,1.891,0.537,0.268,10.273,1.154,1.353,1.085,6.0,4.0,...,1,9,45,1,0,1,0,0,0,0
2,0.498,0.398,0.015,0.01,0.262,0.164,0.995,0.995,1.0,1.0,...,1,7,45,9,0,1,0,0,0,0
3,1.891,1.095,0.94,0.024,60.715,0.825,1.035,0.995,2.0,2.0,...,1,2,45,4,1,0,0,0,0,0
4,0.303,0.404,0.016,0.013,0.348,0.168,1.011,1.011,2.0,1.0,...,0,3,30,10,0,1,0,0,0,0


In [6]:
# Sanity-check the processed test set: column names, dtypes and non-null counts.
process(test).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9158 entries, 0 to 9157
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PRBUsageUL   9158 non-null   float64
 1   PRBUsageDL   9158 non-null   float64
 2   meanThr_DL   9158 non-null   float64
 3   meanThr_UL   9158 non-null   float64
 4   maxThr_DL    9158 non-null   float64
 5   maxThr_UL    9158 non-null   float64
 6   meanUE_DL    9158 non-null   float64
 7   meanUE_UL    9158 non-null   float64
 8   maxUE_DL     9158 non-null   float64
 9   maxUE_UL     9158 non-null   float64
 10  maxUE_UL+DL  9158 non-null   int64  
 11  hour         9158 non-null   int64  
 12  minute       9158 non-null   int64  
 13  cell_code    9158 non-null   int64  
 14  BaseName_A   9158 non-null   uint8  
 15  BaseName_B   9158 non-null   uint8  
 16  BaseName_C   9158 non-null   uint8  
 17  BaseName_U   9158 non-null   uint8  
 18  BaseName_V   9158 non-null   uint8  
 19  BaseNa

In [7]:
# Keep the processed test set so it can be written to disk.
processed_test = process(test)

Save the processed train and test data as CSV files to be used for modelling.

In [8]:
# Write both processed frames to CSV, leaving the index out of the files.
processed_train_data.to_csv('pr_train.csv', index=False)
processed_test.to_csv('pr_test.csv', index=False)