In [36]:
#1.Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [37]:
#2.Importing the dataset
df = pd.read_csv('/content/DataPreprocessingGraded_dataset.csv')
print (df)

       V1    V2       V3    V4        V5 Target
0     2.0  50.0  12500.0  98.0  NEGATIVE    YES
1     0.0  13.0   3250.0  28.0  NEGATIVE    YES
2       ?     ?   4000.0  35.0  NEGATIVE    YES
3       ?  20.0   5000.0  45.0  NEGATIVE    YES
4     1.0  24.0   6000.0  77.0  NEGATIVE     NO
..    ...   ...      ...   ...       ...    ...
743  23.0   2.0    500.0  38.0  NEGATIVE     NO
744  21.0   2.0    500.0  52.0  NEGATIVE     NO
745  23.0   3.0    750.0  62.0  NEGATIVE     NO
746  39.0   1.0    250.0  39.0  NEGATIVE     NO
747  72.0   1.0    250.0  72.0  NEGATIVE     NO

[748 rows x 6 columns]


In [38]:
#3.Drop Duplicate data
df.duplicated().sum()

215

In [39]:
df = df.drop_duplicates()

In [40]:
df.duplicated().sum()

0

In [41]:
#4.Taking care of missing data
df.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
Target    0
dtype: int64

In [42]:
#5.Encoding categorical data
one_hot_encoded_data = pd.get_dummies(df)
print(one_hot_encoded_data)

          V3    V4  V1_0.0  V1_1.0  V1_10.0  V1_11.0  V1_12.0  V1_13.0  \
0    12500.0  98.0       0       0        0        0        0        0   
1     3250.0  28.0       1       0        0        0        0        0   
2     4000.0  35.0       0       0        0        0        0        0   
3     5000.0  45.0       0       0        0        0        0        0   
4     6000.0  77.0       0       1        0        0        0        0   
..       ...   ...     ...     ...      ...      ...      ...      ...   
743    500.0  38.0       0       0        0        0        0        0   
744    500.0  52.0       0       0        0        0        0        0   
745    750.0  62.0       0       0        0        0        0        0   
746    250.0  39.0       0       0        0        0        0        0   
747    250.0  72.0       0       0        0        0        0        0   

     V1_14.0  V1_15.0  ...  V2_5.0  V2_50.0  V2_6.0  V2_7.0  V2_8.0  V2_9.0  \
0          0        0  ...      

In [43]:
#6.Normalizing the data
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
scaled_data = scaler.fit_transform(one_hot_encoded_data)
scaled_df = pd.DataFrame(scaled_data,
                         columns=one_hot_encoded_data.columns)
print(scaled_df.head())

         V3        V4    V1_0.0    V1_1.0  V1_10.0  V1_11.0  V1_12.0  V1_13.0  \
0  0.999969  0.007840  0.000000  0.000000      0.0      0.0      0.0      0.0   
1  0.999963  0.008615  0.000308  0.000000      0.0      0.0      0.0      0.0   
2  0.999962  0.008750  0.000000  0.000000      0.0      0.0      0.0      0.0   
3  0.999959  0.009000  0.000000  0.000000      0.0      0.0      0.0      0.0   
4  0.999918  0.012832  0.000000  0.000167      0.0      0.0      0.0      0.0   

   V1_14.0  V1_15.0  ...  V2_5.0  V2_50.0  V2_6.0  V2_7.0  V2_8.0  V2_9.0  \
0      0.0      0.0  ...     0.0  0.00008     0.0     0.0     0.0     0.0   
1      0.0      0.0  ...     0.0  0.00000     0.0     0.0     0.0     0.0   
2      0.0      0.0  ...     0.0  0.00000     0.0     0.0     0.0     0.0   
3      0.0      0.0  ...     0.0  0.00000     0.0     0.0     0.0     0.0   
4      0.0      0.0  ...     0.0  0.00000     0.0     0.0     0.0     0.0   

      V2_?  V5_NEGATIVE  Target_NO  Target_YES  
0

In [44]:
#7.Handling Imbalance data
from sklearn.utils import resample

minority_class = one_hot_encoded_data[one_hot_encoded_data['V3'] == 'minority_class']
majority_class = one_hot_encoded_data[one_hot_encoded_data['V3'] == 'majority_class']

# Downsample the majority class
majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=42)

# Combine the downsampled majority class with the minority class
balanced_data = pd.concat([minority_class, majority_downsampled])

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533 entries, 0 to 747
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      533 non-null    object 
 1   V2      533 non-null    object 
 2   V3      533 non-null    float64
 3   V4      533 non-null    float64
 4   V5      533 non-null    object 
 5   Target  533 non-null    object 
dtypes: float64(2), object(4)
memory usage: 29.1+ KB


In [46]:
#8.Splitting the data into test and train
from sklearn.model_selection import train_test_split

# split into 70:30 ration
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# describes info about train and test set
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (523, 5)
Number transactions y_train dataset:  (523,)
Number transactions X_test dataset:  (225, 5)
Number transactions y_test dataset:  (225,)
