**Introduction to Data Processing**

**Types of Data**

In [None]:
import pandas as pd

# Example data: one column per kind of variable (continuous, discrete, categorical).
# Each tuple below is a single observation; the column labels are attached afterwards.
data = pd.DataFrame(
    [
        (180.5, 1, 'Red'),
        (165.2, 2, 'Green'),
        (172.1, 3, 'Blue'),
    ],
    columns=['Continuous', 'Discrete', 'Categorical'],
)

print(data)


   Continuous  Discrete Categorical
0       180.5         1         Red
1       165.2         2       Green
2       172.1         3        Blue


**Feature Scaling**

**Absolute Maximum Scaling**

In [None]:
import numpy as np

# One feature, three samples (column vector).
X = np.array([[5], [-10], [15]])

# Divide every value by the largest magnitude in the array so the
# result lies in [-1, 1] and signs are preserved.
X_scaled = np.true_divide(X, np.abs(X).max())
print(X_scaled)


[[ 0.33333333]
 [-0.66666667]
 [ 1.        ]]


**Min-Max Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# One feature whose observed values span 10..30.
X = np.array([[10], [20], [30]])

# Learn the column minimum and maximum first, then rescale with them;
# the smallest value maps to 0 and the largest to 1.
scaler = MinMaxScaler().fit(X)
X_minmax = scaler.transform(X)
print(X_minmax)


[[0. ]
 [0.5]
 [1. ]]


**Normalization (L2)**

In [None]:
from sklearn.preprocessing import Normalizer

# Two samples that point in the same direction but differ in length.
X = np.array([[1, 2], [2, 4]])

# L2 normalization works row by row: each sample is divided by its own
# Euclidean length, so both rows end up as the same unit vector.
normalizer = Normalizer(norm='l2')
X_norm = normalizer.fit(X).transform(X)
print(X_norm)


[[0.4472136  0.89442719]
 [0.4472136  0.89442719]]


**Standardization**

In [None]:
from sklearn.preprocessing import StandardScaler

# One feature, three samples.
X = np.array([[10], [20], [30]])

# Standardization: subtract the column mean and divide by the column
# standard deviation, giving a column with mean 0 and unit variance.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)
print(X_std)


[[-1.22474487]
 [ 0.        ]
 [ 1.22474487]]


**Importing Dataset**

In [None]:
import pandas as pd

# Remote CSV holding the wine data.
url = "https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv"

# header=None: the first line of the file is treated as data, so pandas
# labels the columns with integers starting at 0.
df = pd.read_csv(filepath_or_buffer=url, header=None)

# Show first 5 rows
first_rows = df.head(n=5)
print(first_rows)


   0      1     2     3     4    5     6     7     8     9     10    11    12  \
0   1  14.23  1.71  2.43  15.6  127  2.80  3.06  0.28  2.29  5.64  1.04  3.92   
1   1  13.20  1.78  2.14  11.2  100  2.65  2.76  0.26  1.28  4.38  1.05  3.40   
2   1  13.16  2.36  2.67  18.6  101  2.80  3.24  0.30  2.81  5.68  1.03  3.17   
3   1  14.37  1.95  2.50  16.8  113  3.85  3.49  0.24  2.18  7.80  0.86  3.45   
4   1  13.24  2.59  2.87  21.0  118  2.80  2.69  0.39  1.82  4.32  1.04  2.93   

     13  
0  1065  
1  1050  
2  1185  
3  1480  
4   735  


**Handling Missing Data**

In [None]:
import pandas as pd
import numpy as np

# Two numeric columns, each containing NaN gaps.
data = pd.DataFrame(
    data={
        'Temperature': [22, np.nan, 25, np.nan, 28],
        'Humidity': [30, 45, np.nan, 50, 55],
    }
)

# Mean imputation: compute each column's mean over its non-missing values,
# then fill every gap in a column with that column's mean.
# `data` itself is left untouched; the result is a new frame.
column_means = data.mean(numeric_only=True)
data_filled = data.fillna(value=column_means)
print(data_filled)


   Temperature  Humidity
0         22.0      30.0
1         25.0      45.0
2         25.0      45.0
3         25.0      50.0
4         28.0      55.0


**Categorical Data**

In [None]:
import pandas as pd

# A single categorical column with a repeated level ('Red').
color_values = ['Red', 'Green', 'Blue', 'Red']
data = pd.Series(color_values, name='Color').to_frame()

# One-Hot Encoding: one indicator column per distinct colour,
# named '<column>_<level>'.
encoded_data = pd.get_dummies(data)
print(encoded_data)


   Color_Blue  Color_Green  Color_Red
0       False        False       True
1       False         True      False
2        True        False      False
3       False        False       True


**Introduction (Why Preprocessing?)**

In [None]:
# Real-world scenario: messy survey data
import pandas as pd

# Survey answers with gaps: None marks a value that was not provided.
survey_columns = {
    'Age': [25, None, 30, 28],
    'Gender': ['Male', 'Female', None, 'Male'],
    'Salary': [50000, 60000, None, 58000],
}
raw_data = pd.DataFrame(survey_columns)

# Label line first, then the frame on the following lines.
print("Raw Data:", raw_data, sep="\n")


Raw Data:
    Age  Gender   Salary
0  25.0    Male  50000.0
1   NaN  Female  60000.0
2  30.0    None      NaN
3  28.0    Male  58000.0


**More on Data Types**

In [None]:
# Using dtype and category: 'Grade' is stored as a pandas categorical,
# while the other two columns keep their inferred numeric dtypes.
grades = pd.Series(['A', 'A+', 'B'], dtype="category")
data = pd.DataFrame(
    {
        'Student ID': [101, 102, 103],
        'Score': [85.5, 90.0, 76.0],
        'Grade': grades,
    }
)

# Per-column dtypes first, then the frame itself.
print("Data Types:", data.dtypes, "\nData Preview:", data, sep="\n")


Data Types:
Student ID       int64
Score          float64
Grade         category
dtype: object

Data Preview:
   Student ID  Score Grade
0         101   85.5     A
1         102   90.0    A+
2         103   76.0     B


**Feature Scaling – More Examples**

**Absolute Maximum Scaling (multiple features)**

In [None]:
import numpy as np

# Three samples, two features on different scales.
X = np.array([[2, 20], [-4, 15], [10, -30]])

# axis=0 takes the largest magnitude per column, so each feature is
# scaled by its own maximum (broadcast across the rows).
X_scaled = X / np.abs(X).max(axis=0)

# Rounded only for display; X_scaled keeps full precision.
print(X_scaled.round(2))


[[ 0.2   0.67]
 [-0.4   0.5 ]
 [ 1.   -1.  ]]


**Importing Dataset (CSV from Google Drive)**

In [None]:
# Mount Google Drive (Colab only) so its files appear under /content/drive
import os

from google.colab import drive
drive.mount('/content/drive')

# Candidate locations for the CSV, checked in order.
# The session-storage path stays first, so behaviour is unchanged whenever
# that file exists. The second entry makes the Drive mount above useful:
# previously the mounted Drive was never read from.
candidate_paths = [
    '/content/forestfires.csv',                # Upload a CSV first
    '/content/drive/MyDrive/forestfires.csv',  # assumes the CSV sits in the Drive root - adjust to your folder
]

# Use the first path that exists. If none does, keep the original path so
# pd.read_csv raises the same FileNotFoundError as before.
file_path = next((p for p in candidate_paths if os.path.exists(p)), candidate_paths[0])
df = pd.read_csv(file_path)
print(df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   X  Y month  day  FFMC   DMC     DC  ISI  temp  RH  wind  rain  area
0  7  5   mar  fri  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0   0.0
1  7  4   oct  tue  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0   0.0
2  7  4   oct  sat  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0   0.0
3  8  6   mar  fri  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2   0.0
4  8  6   mar  sun  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0   0.0


**Handling Missing Data (Advanced)**

In [None]:
import pandas as pd
import numpy as np

# Small frame with one gap in each column.
df = pd.DataFrame(
    data={
        'Age': [25, np.nan, 30, 28],
        'Income': [40000, 50000, np.nan, 60000],
    }
)

# Strategy: Fill age with median, income with mean.
# Both statistics are computed before any filling happens, then applied
# column by column through a single fillna call.
fill_values = {
    'Age': df['Age'].median(),
    'Income': df['Income'].mean(),
}
df = df.fillna(value=fill_values)

print(df)


    Age   Income
0  25.0  40000.0
1  28.0  50000.0
2  30.0  50000.0
3  28.0  60000.0


**Categorical Data (Label Encoding)**

In [None]:
from sklearn.preprocessing import LabelEncoder

# One categorical column; 'Apple' appears twice and must get the same code.
fruit_names = ['Apple', 'Banana', 'Mango', 'Apple']
df = pd.DataFrame({'Fruit': fruit_names})

# Learn the distinct labels first, then map each value to its integer code.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# labels (y); OrdinalEncoder is the feature-side counterpart - consider it
# when encoding input columns.
le = LabelEncoder().fit(df['Fruit'])
df['Fruit_Label'] = le.transform(df['Fruit'])

print(df)


    Fruit  Fruit_Label
0   Apple            0
1  Banana            1
2   Mango            2
3   Apple            0


**Combined Scaling + Encoding Example**

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# One numeric and one categorical column.
df = pd.DataFrame(
    data={
        'Age': [20, 30, 40],
        'City': ['Delhi', 'Mumbai', 'Bangalore'],
    }
)

# Column Transformer Pipeline: standardize 'Age', one-hot encode 'City'.
# The outputs are concatenated in the order the transformers are listed,
# i.e. the scaled age first, followed by the city indicator columns.
ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age']),
        ('cat', OneHotEncoder(), ['City']),
    ]
)

output = ct.fit_transform(df)

# The result may come back as a sparse matrix; densify it for printing
# when it exposes .toarray(), otherwise print it as is.
if hasattr(output, 'toarray'):
    print(output.toarray())
else:
    print(output)


[[-1.22474487  0.          1.          0.        ]
 [ 0.          0.          0.          1.        ]
 [ 1.22474487  1.          0.          0.        ]]
