# DAY 3

### Basic Functions and Modules

In [20]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load Iris Dataset
iris = load_iris()
X = iris.data
y = iris.target

# Convert to Dataframe for easier handing
df = pd.DataFrame(data=X, columns= iris.feature_names)
df['target'] = y

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

print('Training feature shape: ', X_train.shape)
print('Training labels shape: ', X_train.shape)
print('Training feature shape: ', X_train.shape)
print('Training labels shape: ', X_train.shape)

# Standarize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Initialize the model
model = LogisticRegression(random_state=42)

#Train the model
model.fit(X_train_scaled, y_train)

#Make predictions
y_pred = model.predict(X_test_scaled)

#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names =iris.target_names)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n",report)

Training feature shape:  (120, 4)
Training labels shape:  (120, 4)
Training feature shape:  (120, 4)
Training labels shape:  (120, 4)
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



### Data Preprocessing

#### 1. Handling Missing Data

In [30]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Sample dataset with mising values
data = {
    'A' : [1,2,None,4],
    'B' : [None,2,3,4],
    'C' : [1,None,3,4]
}

df = pd.DataFrame(data)

# Dropping Missing Values
# Drop rows with missing values
df_dropped = df.dropna()
print('After droping rows with missing values:\n ',df_dropped)

# Drop columns with missing values
df_dropped_cols = df.dropna(axis=1)
print('\n After dropping colums with missing values: \n',df_dropped_cols)

# Replacing Missing Values with Mean
# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print('\n After imputing missing values: \n',df_imputed)


After droping rows with missing values:
       A    B    C
3  4.0  4.0  4.0

 After dropping colums with missing values: 
 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]

 After imputing missing values: 
           A    B         C
0  1.000000  3.0  1.000000
1  2.000000  2.0  2.666667
2  2.333333  3.0  3.000000
3  4.000000  4.0  4.000000


#### 2. Feature Scaling

In [33]:
# Standardization
# Sample dataset
data = {
    'A' : [1,2,3,4],
    'B' : [2,3,4,5]
}
df = pd.DataFrame(data)

# Standarize features
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print('\nAfter Standarization: \n',df_standardized)

# Normalization
from sklearn.preprocessing import MinMaxScaler
# Normalize features to the range [0,1]
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df),columns = df.columns)
print('\nAfter Normalization: \n',df_standardized)


After Standarization: 
           A         B
0 -1.341641 -1.341641
1 -0.447214 -0.447214
2  0.447214  0.447214
3  1.341641  1.341641

After Normalization: 
           A         B
0 -1.341641 -1.341641
1 -0.447214 -0.447214
2  0.447214  0.447214
3  1.341641  1.341641


#### Encoding Categorical Data

In [34]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
# Sample dataset with categorical data
data = {'Category': ['A','B','A','C']}
df = pd.DataFrame(data)

# Encode categorical data as numbers
label_encoder = LabelEncoder()
df['Category_Encoder'] = label_encoder.fit_transform(df['Category'])
print('\nAfter Label Encoding: \n',df)



After Label Encoding: 
   Category  Category_Encoder
0        A                 0
1        B                 1
2        A                 0
3        C                 2


#### One Hot Encoding

In [39]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
data = {'Category': ['A', 'B', 'A', 'C']}
df = pd.DataFrame(data)
# One-hot encode categorical data
one_hot_encoder = OneHotEncoder (sparse_output=False)
encoded = one_hot_encoder.fit_transform(df [[ 'Category']])
# Create a DataFrame with one-hot encoded columns
encoded_df = pd.DataFrame (encoded, columns=one_hot_encoder.get_feature_names_out(['Category']))
df_one_hot_encoded = pd.concat([df, encoded_df], axis=1)
print("\nAfter one-hot encoding: \n", df_one_hot_encoded)


After one-hot encoding: 
   Category  Category_A  Category_B  Category_C
0        A         1.0         0.0         0.0
1        B         0.0         1.0         0.0
2        A         1.0         0.0         0.0
3        C         0.0         0.0         1.0


### Splitting and Training Data

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split # Sample dataset
data = {
    'Feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Feature2': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
     'Target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Sample dataset
data = {
    'Feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Feature2': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'Target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)
#Features and target variable
X = df[['Feature1', 'Feature2']] # Features
y=df['Target'] # Target variable
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training features: \n", X_train)
print("Training labels: \n", y_train)
print("Testing labels: \n", y_test)
print("Testing features: \n", X_test)

Training features: 
    Feature1  Feature2
5         6        16
0         1        11
7         8        18
2         3        13
9        10        20
4         5        15
3         4        14
6         7        17
Training labels: 
 5    1
0    0
7    1
2    0
9    1
4    0
3    1
6    0
Name: Target, dtype: int64
Testing labels: 
 8    0
1    1
Name: Target, dtype: int64
Testing features: 
    Feature1  Feature2
8         9        19
1         2        12


In [53]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the Iris dataset
iris
load_iris()
X = iris.data
y = iris.target
#Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training features shape:", X_train.shape) 
print("Testing features shape:", X_test.shape) 
print("Training labels shape:", y_train.shape) 
print("Testing labels shape:", y_test.shape)

Training features shape: (120, 4)
Testing features shape: (30, 4)
Training labels shape: (120,)
Testing labels shape: (30,)
