In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Sample data
data = {
    'Education Level': ['High School', 'College', 'Masters', 'PhD'],
    'Color': ['Red', 'Green', 'Blue', 'Red'],
    'Temperature': ['Cold', 'Moderate', 'Hot', 'Moderate'],
    'Zip Code': ['12345', '67890', '54321', '98765'],
    'Country': ['USA', 'Canada', 'USA', None],
    'Target': [0, 1, 1, 0]
}

df = pd.DataFrame(data)

# 1. Label Encoding
label_encoder = LabelEncoder()
df['Education Level Encoded'] = label_encoder.fit_transform(df['Education Level'])

# 2. One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(df[['Color']])
color_encoded_df = pd.DataFrame(onehot_encoded, columns=['Color_Red', 'Color_Green', 'Color_Blue'])#columns is not mandatory if not it gives index number

# 3. Custom Mapping
temperature_mapping = {'Cold': -1, 'Moderate': 0, 'Hot': 1}
df['Temperature Encoded'] = df['Temperature'].map(temperature_mapping)

# Handling Missing Values
df['Country Encoded'] = df['Country'].fillna('Unknown')

#Label Encoding
label_encoder = LabelEncoder()
df['Country Encoded'] = label_encoder.fit_transform(df['Country'])


# 5. Categorical vs. Numerical
df['Zip Code'] = df['Zip Code'].astype(int)

# Data Preprocessing
features = ['Education Level Encoded', 'Temperature Encoded', 'Zip Code', 'Country Encoded']
X = df[features]
y = df['Target']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model Training
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.00


In [3]:
df['Education Level Encoded']


0    1
1    0
2    2
3    3
Name: Education Level Encoded, dtype: int64

In [4]:
color_encoded_df

Unnamed: 0,Color_Red,Color_Green,Color_Blue
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0


In [11]:
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(df[['Color']])
color_encoded_df = pd.DataFrame(onehot_encoded)

In [12]:
color_encoded_df

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0


In [17]:
# 3. Custom Mapping
temperature_mapping = {'Cold': -1, 'Moderate': 0, 'Hot': 1} #we assign the value for string for each value in columns values
df['Temperature Encoded'] = df['Temperature'].map(temperature_mapping)

In [18]:
df['Temperature Encoded']

0   -1
1    0
2    1
3    0
Name: Temperature Encoded, dtype: int64

In [23]:
df['Temperature'].value_counts()

Moderate    2
Cold        1
Hot         1
Name: Temperature, dtype: int64

In [24]:
df['Country'].value_counts()

USA       2
Canada    1
Name: Country, dtype: int64

In [25]:
df['Country Encoded'] .value_counts()

1    2
0    1
2    1
Name: Country Encoded, dtype: int64

In [26]:
# 4. Handling Missing Values
df['Country Encoded'] = df['Country'].fillna('Unknown')


In [28]:
df['Country Encoded'].value_counts()

USA        2
Canada     1
Unknown    1
Name: Country Encoded, dtype: int64

In [29]:
onehot_encoded

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [49]:
# 1) label encoding for all data

import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = {
    'Education Level': ['High School', 'College', 'Masters', 'PhD'],
    'Color': ['Red', 'Green', 'Blue', 'Red'],
    'Temperature': ['Cold', 'Moderate', 'Hot', 'Moderate'],
    'Zip Code': ['12345', '67890', '54321', '98765'],
    'Country': ['USA', 'Canada', 'USA', None],
    'Target': [0, 1, 1, 0]
}
# fill the missing value for country

data_frame=pd.DataFrame(data)
data_frame['Country']=data_frame['Country'].fillna('missing')
#data_frame['Country'] = data_frame['Country'].fillna(data_frame['Country'].mode().iloc[0]) # it gives the first value is usa to all missing values



def Encoder(df):
          columnsToEncode = list(df.select_dtypes(include=['category','object']))
          label_encoder = LabelEncoder()
          for feature in columnsToEncode:
              try:
                  df[feature] = label_encoder.fit_transform(df[feature])
              except:
                  print('Error encoding '+feature)
          return df


In [51]:
data_frame['Country']

0        USA
1     Canada
2        USA
3    missing
Name: Country, dtype: object

In [52]:
data_frame

Unnamed: 0,Education Level,Color,Temperature,Zip Code,Country,Target
0,High School,Red,Cold,12345,USA,0
1,College,Green,Moderate,67890,Canada,1
2,Masters,Blue,Hot,54321,USA,1
3,PhD,Red,Moderate,98765,missing,0


In [53]:
label_encode_ransformed=Encoder(data_frame)

In [54]:
label_encode_ransformed

Unnamed: 0,Education Level,Color,Temperature,Zip Code,Country,Target
0,1,2,0,0,1,0
1,0,1,2,2,0,1
2,2,0,1,1,1,1
3,3,2,2,3,2,0


In [55]:
x=label_encode_ransformed.drop("Target",axis=1)
y=label_encode_ransformed["Target"]

In [56]:
x

Unnamed: 0,Education Level,Color,Temperature,Zip Code,Country
0,1,2,0,0,1
1,0,1,2,2,0
2,2,0,1,1,1
3,3,2,2,3,2


In [57]:
y

0    0
1    1
2    1
3    0
Name: Target, dtype: int64

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

# Model Training
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
accuracy = model.score(X_test, y_test)
accuracy



0.5

In [68]:
model.score(X_train, y_train)

1.0

In [None]:
# it because the data is to low to train so it happen