<a href="https://colab.research.google.com/github/gulshan0201/Machine-Learning-Practical-/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Display the first few rows and info for train dataset
print("Train Dataset Head:")
print(train_df.head())
print("\nTrain Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
train_df.info()

# Display the first few rows and info for test dataset
print("\nTest Dataset Head:")
print(test_df.head())
print("\nTest Dataset Info:")
test_df.info()

Train Dataset Head:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  

Train Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 

In [None]:
# Function to process the dataframe
def process_dataframe(df):
    """Encode the raw columns of ``df`` into numeric / one-hot features.

    The input frame is not modified; all work happens on a copy.

    Transformations applied:
      * ``default``, ``housing``, ``loan`` (and ``y`` when present):
        'yes' -> 1, 'no' -> 0.
      * ``education``: ordinal codes unknown=0, primary=1, secondary=2,
        tertiary=3.
      * ``month``: three-letter lowercase abbreviation -> 1..12.
      * ``pdays_contacted``: new 0/1 flag, 1 where ``pdays`` is not -1.
      * ``job``, ``marital``, ``contact``, ``poutcome``: one-hot encoded
        (all levels kept).

    Values not listed in a mapping become NaN (``Series.map`` semantics).

    Args:
        df: DataFrame holding the feature columns named above. The target
            column ``y`` is optional so that unlabeled data can be
            processed with the same function.

    Returns:
        A new DataFrame with the encoded columns.

    Raises:
        KeyError: if a required feature column is missing.
    """
    df_processed = df.copy()

    # Binary encoding. The feature columns are required; the target 'y' is
    # only encoded when the frame actually carries it (e.g. labeled data).
    yes_no_map = {'yes': 1, 'no': 0}
    binary_cols = ['default', 'housing', 'loan']
    if 'y' in df_processed.columns:
        binary_cols.append('y')
    for col in binary_cols:
        df_processed[col] = df_processed[col].map(yes_no_map)

    # Ordinal encoding for education
    education_map = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
    df_processed['education'] = df_processed['education'].map(education_map)

    # Mapping months to numbers
    month_map = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    df_processed['month'] = df_processed['month'].map(month_map)

    # Feature Engineering: pdays
    # Create a flag if the client was contacted before (pdays != -1)
    df_processed['pdays_contacted'] = (df_processed['pdays'] != -1).astype(int)

    # One-hot encoding for nominal categorical variables
    nominal_cols = ['job', 'marital', 'contact', 'poutcome']
    df_processed = pd.get_dummies(df_processed, columns=nominal_cols, drop_first=False)

    return df_processed

# Process both datasets
train_processed = process_dataframe(train_df)
test_processed = process_dataframe(test_df)

# Align columns to ensure both have the same set of dummy variables.
# A category that occurs in only one split produces a dummy column the other
# split lacks, so both frames are reindexed onto the sorted union of their
# columns; columns missing from a frame are created and filled with 0.
all_columns = set(train_processed.columns) | set(test_processed.columns)
sorted_columns = sorted(all_columns)
train_processed = train_processed.reindex(columns=sorted_columns, fill_value=0)
test_processed = test_processed.reindex(columns=sorted_columns, fill_value=0)

# Check the first few rows of the processed train data
print("Processed Train Data Head:")
print(train_processed.head())
# DataFrame.info() prints its own summary and returns None, so it is not
# wrapped in print() (which would add a stray "None" line).
train_processed.info()

# Save to CSV
train_processed.to_csv('train_features.csv', index=False)
test_processed.to_csv('test_features.csv', index=False)

Processed Train Data Head:
   age  balance  campaign  contact_cellular  contact_telephone  \
0   58     2143         1             False              False   
1   44       29         1             False              False   
2   33        2         1             False              False   
3   47     1506         1             False              False   
4   33        1         1             False              False   

   contact_unknown  day  default  duration  education  ...  marital_single  \
0             True    5        0       261          3  ...           False   
1             True    5        0       151          2  ...            True   
2             True    5        0        76          2  ...           False   
3             True    5        0        92          0  ...           False   
4             True    5        0       198          0  ...            True   

   month  pdays  pdays_contacted  poutcome_failure  poutcome_other  \
0      5     -1                0     

In [None]:
import pandas as pd

# Re-read the raw CSVs (train first, then test) so this cell works from the
# unprocessed data.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

def create_new_features(df):
    """Derive three engineered features from the raw columns of ``df``.

    The input frame is not modified.

    Features:
      * ``age_group`` (categorical): ``age`` binned into 'Young' (0, 30],
        'Middle-Aged' (30, 50] and 'Senior' (50, inf). The top bin is
        open-ended so ages above 100 are labeled 'Senior' rather than NaN.
      * ``total_contacts`` (numerical): ``campaign`` + ``previous``.
      * ``debt_level`` (0..3): number of 'yes' values among ``default``,
        ``housing`` and ``loan``. Any value other than 'yes'/'no' in those
        columns makes the result NaN for that row.

    Args:
        df: DataFrame with columns ``age``, ``campaign``, ``previous``,
            ``default``, ``housing`` and ``loan``.

    Returns:
        A new DataFrame, indexed like ``df``, with exactly the columns
        ``age_group``, ``total_contacts`` and ``debt_level``.
    """
    # Feature 1: Age Group (Categorical)
    # Right-inclusive bins; the last edge is infinite so no age overflows.
    bins = [0, 30, 50, float('inf')]
    labels = ['Young', 'Middle-Aged', 'Senior']
    age_group = pd.cut(df['age'], bins=bins, labels=labels)

    # Feature 2: Total Contacts (Numerical)
    # Combining current campaign contacts and previous contacts
    total_contacts = df['campaign'] + df['previous']

    # Feature 3: Debt Level (Ordinal/Numerical)
    # Map each 'yes'/'no' debt indicator to 1/0 and add them up; the sum is
    # built from temporaries so no helper columns need to be added and dropped.
    yes_no_map = {'yes': 1, 'no': 0}
    debt_cols = ['default', 'housing', 'loan']
    debt_level = sum(df[col].map(yes_no_map) for col in debt_cols)

    # assign() returns a copy, leaving the caller's frame untouched.
    df_new = df.assign(
        age_group=age_group,
        total_contacts=total_contacts,
        debt_level=debt_level,
    )
    return df_new[['age_group', 'total_contacts', 'debt_level']]

# Build the engineered features for each split
new_features_train = create_new_features(train_df)
new_features_test = create_new_features(test_df)

# Preview the engineered features (heading, then the first rows)
print("New Features (Train):", new_features_train.head(), sep="\n")
print("\nNew Features (Test):", new_features_test.head(), sep="\n")

# Bring back the encoded features saved by the earlier processing step so the
# new columns can be appended to them.
train_processed = pd.read_csv('train_features.csv')
test_processed = pd.read_csv('test_features.csv')

# The encoded frames already hold the one-hot columns; the engineered columns
# are appended side by side. age_group is kept as a categorical label here
# (it would need one-hot encoding before model training).
train_final = pd.concat([train_processed, new_features_train], axis="columns")
test_final = pd.concat([test_processed, new_features_test], axis="columns")

print("\nCombined Dataset Head (Train):", train_final.head(), sep="\n")

# Persist the combined feature sets
train_final.to_csv('train_features_updated.csv', index=False)
test_final.to_csv('test_features_updated.csv', index=False)

New Features (Train):
     age_group  total_contacts  debt_level
0       Senior               1           1
1  Middle-Aged               1           1
2  Middle-Aged               1           2
3  Middle-Aged               1           1
4  Middle-Aged               1           0

New Features (Test):
     age_group  total_contacts  debt_level
0        Young               1           0
1  Middle-Aged               5           2
2  Middle-Aged               2           1
3        Young               4           2
4       Senior               1           1

Combined Dataset Head (Train):
   age  balance  campaign  contact_cellular  contact_telephone  \
0   58     2143         1             False              False   
1   44       29         1             False              False   
2   33        2         1             False              False   
3   47     1506         1             False              False   
4   33        1         1             False              False   

   contact_