# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

# Loading the datasets

### Change the path of the files accordingly

In [None]:
# Locations of the competition CSVs (edit these when running elsewhere)
train_data_file = '/kaggle/input/who-is-the-real-winner/train.csv'
test_data_file = '/kaggle/input/who-is-the-real-winner/test.csv'

# Load both splits; train_data_file is read again later for raw value counts
train_df = pd.read_csv(train_data_file)
test_df = pd.read_csv(test_data_file)

### This is what our train data looks like!

In [None]:
train_df.head()

### This is what our test data looks like!

In [None]:
test_df.head()

In [None]:
# Distinct raw state / party names, captured before preprocessing replaces
# these columns with one-hot columns of the same names.
states = train_df['state'].drop_duplicates().to_numpy()
parties = train_df['Party'].drop_duplicates().to_numpy()

# Data Preprocessing

### Define functions for data preprocessing

In [None]:
def convert_to_numeric(value):
    """Convert a shorthand amount such as ``'2 Crore+'`` into an integer.

    Recognised unit words are ``Crore`` (x 10,000,000), ``Lac`` (x 100,000),
    ``Thou`` (x 1,000) and ``Hund`` (x 100); the literal ``'0'`` maps to 0.
    Leading/trailing whitespace is ignored.

    Args:
        value: Amount text of the form ``'<number> <Unit>+'`` or ``'0'``.

    Returns:
        The amount as an ``int``, or ``None`` when the text matches none of
        the recognised forms.
    """
    # Checked in this order, mirroring the original if/elif chain.
    unit_multipliers = (
        ('Crore', 10_000_000),
        ('Lac', 100_000),
        ('Thou', 1_000),
        ('Hund', 100),
    )

    text = value.strip()
    for unit, multiplier in unit_multipliers:
        if unit in text:
            number = float(text.replace(f' {unit}+', ''))
            # round() instead of int(): products such as 2.3 * 100 evaluate
            # to 229.99999999999997, which int() would truncate to 229.
            return round(number * multiplier)

    if text == '0':
        return 0

    # Unrecognised format: signal "no value" explicitly.
    return None

def preprocess_data(df):
    """Turn the raw candidate frame into a mostly numeric one.

    The money columns are parsed with ``convert_to_numeric``, 'Criminal Case'
    becomes a 0/1 flag, and 'Party' / 'state' are one-hot encoded. The first
    three conversions are written back into ``df`` itself; the returned frame
    is a new object without the 'ID', 'Party' and 'state' columns.
    """
    # Amount strings -> integers
    for money_column in ('Total Assets', 'Liabilities'):
        df[money_column] = df[money_column].apply(convert_to_numeric)

    # 1 when the candidate has at least one criminal case, otherwise 0
    df['Criminal Case'] = (df['Criminal Case'] > 0).astype(int)

    # One indicator column per party and per state
    party_dummies = pd.get_dummies(df['Party']).astype(int)
    state_dummies = pd.get_dummies(df['state']).astype(int)
    combined = pd.concat([df, party_dummies, state_dummies], axis=1)

    # The identifier and the now-encoded source columns are no longer needed
    return combined.drop(columns=['ID', 'Party', 'state'])

def encode_education(df):
    """Replace the 'Education' labels in ``df`` with ordinal codes 0-9.

    The codes follow the order of ``ordered_levels`` below, so a higher
    degree of education gets a higher number. Labels outside that list
    become NaN. The column is overwritten in place and ``df`` is returned.
    """
    ordered_levels = (
        'Others', 'Literate', '5th Pass', '8th Pass', '10th Pass',
        '12th Pass', 'Graduate', 'Graduate Professional',
        'Post Graduate', 'Doctorate',
    )
    level_to_code = {level: code for code, level in enumerate(ordered_levels)}
    df['Education'] = df['Education'].map(level_to_code)
    return df

def decode_education(arr):
    """Map ordinal education codes back to their text labels.

    Args:
        arr: Array-like of codes in the range 0-9 (as produced by
            ``encode_education``).

    Returns:
        A NumPy array of the same shape holding the label for each code.
        An empty input yields an empty string array instead of raising
        (``np.vectorize`` cannot infer an output type from zero elements).
    """
    label_mapping = {0: 'Others', 1: 'Literate', 2: '5th Pass', 3: '8th Pass', 4: '10th Pass',
                     5: '12th Pass', 6: 'Graduate', 7: 'Graduate Professional',
                     8: 'Post Graduate', 9: 'Doctorate'}
    codes = np.asarray(arr)
    if codes.size == 0:
        return np.empty(codes.shape, dtype=str)
    return np.vectorize(label_mapping.get)(codes)

### Preprocess the train and test data.

In [None]:
# Run the shared preprocessing on the training split
train_df = preprocess_data(train_df)
# Only the training frame has its 'Education' labels ordinal-encoded here
train_df = encode_education(train_df)
# The test split goes through the same preprocessing function
test_df = preprocess_data(test_df)

### This is what our train data looks like after preprocessing!

In [None]:
train_df.head()

### This is what our test data looks like after preprocessing!

In [None]:
test_df.head()

# Feature Engineering

### Define functions for making new features

In [None]:
# make new features
def make_features(df):
    """Add ratio, net-worth and title-based features derived from existing columns.

    New columns written into ``df``:
      * 'ALRatio'    - total assets divided by (liabilities + 1)
      * 'Net Assets' - total assets minus liabilities
      * 'Advocate'   - 1 when the candidate name contains the title "Adv"
      * 'Dr'         - 1 when the candidate name contains the title "Dr"

    Returns:
        The frame without the 'Candidate' column.
    """
    df['ALRatio'] = df['Total Assets'] / (df['Liabilities'] + 1)  # +1 avoids division by zero
    df['Net Assets'] = df['Total Assets'] - df['Liabilities']

    # Match the titles as standalone words. The earlier patterns 'Adv.' / 'Dr.'
    # were interpreted as regexes where '.' matches any character, so names
    # such as "Ravindra" or "Advani" were flagged as titles.
    # na=False keeps missing names from breaking the integer cast.
    candidate = df['Candidate']
    df['Advocate'] = candidate.str.contains(r'\bAdv\b', case=False, na=False).astype(int)
    df['Dr'] = candidate.str.contains(r'\bDr\b', case=False, na=False).astype(int)

    return df.drop(columns=['Candidate'])

def scale_features(df):
    """Add min-max scaled copies of the three monetary columns to ``df``.

    The scaler is re-fitted on each column of the frame that is passed in,
    so the scaling range comes from that frame alone. ``df`` is modified in
    place and returned.
    """
    scaler = MinMaxScaler()
    for column in ('Total Assets', 'Liabilities', 'Net Assets'):
        df[f"Scaled {column}"] = scaler.fit_transform(df[[column]])
    return df

def log_features(df):
    """Add natural-log versions of the asset and liability columns to ``df``.

    A value of exactly 0 is mapped to 0 rather than passed to ``np.log``.
    'Log AL Ratio' is the difference of the two log columns. ``df`` is
    modified in place and returned.
    """
    def _log_or_zero(amount):
        if amount == 0:
            return 0
        return np.log(amount)

    for source in ('Total Assets', 'Liabilities'):
        df[f'Log {source}'] = df[source].apply(_log_or_zero)

    df['Log AL Ratio'] = df['Log Total Assets'] - df['Log Liabilities']
    return df

def group_states(df):
    """Add one column per geographic region by summing its states' one-hot columns.

    A state whose one-hot column is absent from ``df`` (for example because
    that state never occurs in this split) contributes nothing instead of
    raising ``KeyError``; a region with no present state column is all zeros.
    ``df`` is modified in place and returned.
    """
    region_mapping = {
        'South': ['TAMIL NADU', 'KARNATAKA', 'KERALA', 'ANDHRA PRADESH', 'PUDUCHERRY'],
        'North': ['UTTAR PRADESH', 'PUNJAB', 'HARYANA', 'DELHI', 'HIMACHAL PRADESH', 'RAJASTHAN', 'UTTARAKHAND'],
        'West': ['MAHARASHTRA', 'GUJARAT', 'GOA'],
        'East': ['WEST BENGAL', 'ODISHA', 'JHARKHAND', 'BIHAR'],
        'North East': ['MEGHALAYA', 'MANIPUR', 'NAGALAND', 'SIKKIM', 'TRIPURA', 'ASSAM', 'ARUNACHAL PRADESH'],
        'Central': ['MADHYA PRADESH', 'CHHATTISGARH']
    }

    # `members` (not `states`) so the notebook-level `states` array is not shadowed.
    for region, members in region_mapping.items():
        present = [state for state in members if state in df.columns]
        df[region] = df[present].sum(axis=1) if present else 0

    return df

def group_constituencies(df):
    """One-hot encode the reservation category of each constituency.

    A constituency is 'SC' or 'ST' when its name carries that tag, either
    parenthesised ("... (SC)") or as a trailing standalone token ("... SC");
    everything else is 'General'. Matching the tag rather than any "SC"/"ST"
    substring keeps names such as "AGRA WEST" or "ST. CRUZ" in 'General'.

    The three indicator columns 'General', 'SC' and 'ST' are always emitted,
    even when a category does not occur in ``df``, so separately processed
    frames end up with the same columns.

    Returns:
        A new frame without the 'Constituency ∇' column; ``df`` itself is
        not modified.
    """
    names = df['Constituency ∇']
    is_sc = names.str.contains(r'\(SC\)|\bSC\s*$', na=False).to_numpy(dtype=bool)
    is_st = names.str.contains(r'\(ST\)|\bST\s*$', na=False).to_numpy(dtype=bool)

    category = pd.Series('General', index=df.index)
    category[is_st] = 'ST'
    category[is_sc] = 'SC'  # assigned last so SC wins if both tags appear, as before

    # One hot encode Category with a fixed column set
    coded = (
        pd.get_dummies(category)
        .reindex(columns=['General', 'SC', 'ST'], fill_value=0)
        .astype(int)
    )

    return pd.concat([df.drop(columns=['Constituency ∇']), coded], axis=1)

def group_regional_parties(df):
    """Add one column per party group by summing its parties' one-hot columns.

    A party whose one-hot column is absent from ``df`` contributes nothing
    instead of raising ``KeyError``; a group with no present party column is
    all zeros. ``df`` is modified in place and returned.
    """
    region_mapping = {
        'North Parties': ['AAP', 'JD(U)', 'SHS', 'TDP'],
        'South Parties': ['AIADMK', 'DMK', 'JD(S)', 'YSRCP', 'Sikkim Krantikari Morcha', 'Tipra Motha Party', 'SP'],
        'East Parties': ['AITC', 'BJD', 'CPI', 'CPI(M)', 'RJD'],
        'West Parties': ['NCP', 'NDPP'],
        'North East Parties': ['IND', 'JMM', 'NPP'],
        'Nationwide Parties': ['BJP', 'INC'],
    }

    # `members` (not `parties`) so the notebook-level `parties` array is not shadowed.
    for region, members in region_mapping.items():
        present = [party for party in members if party in df.columns]
        df[region] = df[present].sum(axis=1) if present else 0

    return df

### Make new features from existing features

In [None]:
# Feature-engineering steps, applied in this order to each split
feature_steps = (
    make_features,
    scale_features,
    log_features,
    group_states,
    group_constituencies,
    group_regional_parties,
)

for step in feature_steps:
    train_df = step(train_df)

for step in feature_steps:
    test_df = step(test_df)

#### This is how our train data looks after adding new features

In [None]:
train_df.head()

#### This is how our test data looks after adding new features

In [None]:
test_df.head()

### Let us see the correlation of various features with Education

#### We can divide features into two categories - Numerical and Categorical

In [None]:
# Every column except the target is a candidate feature
all_features = train_df.columns.tolist()
all_features.remove('Education')

# Continuous-valued columns: the raw amounts plus their derived, scaled and log variants
numerical_features = ['Total Assets', 'Liabilities', 'Net Assets', 'ALRatio',
                      'Scaled Total Assets', 'Scaled Liabilities', 'Scaled Net Assets', 
                      'Log Total Assets', 'Log Liabilities', 'Log AL Ratio']

# Everything not listed above is treated as categorical
categorical_features = [feature for feature in all_features if feature not in numerical_features]

print("Numerical Features:", numerical_features)
print()
print("Categorical Features:", categorical_features)

#### Correlation of Education with Numerical Features

In [None]:
# Column-wise correlation of each numerical feature with the encoded 'Education' target
num_corr = train_df[numerical_features].corrwith(train_df['Education'])
print(num_corr)

#### Correlation of Education with categorical Features

In [None]:
# Column-wise correlation of each categorical feature with the encoded 'Education' target
categ_corr = train_df[categorical_features].corrwith(train_df['Education'])
# Print the result in two halves (presumably so the long listing is not truncated in the output)
n = categ_corr.shape[0]
m = n//2
print(categ_corr.iloc[0:m])
print(categ_corr.iloc[m:n])

#### We can observe that scaling does not affect the correlation, but it may help in decreasing training time. So let us keep the scaled features and mark the original 'Total Assets' and 'Liabilities' columns (along with 'Criminal Case') as unwanted

In [None]:
# Start the list of columns to drop; the option cells below append to it
unwanted_features = np.array(['Criminal Case', 'Total Assets', 'Liabilities'])

### Now we have multiple options - 
#### Option 1 : Remove individual states and only keep grouped states
#### Option 2 : Remove individual parties and only keep grouped parties
#### Option 3 : Remove states and parties with a very high or very low number of entries in the dataset.
#### Option 4 : Use sklearn.feature_selection to select k best features

#### We can also use a combination of these options

##### Option 1

In [None]:
# `states` holds the raw state names, which are also the per-state one-hot column names
unwanted_features = np.concatenate((unwanted_features, states))

##### Option 2

In [None]:
# `parties` holds the raw party names, which are also the per-party one-hot column names
unwanted_features = np.concatenate((unwanted_features, parties))

##### Option 3

In [None]:
# Re-read the raw training CSV: train_df no longer has the 'Party' / 'state'
# columns (preprocess_data dropped them after one-hot encoding)
train_copy = pd.read_csv(train_data_file)
print(train_copy['Party'].value_counts())
print(train_copy['state'].value_counts())

In [None]:
# Hand-picked party columns to drop (presumably chosen from the value counts printed above)
drop_parties = np.array(['Tipra Motha Party', 'CPI', 'Sikkim Krantikari Morcha', 
               'JD(S)', 'TDP', 'NDPP', 'BJP', 'INC', 'JMM'])


unwanted_features = np.concatenate((unwanted_features, drop_parties))

In [None]:
# Hand-picked state columns to drop (presumably chosen from the value counts printed above)
drop_states = np.array(['UTTAR PRADESH','WEST BENGAL','MAHARASHTRA','MADHYA PRADESH',
                        'KARNATAKA','TAMIL NADU','BIHAR','GUJARAT','RAJASTHAN','SIKKIM',
                        'GOA','MEGHALAYA','PUDUCHERRY'])

unwanted_features = np.concatenate((unwanted_features, drop_states))

##### Option 4

In [None]:
# Apply the drops collected so far to both splits, then restart the list so
# it only holds what the feature selection below rejects
train_df = train_df.drop(unwanted_features, axis = 1)
test_df = test_df.drop(unwanted_features, axis = 1)
unwanted_features = np.array([])
# Separate the features and target variable
X = train_df.drop('Education', axis=1)
y = train_df['Education']

# Select k best features using SelectKBest and f_classif
k = 40  # Number of features you want to select
# k is capped at the number of available columns
selector = SelectKBest(score_func=f_classif, k=k if X.shape[1] >= k else X.shape[1])
# NOTE: X_new is not used again in the visible cells; only the support mask is
X_new = selector.fit_transform(X, y)

# Get the list of selected features
selected_features_mask = selector.get_support()
selected_features = np.array(X.columns[selected_features_mask])

# Get the list of features that were not selected
not_selected_features = np.array(X.columns[~selected_features_mask])

# Rejected columns become the new drop list
unwanted_features = np.concatenate((unwanted_features, not_selected_features))

print("Selected Features:", selected_features.tolist())
print()
print("Not Selected Features:", not_selected_features.tolist())

##### Let us look at all the unwanted features

In [None]:
print(unwanted_features)

#### Drop the unwanted features

In [None]:
train_df = train_df.drop(columns=unwanted_features)
# The test split was one-hot encoded on its own, so it may lack some of these
# columns or carry extra ones; ignore missing labels here ...
test_df = test_df.drop(columns=unwanted_features, errors='ignore')
# ... and then give it exactly the training feature columns, in the same
# order. An indicator column the test split never produced is filled with 0.
# Estimators fitted on a DataFrame expect the same columns at predict time.
feature_columns = train_df.columns.drop('Education')
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

#### This is what our train data looks like after removing the unwanted features

In [None]:
train_df.head()

#### This is what our test data looks like after removing the unwanted features

In [None]:
test_df.head()

# Oversampling using SMOTE

In [None]:
X = train_df.drop(['Education'], axis=1)
y = train_df['Education']

# Define the sampling strategy for SMOTE:
# target number of rows per encoded education class after resampling
sampling_strategy = {0:100, 1:60, 2:40, 3:1000, 4:1000, 5:1000, 6:1000, 7:1000, 8:1000, 9:1000}

# Initialize SMOTE with the specified sampling strategy.
# The seed is fixed so the synthetic samples (and every score computed from
# them) are reproducible, matching the random_state=42 used by the models
# and the train/validation split.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

# Apply SMOTE to generate synthetic samples
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new DataFrame with the resampled data
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['Education'] = y_resampled

# Print the value counts of the target variable after resampling
print(resampled_df['Education'].value_counts())

In [None]:
# Copy resampled data to train data
# NOTE: this rebinds the name only (no copy is made); afterwards train_df and
# resampled_df refer to the same DataFrame object
train_df = resampled_df

In [None]:
# Concatenate resampled data to train data
# NOTE(review): if the preceding cell was run, train_df is resampled_df, so
# this stacks the resampled frame on itself and every row appears twice.
# TODO confirm only one of these two cells is meant to be executed
train_df = pd.concat([train_df, resampled_df], ignore_index=True)

#### This is what our training data looks like after oversampling

In [None]:
print(train_df.head())
print(train_df.shape)

# Hyperparameter Tuning : Finding best params

In [None]:
# Get a copy of the training data
# Get a copy of the training data
df = train_df.copy()

# Calculating prior for BernoulliNB
# Relative frequency of each encoded class in df['Education']
class_frequencies = df['Education'].value_counts(normalize=True)
# Same frequencies as a plain array, ordered by class code
class_probabilities = class_frequencies.sort_index().values

# Candidate estimators, each paired with the hyperparameter grid to search
classifiers_list = [
    {'classifier': BernoulliNB(),
     'param_grid': {
         'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 
                   1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4],
         'force_alpha': [True, False],
         'fit_prior': [True, False],
         # Either no explicit prior or the class frequencies computed above
         'class_prior': [None, class_probabilities],
     }},
    {'classifier': RandomForestClassifier(random_state=42),
     'param_grid': {
         'n_estimators': [55, 60, 65],
         'max_depth': [14, 15, 16],
         'criterion': ['gini', 'entropy'],
         'min_samples_split': [9, 10, 11],
         'min_samples_leaf': [0.001, 0.01, 0.1],
         # Single-value entry: repeats the seed already set in the constructor
         'random_state': [42]
     }},
    {'classifier': DecisionTreeClassifier(random_state=42),
     'param_grid': {
         'max_depth': [14, 15, 16],
         'criterion': ['gini', 'entropy'],
         'splitter': ['best', 'random'],
         'min_samples_split': [9, 10, 11],
         'min_samples_leaf': [0.001, 0.01, 0.1],
         # Single-value entry: repeats the seed already set in the constructor
         'random_state': [42]
     }},
    {'classifier': KNeighborsClassifier(),
     'param_grid': {
         'n_neighbors': [3, 5, 7],
         'weights': ['uniform', 'distance'],
         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
     }},
    {'classifier': SVC(),
     'param_grid': {
         'C': [10,100,1000],
         'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
         'degree': [1,2,3],
         'gamma': ['scale', 'auto'],
         # Seed supplied through the grid (the SVC above is built without one)
         'random_state': [42]
     }},
    
]

In [None]:
# Split the working frame into feature matrix and target
X_train = df.drop(columns=['Education'])
y_train = df['Education']

results, best_models = [], []

# Exhaustive 5-fold grid search per candidate, scored by weighted F1
for candidate in classifiers_list:
    estimator = candidate['classifier']
    search = GridSearchCV(estimator, candidate['param_grid'], cv=5, scoring='f1_weighted')
    search.fit(X_train, y_train)
    results.append({
        'classifier': type(estimator).__name__,
        'best_params': search.best_params_,
        'best_score': search.best_score_,
    })
    best_models.append(search.best_estimator_)

# Numbered summary of the best configuration found for each classifier
for idx, result in enumerate(results, start=1):
    print(
        idx,
        f"Classifier: {result['classifier']}",
        f"Best Params: {result['best_params']}",
        f"F1 Score: {result['best_score']}",
        sep='\n',
        end='\n\n',
    )

# Training and validating the models with the best params, and creating submission files

In [None]:
def train_validate_submit(df, model, filename, test_features=None):
    """Fit ``model``, report its hold-out weighted F1 and write a submission CSV.

    The rows of ``df`` are split 80/20 (``random_state=42``); the model is
    fitted on the 80% part and scored on the remaining 20%. The fitted model
    then predicts the test features, the predictions are decoded back to
    education labels, and the result is saved as ``'<filename>.csv'`` with
    the columns 'ID' (0..n-1) and 'Education'.

    Args:
        df: Training frame containing the features and the 'Education' target.
        model: Unfitted (or re-fittable) scikit-learn style estimator.
        filename: Output file name without the ``.csv`` extension.
        test_features: Feature frame to predict for the submission. Defaults
            to the notebook-level ``test_df``, which is what this function
            previously always used.

    Returns:
        None. The metrics are printed and the CSV is written to disk.
    """
    if test_features is None:
        # Backward-compatible default: the global test frame.
        test_features = test_df

    features = df.drop(columns=['Education'])
    target = df['Education']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)
    weighted_f1 = f1_score(y_test, model.predict(X_test), average='weighted')

    print(f"Classifier: {model.__class__.__name__}")
    print(f"Best Params: {model.get_params()}")
    print(f"F1 Score: {weighted_f1}")
    print()

    # The submission uses the model as fitted on the 80% training part.
    labels = decode_education(model.predict(test_features))
    df_submit = pd.DataFrame({'ID': range(len(labels)), 'Education': labels})
    df_submit.to_csv(f'{filename}.csv', index=False)

In [None]:
mode = '' # enter the trial number for distinguishing multiple submissions of same model

In [None]:
# Work on a copy of the training frame
df = train_df.copy()
# One validation report and one submission file per tuned model;
# the file is named after the estimator class plus the `mode` suffix
for model in best_models:
    filename = model.__class__.__name__ + mode
    train_validate_submit(df,model, filename)