In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

# Function to select the top 'n' features using the chi-squared statistic.
def selectkbest(indep_X, dep_Y, n):
    """Reduce ``indep_X`` to its ``n`` best columns as ranked by chi-squared.

    Args:
        indep_X: Feature matrix handed to ``SelectKBest``.
        dep_Y: Target values the features are scored against.
        n: Number of columns to keep (passed as ``k``).

    Returns:
        The transformed feature matrix containing only the selected columns.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    # Fit on the full inputs, then project those same inputs onto the winners.
    return selector.fit(indep_X, dep_Y).transform(indep_X)

# Function to split the data into training and testing sets and scale them.
def split_scalar(indep_X, dep_Y):
    """Split the data 75/25 and standardise the feature columns.

    The scaler is fitted on the training features only and then reused for the
    test features. The split uses ``random_state=0``.

    Args:
        indep_X: Feature matrix.
        dep_Y: Target values aligned with ``indep_X``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the two feature
        parts are scaled and the two target parts are returned as split.
    """
    parts = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    features_tr, features_te, target_tr, target_te = parts

    scaler = StandardScaler()
    scaled_tr = scaler.fit_transform(features_tr)  # learn mean/std here ...
    scaled_te = scaler.transform(features_te)      # ... and only apply them here
    return scaled_tr, scaled_te, target_tr, target_te

# Function to calculate the R-squared score for a regression model.
def r2_prediction(regressor, X_test, y_test):
    """Score a fitted model on held-out data with R-squared.

    Args:
        regressor: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        The result of ``r2_score(y_test, predictions)``.
    """
    from sklearn.metrics import r2_score  # imported locally, as before

    return r2_score(y_test, regressor.predict(X_test))

# Function to train a Linear Regression model and return its R-squared score.
def Linear(X_train, y_train, X_test, y_test=None):
    """Fit a ``LinearRegression`` model and return its R-squared on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Held-out feature matrix to score on.
        y_test: Held-out targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the three-argument form of this function read implicitly.

    Returns:
        Whatever ``r2_prediction`` returns for the fitted model.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test`` exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Legacy call style: resolve the global up front so a missing value
        # fails before any time is spent fitting.
        if "y_test" not in globals():
            raise NameError("y_test was not passed and is not defined at notebook level")
        y_test = globals()["y_test"]

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to train a Linear Support Vector Regression model and return its R-squared score.
def svm_linear(X_train, y_train, X_test, y_test=None):
    """Fit an ``SVR`` with a linear kernel and return its R-squared on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Held-out feature matrix to score on.
        y_test: Held-out targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the three-argument form of this function read implicitly.

    Returns:
        Whatever ``r2_prediction`` returns for the fitted model.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test`` exists.
    """
    from sklearn.svm import SVR

    if y_test is None:
        # Legacy call style: resolve the global up front so a missing value
        # fails before any time is spent fitting.
        if "y_test" not in globals():
            raise NameError("y_test was not passed and is not defined at notebook level")
        y_test = globals()["y_test"]

    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to train a Non-Linear Support Vector Regression model and return its R-squared score.
def svm_NL(X_train, y_train, X_test, y_test=None):
    """Fit an ``SVR`` with an RBF kernel and return its R-squared on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Held-out feature matrix to score on.
        y_test: Held-out targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the three-argument form of this function read implicitly.

    Returns:
        Whatever ``r2_prediction`` returns for the fitted model.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test`` exists.
    """
    from sklearn.svm import SVR

    if y_test is None:
        # Legacy call style: resolve the global up front so a missing value
        # fails before any time is spent fitting.
        if "y_test" not in globals():
            raise NameError("y_test was not passed and is not defined at notebook level")
        y_test = globals()["y_test"]

    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to train a Decision Tree Regression model and return its R-squared score.
def Decision(X_train, y_train, X_test, y_test=None):
    """Fit a ``DecisionTreeRegressor`` and return its R-squared on the test split.

    The tree is built with ``random_state=0``.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Held-out feature matrix to score on.
        y_test: Held-out targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the three-argument form of this function read implicitly.

    Returns:
        Whatever ``r2_prediction`` returns for the fitted model.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test`` exists.
    """
    from sklearn.tree import DecisionTreeRegressor

    if y_test is None:
        # Legacy call style: resolve the global up front so a missing value
        # fails before any time is spent fitting.
        if "y_test" not in globals():
            raise NameError("y_test was not passed and is not defined at notebook level")
        y_test = globals()["y_test"]

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to train a Random Forest Regression model and return its R-squared score.
def random(X_train, y_train, X_test, y_test=None):
    """Fit a ``RandomForestRegressor`` and return its R-squared on the test split.

    The forest uses ``n_estimators=10`` and ``random_state=0``.

    NOTE(review): this function's name shadows the standard-library ``random``
    module inside the notebook; it is kept because callers use it.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Held-out feature matrix to score on.
        y_test: Held-out targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the three-argument form of this function read implicitly.

    Returns:
        Whatever ``r2_prediction`` returns for the fitted model.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test`` exists.
    """
    from sklearn.ensemble import RandomForestRegressor

    if y_test is None:
        # Legacy call style: resolve the global up front so a missing value
        # fails before any time is spent fitting.
        if "y_test" not in globals():
            raise NameError("y_test was not passed and is not defined at notebook level")
        y_test = globals()["y_test"]

    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to create a DataFrame summarizing the R-squared scores of different regression models.
def selectk_regression(acclin, accsvml, accsvmnl, accdes, accrf):
    """Arrange per-model score lists into a one-row summary table.

    Each argument is an indexable sequence of scores. The entry at position
    ``i`` goes into the ``i``-th row of the result; the frame has the single
    row ``'ChiSquare'``, so only position 0 of each sequence is read.

    Args:
        acclin: Scores for the ``'Linear'`` column.
        accsvml: Scores for the ``'SVMl'`` column.
        accsvmnl: Scores for the ``'SVMnl'`` column.
        accdes: Scores for the ``'Decision'`` column.
        accrf: Scores for the ``'Random'`` column.

    Returns:
        A DataFrame indexed by ``['ChiSquare']`` with one column per model.
    """
    # Insertion order fixes both the column order and the fill order.
    scores_by_model = {
        'Linear': acclin,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'Decision': accdes,
        'Random': accrf,
    }
    summary = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_model))
    for position, row_label in enumerate(summary.index):
        for model_name, scores in scores_by_model.items():
            summary.loc[row_label, model_name] = scores[position]
    return summary


In [2]:
# Load the dataset from a CSV file.
dataset1 = pd.read_csv("prep.csv", index_col=None)

# Convert categorical variables into dummy/indicator variables.
# pd.get_dummies returns a new DataFrame, so dataset1 is left untouched and no
# separate copy step is needed (a plain `df2 = dataset1` would only add a
# second name for the same object, not a copy).
df2 = pd.get_dummies(dataset1, drop_first=True)

# Separate the independent and dependent variables.
# NOTE(review): 'classification_yes' is presumably the indicator column that
# get_dummies derives from a 'classification' column -- confirm against prep.csv.
indep_X = df2.drop('classification_yes', axis=1)  # Everything except the target column.
dep_Y = df2['classification_yes']  # The target column.


In [12]:
# Number of chi-squared-ranked features to keep; change it here to try other sizes.
N_FEATURES = 3

# Select the top N_FEATURES features using the chi-squared statistic.
kbest = selectkbest(indep_X, dep_Y, N_FEATURES)

# Initialize lists to store R-squared scores for different models.
acclin = []
accsvml = []
accsvmnl = []
accdes = []
accrf = []

In [None]:
# Split the selected features into training and testing sets and scale them.
# The model helpers are called below without a y_test argument, so they rely on
# the notebook-level y_test assigned by this unpacking.
# NOTE(review): kbest was produced by selectkbest on all rows before this split,
# so feature selection has already seen the test rows.
X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# Fit and score every model exactly once. Looping over kbest would iterate over
# its rows and refit the same five models on the same split once per row, while
# selectk_regression only reads the first score of each list.
r2_lin = Linear(X_train, y_train, X_test)      # Linear Regression.
r2_sl = svm_linear(X_train, y_train, X_test)   # Linear SVR.
r2_NL = svm_NL(X_train, y_train, X_test)       # Non-Linear (RBF) SVR.
r2_d = Decision(X_train, y_train, X_test)      # Decision Tree Regression.
r2_r = random(X_train, y_train, X_test)        # Random Forest Regression.

# Rebuild the score lists instead of appending, so re-running this cell does
# not accumulate stale entries.
acclin = [r2_lin]
accsvml = [r2_sl]
accsvmnl = [r2_NL]
accdes = [r2_d]
accrf = [r2_r]

# Create a DataFrame summarizing the R-squared scores.
result = selectk_regression(acclin, accsvml, accsvmnl, accdes, accrf)


In [5]:
result
# Saved output of a run tagged "5" -- presumably 5 selected features; TODO confirm.

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.551985,0.545395,0.749654,0.696181,0.836806


In [11]:
result
# Saved output of a run tagged "7" -- presumably 7 selected features; TODO confirm.

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.657035,0.641906,0.893007,0.826389,0.916233
