In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
# Replace 'path_to_your_dataset.csv' with the actual path to your dataset file
data = pd.read_csv('logistic_regression_better_dataset.csv')

# Splitting the dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Creating and training the logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Making predictions on the test set
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC

# Calculating performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Print out the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc_roc)


Accuracy: 0.73
Precision: 0.73
Recall: 1.0
F1 Score: 0.8439306358381503
AUC-ROC: 0.5294266869609334


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('logistic_regression_better_dataset.csv')

# Feature Engineering: Creating interaction terms or polynomial features
# Example: interaction between Age and Income
data['Age_Income_Interaction'] = data['Age'] * data['Income']

# Data Preprocessing: Normalization
scaler = StandardScaler()
feature_cols = ['Age', 'Income', 'EducationYears', 'EmploymentStatus', 'Age_Income_Interaction']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Splitting the dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Hyperparameter Tuning: Grid Search for Logistic Regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Making predictions on the test set
y_pred = best_model.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Improved Accuracy:", accuracy)


Improved Accuracy: 0.71


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chris\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chris\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\chris\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('logistic_regression_better_dataset.csv')

# Feature Engineering: Creating interaction terms or polynomial features
data['Age_Income_Interaction'] = data['Age'] * data['Income']

# Data Preprocessing: Normalization
scaler = StandardScaler()
feature_cols = ['Age', 'Income', 'EducationYears', 'EmploymentStatus', 'Age_Income_Interaction']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Splitting the dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Hyperparameter Tuning: Grid Search for Logistic Regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Making predictions on the test set
y_pred = best_model.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Improved Accuracy:", accuracy)


Improved Accuracy: 0.71


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
data = pd.read_csv('logistic_regression_better_dataset.csv')

# Feature Engineering: Creating interaction terms or polynomial features
data['Age_Income_Interaction'] = data['Age'] * data['Income']

# Data Preprocessing: Normalization
scaler = StandardScaler()
feature_cols = ['Age', 'Income', 'EducationYears', 'EmploymentStatus', 'Age_Income_Interaction']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Splitting the dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Hyperparameter Tuning: Grid Search for Logistic Regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Making predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC

# Calculating performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Print out the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc_roc)


Accuracy: 0.71
Precision: 0.7444444444444445
Recall: 0.9178082191780822
F1 Score: 0.8220858895705521
AUC-ROC: 0.6118721461187214


In [11]:
#Logistic Regression on a large dataset
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
data = pd.read_csv('logistic_regression_large_dataset.csv')

# Feature Engineering: Creating interaction terms or polynomial features
data['Age_Income_Interaction'] = data['Age'] * data['Income']

# Data Preprocessing: Normalization
scaler = StandardScaler()
feature_cols = ['Age', 'Income', 'EducationYears', 'EmploymentStatus', 'Age_Income_Interaction']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Splitting the dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Hyperparameter Tuning: Grid Search for Logistic Regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Making predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC

# Calculating performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Print out the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc_roc)


Accuracy: 0.665
Precision: 0.6893371757925072
Recall: 0.9012810851544838
F1 Score: 0.7811887655127367
AUC-ROC: 0.669054308112121


In [13]:
#Logistic Regression with prediction output
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the training dataset
data = pd.read_csv('logistic_regression_large_dataset.csv')

# Feature Engineering: Creating interaction terms or polynomial features for training data
data['Age_Income_Interaction'] = data['Age'] * data['Income']

# Data Preprocessing: Normalization for training data
scaler = StandardScaler()
feature_cols = ['Age', 'Income', 'EducationYears', 'EmploymentStatus', 'Age_Income_Interaction']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Splitting the training dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the training data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Hyperparameter Tuning: Grid Search for Logistic Regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Making predictions on the training test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC

# Calculating performance metrics for the training test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Print out the performance metrics for the training test set
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc_roc)

# Load the separate test dataset
test_data = pd.read_csv('logistic_regression_new_test_dataset.csv')

# Feature Engineering for the test data
test_data['Age_Income_Interaction'] = test_data['Age'] * test_data['Income']

# Data Preprocessing: Normalization for the test data
# Apply the same scaler used for the training data
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

# Making predictions on the separate test dataset
test_predictions = best_model.predict(test_data[feature_cols])

# Append the predictions to the test dataset
test_data['Predicted_Purchase'] = test_predictions

# Save the updated test dataset with predictions to a new CSV file
test_data.to_csv('logistic_regression_test_with_predictions.csv', index=False)


Accuracy: 0.665
Precision: 0.6893371757925072
Recall: 0.9012810851544838
F1 Score: 0.7811887655127367
AUC-ROC: 0.6690509489167156


In [15]:
#L1 Regularized
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load your dataset
data = pd.read_csv('logistic_regression_large_dataset.csv')  # Replace 'your_dataset.csv' with your dataset file path

# Feature Engineering: Create interaction terms or other features as needed
# Example: data['Age_Income_Interaction'] = data['Age'] * data['Income']

# Data Preprocessing: Normalization
scaler = StandardScaler()
feature_cols = ['Age', 'Income', 'EducationYears', 'EmploymentStatus']  # Add other features as needed
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Splitting the dataset into features (X) and target (y)
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create and train an L1-regularized logistic regression model
log_reg = LogisticRegression(penalty='l1', solver='liblinear')
log_reg.fit(X_train, y_train)

# Making predictions on the test set
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC

# Calculating performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Print out the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc_roc)


Accuracy: 0.665
Precision: 0.6893371757925072
Recall: 0.9012810851544838
F1 Score: 0.7811887655127367
AUC-ROC: 0.6691293301428441
