In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score
from sklearn.impute import KNNImputer
import time
import itertools

import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score

#Libraries for visualizing trees
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image

In [3]:
# Load the modelling table; it must contain the 'popular' column used as the target below.
OnlineNewsPopularity = pd.read_csv(filepath_or_buffer="data_dropped.csv")

In [4]:
# Hold out 20% of the rows for testing; every column except 'popular' is a feature.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    OnlineNewsPopularity.drop(columns=['popular']),
    OnlineNewsPopularity['popular'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Standardise features. The scaler's statistics are learned from the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_sca = scaler.fit_transform(X_train)
X_test_sca = scaler.transform(X_test)

### Ridge/Lasso (L2/L1-penalized logistic regression)

In [8]:
# Baseline: L1-penalised logistic regression with otherwise default settings.
model_lasso_base = LogisticRegression(penalty='l1', solver='liblinear')
model_lasso_base.fit(X_train_sca, y_train)

# Accuracy on the held-out test split.
lasso_accuracy = model_lasso_base.score(X_test_sca, y_test)
print("Base lasso model accuracy: ", lasso_accuracy)

Base lasso model accuracy:  0.6761256148316307


In [25]:
# Grid over the inverse regularisation strength C and the two solvers tried
# for the L1 penalty.
param_grid = {'C': [0.01, 0.1, 1.0, 10.0],
              'penalty': ['l1'],
              'solver': ['liblinear', 'saga']
              }

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(model, param_grid, cv=5).fit(X_train_sca, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set. Calling .fit() again only repeats that work,
# and with a stochastic solver such as 'saga' (no random_state set) it would
# replace the selected model with a slightly different one.
best_lasso_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_accuracy = best_lasso_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
Best Accuracy: 0.6753688989784336


In [27]:
# Same L1 search as before, now also varying the solver's stopping tolerance.
param_grid = dict(
    C=[0.01, 0.1, 1.0, 10.0],
    penalty=['l1'],
    solver=['liblinear', 'saga'],
    tol=[1e-5, 1e-4, 1e-3],
)

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_sca, y_train)

# Winning configuration and the estimator fitted with it.
best_params = grid_search.best_params_
best_lasso_model = grid_search.best_estimator_

# Held-out accuracy of the selected model.
best_accuracy = best_lasso_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga', 'tol': 1e-05}
Best Accuracy: 0.6753688989784336


In [32]:
# Finer sweep over C with the solver and tolerance held fixed
# (liblinear, tol=1e-5).
param_grid = dict(
    C=[0.01, 0.1, 0.5, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 2.0, 5.0, 10.0],
    penalty=['l1'],
    solver=['liblinear'],
    tol=[1e-5],
)

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_sca, y_train)

# Winning configuration and the estimator fitted with it.
best_params = grid_search.best_params_
best_lasso_model = grid_search.best_estimator_

# Held-out accuracy of the selected model.
best_accuracy = best_lasso_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-05}
Best Accuracy: 0.6759994955227645


In [29]:
# Spot check: L1 logistic regression at the default C with a tighter tolerance.
model_test = LogisticRegression(penalty='l1', solver='liblinear', tol=1e-5)
model_test.fit(X_train_sca, y_train)

# Last expression of the cell: test-set accuracy is displayed as the cell output.
model_test.score(X_test_sca, y_test)

0.6762517341404969

In [9]:
# Baseline: L2-penalised logistic regression with otherwise default settings.
model_ridge_base = LogisticRegression(penalty='l2')
model_ridge_base.fit(X_train_sca, y_train)

# Accuracy on the held-out test split.
ridge_accuracy = model_ridge_base.score(X_test_sca, y_test)
print("Base ridge model accuracy: ", ridge_accuracy)

Base ridge model accuracy:  0.6763778534493631


In [20]:
# Grid over the inverse regularisation strength C and four solvers for the
# L2 penalty.
param_grid = {'C': [0.01, 0.1, 1.0, 10],
              'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
              }

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(model, param_grid, cv=5).fit(X_train_sca, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set. A second .fit() is redundant, and with the
# stochastic 'sag'/'saga' solvers (no random_state set) it would swap the
# selected model for a slightly different one.
best_ridge_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_accuracy = best_ridge_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
Best Accuracy: 0.6754950182872997


In [21]:
# Extend the C range upwards (weaker regularisation) with the same four solvers.
param_grid = {'C': [1.0, 10, 100, 1000],
              'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
              }

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(model, param_grid, cv=5).fit(X_train_sca, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set. A second .fit() is redundant, and with the
# stochastic 'sag'/'saga' solvers (no random_state set) it would swap the
# selected model for a slightly different one.
best_ridge_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_accuracy = best_ridge_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
Best Accuracy: 0.6753688989784336


In [22]:
# Denser C grid between 1 and 100 with the same four solvers.
param_grid = {'C': [1, 4, 7, 10, 40, 70, 100],
              'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
              }

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(model, param_grid, cv=5).fit(X_train_sca, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set. A second .fit() is redundant, and with the
# stochastic 'sag'/'saga' solvers (no random_state set) it would swap the
# selected model for a slightly different one.
best_ridge_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_accuracy = best_ridge_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 4, 'penalty': 'l2', 'solver': 'saga'}
Best Accuracy: 0.6754950182872997


In [24]:
# Fix the solver to 'saga' and additionally vary the stopping tolerance.
param_grid = {'C': [1, 4, 7, 10, 40, 70, 100],
              'penalty': ['l2'],
              'solver': ['saga'],
              'tol': [1e-5, 1e-4, 1e-3, 1e-2]
              }

model = LogisticRegression()

# 5-fold cross-validated search on the scaled training data.
grid_search = GridSearchCV(model, param_grid, cv=5).fit(X_train_sca, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set. A second .fit() is redundant, and because
# 'saga' is stochastic (no random_state set) it would swap the selected model
# for a slightly different one.
best_ridge_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_accuracy = best_ridge_model.score(X_test_sca, y_test)

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga', 'tol': 0.01}
Best Accuracy: 0.672972632109976


### Bagging

In [34]:
# Baseline: bagged decision trees with default ensemble settings and a fixed seed.
model_bag_base = BaggingClassifier(DecisionTreeClassifier(), random_state=1)
model_bag_base.fit(X_train_sca, y_train)

# Accuracy on the held-out test split.
bag_accuracy = model_bag_base.score(X_test_sca, y_test)
print("Bagging base model accuracy = ", bag_accuracy)

Bagging base model accuracy =  0.7388069113381258


In [43]:
# Hyperparameter grid for the bagging ensemble (2 * 2 * 2 * 2 * 2 = 32 candidates).
params = {'n_estimators': [1000, 1500],
          'max_samples': [0.8, 1.0],
          'max_features': [0.8, 1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False],
          }

# Perform a grid search to find the best hyperparameters
# 2-fold shuffled CV with a fixed seed keeps the fold assignment reproducible.
cv = KFold(n_splits=2, shuffle=True, random_state=1)
# The base tree is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional argument works on every version.
grid_search = GridSearchCV(
    BaggingClassifier(DecisionTreeClassifier(random_state=1), random_state=1, n_jobs=-1),
    param_grid=params,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_sca, y_train)

Fitting 2 folds for each of 32 candidates, totalling 64 fits


In [41]:
# Coarse hyperparameter grid for the bagging ensemble (3 * 3 * 3 * 2 * 2 = 108 candidates).
params = {'n_estimators': [100, 500, 1000],
          'max_samples': [0.1, 0.5, 1.0],
          'max_features': [0.1, 0.5, 1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False],
          }

# Run the search for THIS grid inside the cell. Previously `params` was never
# passed to anything and the cell reported whatever `grid_search` was left in
# the kernel, so the printed result depended on execution order.
# A separate name is used so the `grid_search` fitted on the finer grid is not
# overwritten for later cells.
grid_search_coarse = GridSearchCV(
    BaggingClassifier(DecisionTreeClassifier(random_state=1), random_state=1, n_jobs=-1),
    param_grid=params,
    cv=KFold(n_splits=2, shuffle=True, random_state=1),
    n_jobs=-1,
    verbose=1,
)
grid_search_coarse.fit(X_train_sca, y_train)

# Get the best model
print("best params: ", grid_search_coarse.best_params_)
# best_estimator_ is already refit on the full training set (refit=True is the
# GridSearchCV default), so it is used directly without another .fit() call.
model_bag = grid_search_coarse.best_estimator_

# Evaluate the model on the test set
y_pred = model_bag.predict(X_test_sca)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy = ", accuracy)

best params:  {'bootstrap': True, 'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 1000}
Accuracy =  0.7693277840837432


In [44]:
# Same grid as the one `grid_search` was fitted with; kept so `params`
# reflects the search being reported in this cell.
params = {'n_estimators': [1000, 1500],
          'max_samples': [0.8, 1.0],
          'max_features': [0.8, 1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False],
          }

# Get the best model
print("best params: ", grid_search.best_params_)
# best_estimator_ is already refit on the full training set (refit=True is the
# GridSearchCV default). Fitting it again would rebuild the whole ensemble and,
# with random_state fixed, produce the same model, so the extra .fit() is dropped.
model_bag = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = model_bag.predict(X_test_sca)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy = ", accuracy)

best params:  {'bootstrap': False, 'bootstrap_features': True, 'max_features': 0.8, 'max_samples': 0.8, 'n_estimators': 1500}
Accuracy =  0.767814352377349
