# ML with Logistic Regression

In this notebook, we use the logistic regression functions imported from `functions/regularized_logistic_regression.py`.

In [80]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.proj1_helpers import *
from functions.split import *
from functions.regularized_logistic_regression import *
from functions.helpers import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


First, we load the cleaned data

In [58]:
# Path of the training CSV (the file name suggests a single jet subset — TODO confirm).
DATA_TRAIN_PATH = 'data/train_jet_0.csv'
# load_csv_data returns three values, bound here as y, tX and ids
# (presumably labels, feature matrix and sample ids — verify in the helper).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
# Rebind tX and y to whatever prepare_log_reg returns
# (presumably preprocessing for logistic regression — verify in the helper).
tX, y = prepare_log_reg(tX, y)


We split the data into a training set and a test set to check whether the model can predict anything on samples it has not seen.

In [81]:
# Split fraction handed to split_data
# (presumably the share of samples kept for training — TODO confirm).
ratio = 0.8
# NOTE(review): no seed is passed here; check whether split_data fixes one
# internally, otherwise the split is not reproducible between runs.
x_train, y_train, x_test, y_test = split_data(tX, y, ratio)

In [99]:
# Degree passed to build_poly for both splits
# (presumably a polynomial feature expansion — verify in the helper).
degree = 2
tX_train = build_poly(x_train, degree)
# NOTE(review): the name tX_test is rebound further down when the submission
# data is loaded, so this value is only valid until that cell runs.
tX_test = build_poly(x_test, degree)

We plot the loss against the number of iterations while running the regularized logistic regression.

In [103]:
# Define the parameters of the algorithm.
max_iters = 2000  # iteration budget passed to the solver
gamma = 1e-5      # presumably the gradient step size — TODO confirm
lamb = 3          # presumably the regularization strength — TODO confirm

# Run the regularized logistic regression on the training split and keep the
# returned losses and weights.
# NOTE(review): the two trailing positional booleans (False, True) are opaque
# here; check the helper's signature for their meaning.
losses, ws = regularized_logistic_regression(y_train, tX_train, gamma, lamb, max_iters, False, True)

  Iter=0, loss=327655.77779208287, diff=327655.77779208287
  Iter=100, loss=38032.29900531236, diff=-289623.4787867705
  Iter=200, loss=35972.26584620967, diff=-2060.0331591026843
  Iter=300, loss=35256.91953348131, diff=-715.3463127283612
  Iter=400, loss=34916.4053713624, diff=-340.51416211891046
  Iter=500, loss=34717.89937216304, diff=-198.50599919936212
  Iter=600, loss=34587.08661311497, diff=-130.81275904807262
  Iter=700, loss=34494.50364880016, diff=-92.58296431480994
  Iter=800, loss=34426.1164275884, diff=-68.3872212117567
  Iter=900, loss=34374.1724711948, diff=-51.943956393603
  Iter=1000, loss=34333.90519819148, diff=-40.267273003315495
  Iter=1100, loss=34302.169622095724, diff=-31.735576095758006
  Iter=1200, loss=34276.79547447193, diff=-25.37414762379194
  Iter=1300, loss=34256.24062372796, diff=-20.554850743974384
  Iter=1400, loss=34239.386510601114, diff=-16.854113126843004
  Iter=1500, loss=34225.40883952129, diff=-13.977671079825086
  Iter=1600, loss=34213.692029

In [104]:
# Pick the model returned by get_best_model (presumably the weights with the
# lowest recorded loss) and report its loss.
w_star, min_loss = get_best_model(losses, ws)
print("Min loss = %f" % min_loss)

Min loss = 34181.671653


In [106]:
# Evaluate w_star on the held-out split; the helper prints the number and
# percentage of good and wrong predictions.
prediction_log(y_test, tX_test, w_star)

Good prediction: 16024/19983 (80.188160%)
Wrong prediction: 3959/19983 (19.811840%)


## Generate predictions and save output in CSV format for submission

We retrain the model on all the training data before predicting on the test set.

In [None]:
# Define the parameters of the algorithm.
# NOTE(review): these differ from the validated run earlier in the notebook
# (max_iters=2000, gamma=1e-5, lamb=3) — confirm this is intended.
max_iters = 600
gamma = 1e-7

# Train on the full data set (y, tX) and keep the best model.
# NOTE(review): this calls logistic_regression (no lamb) on tX without
# build_poly, whereas the validated model used regularized_logistic_regression
# on degree-2 features — confirm the submission model is the intended one.
# NOTE(review): logistic_regression is not imported by name; verify that one of
# the star imports provides it.
losses, ws = logistic_regression(y, tX, gamma, max_iters)
w_star, min_loss = get_best_model(losses, ws)
print("Min loss = %f"%(min_loss))

In [None]:
DATA_TEST_PATH = 'data/test.csv' # TODO: download test data and supply path here
# The first value returned for the test file is discarded.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
# NOTE(review): the training features went through prepare_log_reg, while the
# test features go through standardize — confirm both apply the same
# transformation, otherwise the learned weights do not match these features.
# NOTE(review): this rebinds tX_test, which earlier held the held-out split.
tX_test, mean_tX_test, std_tX_test = standardize(tX_test)

In [None]:
OUTPUT_PATH = 'output/LR_GD.csv' # TODO: fill in desired name of output file for submission
# Predict labels for the submission data with w_star and write them, together
# with their ids, to OUTPUT_PATH.
# NOTE(review): the held-out evaluation used prediction_log; confirm that
# predict_labels applies the matching decision rule for logistic regression.
y_pred = predict_labels(w_star, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)