In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('../')
sys.path.append('../MoitraRohatgi/')
import algorithms
import auditor_tools
from sklearn.linear_model import LinearRegression
import time
import matplotlib.pyplot as plt

In [2]:
# Location of the Martinez dataset CSV, relative to this notebook.
martinez_path = "../../data/martinez.csv"
# Read the whole file into a DataFrame; later cells select columns from it.
martinez = pd.read_csv(filepath_or_buffer=martinez_path)

In [3]:
# Dependent variable of the regression.
Y = martinez["lngdp14"].to_numpy()

# Design matrix, following Martinez, equation 6.
# Take every column name, pull out "lndn13_fiw", skip the first four of the
# remaining names, and re-append "lndn13_fiw" at the end: it is the
# coefficient whose sign we care about, so it is placed last.
keys = list(martinez.columns)
keys.remove("lndn13_fiw")
feature_columns = [*keys[4:], "lndn13_fiw"]
X = martinez[feature_columns].to_numpy()

In [4]:
# Sanity check that the right dataset was loaded: fit plain OLS and compare
# against the Martinez paper, Table 2, column 4. The first few printed
# coefficients should correspond to the entries of that column in descending
# order. (Many coefficients are listed because every categorical variable was
# one-hot encoded.)
#
# LinearRegression fits an intercept by default (fit_intercept=True, spelled
# out here), so the auditors below must be run with intercept=True as well.
#
# NOTE(review): several coefficients in the recorded output are of order
# 1e7-1e10, presumably from collinearity between the one-hot columns and the
# intercept -- confirm before interpreting those entries.

reg = LinearRegression(fit_intercept=True).fit(X, Y)
reg.coef_

array([ 2.14052260e-01, -1.51343225e-02,  1.50905471e-03, -2.35535267e+10,
       -2.31168683e+10, -2.30757580e+10,  1.15446316e+00,  4.11103192e+07,
        4.77768669e+08, -3.42086554e-01,  4.77768676e+08,  4.11103202e+07,
        4.77768670e+08,  4.77768677e+08,  4.77768675e+08, -4.43245518e+00,
        1.51866734e+00,  4.77768675e+08,  4.77768677e+08,  4.11103217e+07,
        4.11103209e+07,  4.77768673e+08,  4.77768671e+08,  4.11103150e+07,
        2.89191532e+00,  4.77768670e+08,  4.77768673e+08,  4.77768677e+08,
        4.77768669e+08, -4.41996002e+00, -2.29566455e+00,  4.77768674e+08,
        4.11103214e+07,  4.77768677e+08,  4.77768675e+08,  4.77768681e+08,
        3.13211370e+00,  2.86605036e+00,  3.44196844e+00,  3.64392853e+00,
        4.11103209e+07,  4.11103260e+07,  4.11103179e+07,  4.77768674e+08,
        4.77768677e+08, -2.75433159e+00,  4.77768671e+08,  4.77768677e+08,
        4.77768677e+08, -1.22609687e+00,  4.77768669e+08,  4.77768676e+08,
        4.77768675e+08, -

In [6]:
print("Integer Programming (1 min cutoff):")
# `timer` is read again by the integral-solve cell to report total time.
timer = time.time()

# Fractional solve first: it yields the fractional bound, and its weights are
# thresholded later to warm-start the integral solve.
# NOTE(review): the banner above says "1 min cutoff" but the limit passed here
# is 15 -- confirm which one is intended.
bound_frac, val_frac, w, model = auditor_tools.solve_regression_fractional(
    X,
    Y,
    intercept=True,
    time_limit=15,
    verbose=True,
)

elapsed = time.time() - timer
print('time taken: ', elapsed)

# Variant with a much larger time limit, kept for reference:
# bound, val, w, model = auditor_tools.solve_regression_fractional(X,Y,intercept=True,time_limit=3000)

Integer Programming (1 min cutoff):
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-04
set residual constraints
Set parameter NonConvex to value 2
Set parameter TimeLimit to value 15
start solving
Gurobi Optimizer version 9.5.2 build v9.5.2rc0 (mac64[rosetta2])
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads
Optimize a model with 7790 rows, 4106 columns and 7790 nonzeros
Model fingerprint: 0x3e6c8bb7
Model has 212 quadratic constraints
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  QMatrix range    [5e-06, 1e+03]
  QLMatrix range   [6e-02, 1e+03]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+00, 1e+00]
Presolve removed 7790 rows and 0 columns

Continuous model is non-convex -- solving as a MIP

Found heuristic solution: objective -0.0000000
Presolve removed 7790 rows and 0 columns
Presolve time: 0.07s
Presolved: 102452 rows, 29667 columns, 425278 nonzeros
Presol

In [12]:
# Turn the solver's current solution into a 0/1 mask over the samples.
# model.getVars() returns the full variable list on every call, so fetch it
# once instead of once per index.
solver_vars = model.getVars()

# Raw solution values of the first len(Y) variables
# (presumably the per-sample weights -- confirm against auditor_tools).
vals = [solver_vars[i].X for i in range(len(Y))]

# A sample counts as kept (1) when its value exceeds 0.9, otherwise dropped (0).
ws = [1 if value > 0.9 else 0 for value in vals]

In [13]:
# Call the project's `algorithms.ols` helper on the data with the thresholded
# 0/1 weights `ws` and display the result
# (presumably a weighted least-squares fit -- confirm against algorithms).
algorithms.ols(X,Y,ws)

array([ 3.07778278e-01,  8.54284900e-03, -5.42671112e-03,  2.44745154e+01,
        2.61323661e+01,  2.68172179e+01,  1.03479564e+00,  2.67397257e-01,
       -4.86430857e+00, -3.21980261e-01,  2.53809922e+00,  1.30549191e+00,
       -3.81105340e+00,  3.58260143e+00,  8.84939522e-01, -4.49692177e+00,
        1.51554456e+00,  6.73712297e-01,  3.34236283e+00,  2.87116572e+00,
        1.94465513e+00, -7.51721268e-01, -2.19007659e+00, -3.90771464e+00,
        2.93630424e+00, -3.26327146e+00, -1.83501235e-01,  3.31800283e+00,
       -4.81281631e+00, -4.42744453e+00, -2.31298512e+00,  7.31801213e-01,
        2.58156681e+00,  3.20641017e+00,  1.43447463e+00,  7.40230563e+00,
        3.08667214e+00,  2.86590846e+00,  3.42813512e+00,  3.60889729e+00,
        2.05922843e+00,  7.13099568e+00, -1.01191679e+00,  2.45714161e-01,
        3.02780808e+00, -2.67312736e+00, -2.33948093e+00,  3.25773438e+00,
        2.97167328e+00, -1.23020118e+00, -4.28109853e+00,  2.65399456e+00,
        1.17145709e+00, -

In [14]:
# Fraction of rows whose 0/1 weight in `ws` is 0, i.e. rows not kept.
n_rows = len(X)
(n_rows - sum(ws)) / n_rows

0.028241335044929396

In [33]:

# Integral solve, warm-started from the thresholded fractional weights `ws`.
# The response is passed as Y (not -Y) so that this solve, its warm start and
# the fractional bound printed below all refer to the same problem instance,
# consistent with the other solve_regression_* calls in this notebook.
bound, val, w, model = auditor_tools.solve_regression_integral(
    X, Y, intercept=True, time_limit=30, warm_start=ws, verbose=True)

# NOTE: `timer` is set in the fractional-solve cell, so "total time" covers
# everything executed since that cell started.
print("upper bound: " + str((len(Y) - val)))
print("lower bound: " + str((len(Y) - bound_frac))) # fractional bound usually better than integral bound
print("total time: " + str(time.time() - timer))

KeyboardInterrupt: 

In [58]:
# Threshold the current solver solution into a 0/1 mask over the samples,
# report how many samples are kept, and refit ordinary least squares on the
# kept rows only to inspect the resulting coefficients.

# model.getVars() returns the full variable list on every call; fetch it once.
solver_vars = model.getVars()
# Sample count taken from the data instead of a hard-coded 3895.
n_samples = len(Y)

# Strict threshold: only values above .999 count as kept.
warm_start = [1 if solver_vars[i].X > .999 else 0 for i in range(n_samples)]

print(sum(warm_start))

# Row indices of the kept samples.
keep = [i for i, kept in enumerate(warm_start) if kept == 1]

# A disabled experiment used to sit here: it dropped one kept row at a time,
# refit LinearRegression, and stopped once reg.coef_[0] <= 0.

reg = LinearRegression().fit(X[keep,:], Y[keep])
reg.coef_


3785


array([ 3.07841234e-01,  8.52390985e-03, -5.42445205e-03, -1.63816202e+10,
       -1.61188251e+10, -1.61134892e+10,  1.03454286e+00,  5.33583191e+06,
        2.68130924e+08, -3.22177649e-01,  2.68130931e+08,  5.33583295e+06,
        2.68130925e+08,  2.68130932e+08,  2.68130929e+08, -4.49703383e+00,
        1.51553321e+00,  2.68130929e+08,  2.68130932e+08,  5.33583452e+06,
        5.33583359e+06,  2.68130928e+08,  2.68130926e+08,  5.33582774e+06,
        2.93624914e+00,  2.68130925e+08,  2.68130928e+08,  2.68130932e+08,
        2.68130924e+08, -4.42755497e+00, -2.31300330e+00,  2.68130929e+08,
        5.33583423e+06,  2.68130932e+08,  2.68130930e+08,  2.68130936e+08,
        3.08663476e+00,  2.86586425e+00,  3.42809272e+00,  3.60893369e+00,
        5.33583371e+06,  5.33583878e+06,  5.33583063e+06,  2.68130929e+08,
        2.68130932e+08, -2.67321873e+00,  2.68130926e+08,  2.68130932e+08,
        2.68130932e+08, -1.23023105e+00,  2.68130924e+08,  2.68130931e+08,
        2.68130930e+08, -

In [88]:
# Rebuild the 0/1 warm-start mask from the current solver solution, this time
# with the looser 0.9 threshold.
# model.getVars() returns the full variable list on every call; fetch it once.
solver_vars = model.getVars()
# Sample count taken from the data instead of a hard-coded 3895.
n_samples = len(Y)
warm_start = [1 if solver_vars[i].X > 0.9 else 0 for i in range(n_samples)]

# `ws` is the name the solve cells pass as warm_start=...; it aliases the same list.
ws = warm_start

In [97]:
# Long integral solve (time limit 30000), warm-started from the 0/1 mask `ws`.
bound, val, w, model = auditor_tools.solve_regression_integral(
    X,
    Y,
    intercept=True,
    time_limit=30000,
    warm_start=ws,
)

Set parameter NonConvex to value 2
Set parameter TimeLimit to value 30000
Gurobi Optimizer version 10.0.0 build v10.0.0rc2 (mac64[rosetta2])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 1 rows, 4107 columns and 1 nonzeros
Model fingerprint: 0xe85cdefd
Model has 212 quadratic constraints
Variable types: 212 continuous, 3895 integer (3895 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  QMatrix range    [5e-06, 1e+03]
  QLMatrix range   [6e-02, 1e+03]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [0e+00, 0e+00]

User MIP start produced solution with objective 3785 (0.04s)
Loaded user MIP start with objective 3785

Presolve removed 1 rows and 0 columns
Presolve time: 0.07s
Presolved: 57596 rows, 90183 columns, 389164 nonzeros
Presolved model has 57384 SOS constraint(s)
Variable types: 57596 continuous, 32587 integer (32587 binary)
Deterministic concurrent 

In [92]:
# Row indices whose warm-start flag is 1. Iterating over the mask itself
# avoids hard-coding the sample count (3895).
keep = [i for i, kept in enumerate(warm_start) if kept == 1]

# Refit ordinary least squares on the kept rows only.
reg = LinearRegression().fit(X[keep,:], Y[keep])

In [93]:
# Display the fitted coefficients of `reg` (the regression on the kept rows).
reg.coef_

array([ 3.07841234e-01,  8.52390985e-03, -5.42445205e-03, -1.63816202e+10,
       -1.61188251e+10, -1.61134892e+10,  1.03454286e+00,  5.33583191e+06,
        2.68130924e+08, -3.22177649e-01,  2.68130931e+08,  5.33583295e+06,
        2.68130925e+08,  2.68130932e+08,  2.68130929e+08, -4.49703383e+00,
        1.51553321e+00,  2.68130929e+08,  2.68130932e+08,  5.33583452e+06,
        5.33583359e+06,  2.68130928e+08,  2.68130926e+08,  5.33582774e+06,
        2.93624914e+00,  2.68130925e+08,  2.68130928e+08,  2.68130932e+08,
        2.68130924e+08, -4.42755497e+00, -2.31300330e+00,  2.68130929e+08,
        5.33583423e+06,  2.68130932e+08,  2.68130930e+08,  2.68130936e+08,
        3.08663476e+00,  2.86586425e+00,  3.42809272e+00,  3.60893369e+00,
        5.33583371e+06,  5.33583878e+06,  5.33583063e+06,  2.68130929e+08,
        2.68130932e+08, -2.67321873e+00,  2.68130926e+08,  2.68130932e+08,
        2.68130932e+08, -1.23023105e+00,  2.68130924e+08,  2.68130931e+08,
        2.68130930e+08, -

In [94]:
# Baseline for comparison: OLS on the full dataset, intercept included
# (fit_intercept=True is the default, written out here).
reg2 = LinearRegression(fit_intercept=True).fit(X, Y)

In [96]:
# Display the coefficients of the full-data fit `reg2`.
reg2.coef_

array([ 2.14052260e-01, -1.51343225e-02,  1.50905471e-03, -2.35535267e+10,
       -2.31168683e+10, -2.30757580e+10,  1.15446316e+00,  4.11103192e+07,
        4.77768669e+08, -3.42086554e-01,  4.77768676e+08,  4.11103202e+07,
        4.77768670e+08,  4.77768677e+08,  4.77768675e+08, -4.43245518e+00,
        1.51866734e+00,  4.77768675e+08,  4.77768677e+08,  4.11103217e+07,
        4.11103209e+07,  4.77768673e+08,  4.77768671e+08,  4.11103150e+07,
        2.89191532e+00,  4.77768670e+08,  4.77768673e+08,  4.77768677e+08,
        4.77768669e+08, -4.41996002e+00, -2.29566455e+00,  4.77768674e+08,
        4.11103214e+07,  4.77768677e+08,  4.77768675e+08,  4.77768681e+08,
        3.13211370e+00,  2.86605036e+00,  3.44196844e+00,  3.64392853e+00,
        4.11103209e+07,  4.11103260e+07,  4.11103179e+07,  4.77768674e+08,
        4.77768677e+08, -2.75433159e+00,  4.77768671e+08,  4.77768677e+08,
        4.77768677e+08, -1.22609687e+00,  4.77768669e+08,  4.77768676e+08,
        4.77768675e+08, -

In [98]:
# Display the (rows, columns) shape of the design matrix.
X.shape

(3895, 211)

In [4]:
# Run the project's `auditor_tools.spectral_certify` helper on the data and
# display its return value
# (presumably a certified bound -- confirm its meaning against auditor_tools).
auditor_tools.spectral_certify(X,Y)

  WM2[i,:] = identity_coeff * vec_square + phi_coeff * np.dot(phi,vec_square) * phi / d


6.423225243162111e-11

In [61]:
print("KZC21:")
timer = time.time()
# The sensitivity helper is given X with its column order reversed (so the
# last column, lndn13_fiw, comes first) and the negated response.
X_flipped = np.flip(X, axis=1)
kzc_upper_bound = algorithms.sensitivity(X_flipped, -Y)
print("upper bound: " + str(kzc_upper_bound))
print("total time: " + str(time.time() - timer))

KZC21:
upper bound: 173
total time: 107.2057638168335
