In [243]:
import pyspark
from pyspark.sql import *
from pyspark import SparkContext, SparkConf

from pyspark.sql import *
from pyspark.sql.functions import *

from pyspark.sql.types import StructType, StructField, IntegerType, StringType


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append('..')
from src.utils import MyUtils

import os.path
from os import listdir
from os.path import isfile, join
import sys

import subprocess
import yaml
import sqlite3

if (sys.modules.get('src.logistic_regression') is not None): 
    del sys.modules['src.logistic_regression']
import src.logistic_regression as logic

if (sys.modules.get('src.people_generator') is not None): 
    del sys.modules['src.people_generator']
import src.people_generator

# Add the parent directory to the system path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

print(parent_dir)

/home/ho-yu


In [244]:
print("Current working directory set to:", os.getcwd())

# Read the config.yaml file
if os.path.exists('config.yaml'):
    with open('config.yaml', 'r') as file:
        config = yaml.safe_load(file)

if os.path.exists('../config.yaml'):
    with open('../config.yaml', 'r') as file:
        config = yaml.safe_load(file)

dir = config['fileio']['working_directory2']

print("dir: " + dir)

# Set the current working directory
os.chdir(dir)
print("Current working directory set to:", os.getcwd())

Current working directory set to: /home/ho-yu/bsg
dir: /home/ho-yu/bsg
Current working directory set to: /home/ho-yu/bsg


In [245]:
# READ in data
df_X_train = pd.read_csv('data/X_train.csv', header=None)
df_y_train = pd.read_csv('data/y_train.csv', header=None)

df_X_test = pd.read_csv('data/X_test.csv', header=None)
df_y_test = pd.read_csv('data/y_test.csv', header=None)

# save in numpy arrays
X_train = df_X_train.to_numpy()
y_train = df_y_train.to_numpy()
X_test = df_X_test.to_numpy()
y_test = df_y_test.to_numpy()

# get training set size
n_train = X_train.shape[0]

# normalize all features to [0,1] or [-1,1]
X_all = MyUtils.normalize_neg1_pos1(np.concatenate((X_train, X_test), axis=0))


X_train = X_all[:n_train]
X_test = X_all[n_train:]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

print()
print(X_train)

(17, 3)
(17, 1)
(20, 3)
(20, 1)

[[-1.         -0.5505618  -0.9999943 ]
 [-1.          0.12359551 -0.97958466]
 [ 1.         -0.68539326 -0.99990489]
 [-1.         -0.23595506 -0.99998677]
 [ 1.          0.01123596  1.        ]
 [-1.         -0.21348315 -0.99999956]
 [ 1.         -0.68539326 -0.99999942]
 [ 1.         -0.68539326 -0.99999978]
 [-1.          0.3258427  -0.97096821]
 [-1.         -0.34831461 -0.99999999]
 [-1.         -0.50561798 -0.9999836 ]
 [-1.         -0.34831461 -0.99995775]
 [ 1.         -0.43820225 -0.99983335]
 [-1.         -0.46067416 -0.99997187]
 [ 1.         -0.73033708 -0.999998  ]
 [-1.         -0.43820225 -0.99997847]
 [-1.         -0.68539326 -0.99999845]]


In [246]:
log = logic.LogisticRegression()

In [247]:
log.fit(X_train, y_train, lam = 0.5, eta = 0.001, iterations = 50000, SGD = False, mini_batch_size = 20, degree = 1)

In [248]:
print('misclassfied percentage from training: ', log.error(X_train, y_train)/X_train.shape[0])
print('misclassfied percentage from validation: ', log.error(X_test, y_test)/X_test.shape[0])

misclassfied percentage from training:  0.29411764705882354
misclassfied percentage from validation:  0.2


In [251]:
# preds = log.predict(X_test)
print(log.degree)
sample = [X_train[1]]
print(sample)
err = log.error(sample, y_train[0])
print("err:", err)
print("predict:", log.predict(sample))

print("w: " + str(log.w))

1
[array([-1.        ,  0.12359551, -0.97958466])]
err: 0
signals [[-2.30563091]]
predict: [[0.09065768]]
w: [[-1.1010448 ]
 [ 0.44655989]
 [ 0.34529369]
 [ 0.81739027]]


In [250]:
conf = SparkConf().setAppName("appName").setMaster("local")

try:
    sc.stop()
except:
    pass

sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()


result_data = []
results = pd.DataFrame(columns=["i", "Status", "Predicted Probability", "Label"])

for i in range(y_test.shape[0]):
    misclassified = "             "
    if np.sign(preds[i]-0.5) != y_test[i]:
        misclassified = "MISCLASSIFIED"
    result_data.append([i, misclassified, preds[i][0], y_test[i][0]])
    # print("i: ", i, ", ", misclassified + ", predicted probablity of being +1 is: ", preds[i], ", label is", y_test[i])   

results = pd.DataFrame(result_data, columns=["i", "Status", "Predicted Probability of +1", "Label"])


bsg_people_train = spark.read.csv(dir + "/csv/people.csv", header=True, inferSchema=True) 
bsg_people_test = spark.read.csv(dir + "/data/bsg_people_data.csv", header=True, inferSchema=True)

results_spark_df = spark.createDataFrame(results)
results_full = results_spark_df.join(bsg_people_test, results_spark_df.i == bsg_people_test.id, "inner")
results_full = results_full.select(
    ["id",
     "first_name", 
     "last_name", 
     "Status", 
     "Predicted Probability of +1", 
     "Label", 
     "dob",
     "weight_lbs",
     "height_m",
     "gender",
     "dna_mutations_n" ,"is_cylon"
    ])

pd.set_option('display.max_rows', None)
results_pd = results_full.toPandas() #.sort_values("Status", ascending=False)
results_pd


25/02/16 20:38:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Unnamed: 0,id,first_name,last_name,Status,Predicted Probability of +1,Label,dob,weight_lbs,height_m,gender,dna_mutations_n,is_cylon
0,1,Christopher,Greer,,0.242288,-1,3677-04-22,123,1.58,F,140136,-1
1,2,John,Harris,,0.106439,-1,3662-11-11,115,1.62,F,628871,-1
2,3,Glenda,Wright,,0.115676,-1,3674-05-18,123,1.69,M,227274,-1
3,4,Jeanne,Trujillo,,0.081482,-1,3662-10-08,176,1.71,M,387437,-1
4,5,Audrey,Hunter,,0.179335,-1,3712-04-20,108,1.84,M,266538,-1
5,6,Gerald,Dunn,,0.220234,-1,3712-02-06,130,1.52,F,475191,-1
6,7,Stephen,Allen,MISCLASSIFIED,0.067064,1,3679-01-03,190,1.53,F,635813,-1
7,8,Joshua,Rodriguez,,0.111735,-1,3739-05-29,114,1.88,M,93,1
8,9,Brian,Martinez,,0.205771,-1,3667-12-20,248,1.85,M,287141,-1
9,10,Ruth,Hensley,,0.062369,-1,3689-12-25,138,1.64,F,325604,-1
