This notebook trains a logistic regression model, finds the optimal value of C, and reports F1 and log loss scores.

In [1]:
import numpy as np
import pandas as pd
import zipfile
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [None]:
# Unzip data files into the "csv" subdirectory
# (unless you have already done this since running the Data Set Up notebook)

# **IMPORTANT**  This will overwrite existing files in the "csv" folder in your local repo
# with the most recent data files from the zip archives listed below

# Each archive is opened in a `with` block so its file handle is closed
# even if extraction raises part-way through.
data_archives = (
    "data_subset.zip",  # 80% training data
    "testing.zip",      # development and training data
    "data.zip",         # full set of training data for creating predictions to submit to Kaggle
)
for archive_name in data_archives:
    with zipfile.ZipFile(archive_name, "r") as archive:
        # No target path is given, so members are extracted relative to the
        # current working directory.
        archive.extractall()

In [2]:
# Development-stage inputs: read the training and development csv files into numpy arrays.
# Feature files are parsed with np.loadtxt's default dtype; label files are read as strings.
train_data = np.loadtxt("csv/train_data.csv", delimiter=",")
dev_data = np.loadtxt("csv/dev_data.csv", delimiter=",")
train_labels = np.loadtxt("csv/train_labels.csv", delimiter=",", dtype=str)
dev_labels = np.loadtxt("csv/dev_labels.csv", delimiter=",", dtype=str)

In [7]:
# Kaggle-stage inputs: read the full training set and the test set into numpy arrays.
# Feature files are parsed with np.loadtxt's default dtype; the label file is read as strings.
# No label file is loaded for the test set.
train_data_all = np.loadtxt("csv/train_data_all.csv", delimiter=",")
test_data_all = np.loadtxt("csv/test_data_all.csv", delimiter=",")
train_labels_all = np.loadtxt("csv/train_labels_all.csv", delimiter=",", dtype=str)

In [3]:
# Show the shape of each development-stage array so it can be compared
# with the shapes from before the csv conversion.
print("train_data shape is {}".format(train_data.shape))
print("train_labels shape is {}".format(train_labels.shape))
print("dev_data shape is {}".format(dev_data.shape))
print("dev_labels shape is {}".format(dev_labels.shape))

train_data shape is (702439, 58)
train_labels shape is (702439,)
dev_data shape is (175610, 58)
dev_labels shape is (175610,)


In [8]:
# Show the shape of each full-data array used for the Kaggle submission.
print("train_data_all shape is {}".format(train_data_all.shape))
print("train_labels_all shape is {}".format(train_labels_all.shape))
print("test_data_all shape is {}".format(test_data_all.shape))

train_data_all shape is (878049, 58)
train_labels_all shape is (878049,)
test_data_all shape is (884262, 58)


In [4]:
# Set up functions for training logistic regression model and finding optimal value of C

def TrainLR(data, labels, test_data, C_value=1.0):
    """Fit a logistic regression classifier and score a second data set.

    Args:
        data: training features, passed to ``LogisticRegression.fit``.
        labels: training labels, passed to ``LogisticRegression.fit``.
        test_data: features for which class probabilities are computed.
        C_value: value passed as ``C`` to ``LogisticRegression``. In
            scikit-learn this is the inverse of regularization strength
            (smaller values mean stronger regularization); it is not a
            learning rate. Defaults to 1.0.

    Returns:
        A ``(model, probabilities)`` tuple: the fitted ``LogisticRegression``
        and the result of ``model.predict_proba(test_data)``.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = LogisticRegression(C=C_value).fit(data, labels)
    probabilities = model.predict_proba(test_data)
    return model, probabilities

def find_C(data, labels, dev_data, dev_labels, C_values):
    """Train one logistic regression model per candidate C and print its scores.

    For every value in ``C_values`` a model is fitted on ``data``/``labels``
    via ``TrainLR`` and evaluated on ``dev_data``/``dev_labels``. The weighted
    F1 score and the log loss are printed, one line per C. Nothing is returned.

    Note that this cannot be used on test data from Kaggle
    because we do not have labels for that data.  This function is intended to only be used
    in the development stage with the development data.

    Args:
        data: training features.
        labels: training labels.
        dev_data: development features to predict on.
        dev_labels: true labels for ``dev_data``.
        C_values: iterable of candidate values for the ``C`` parameter.
    """
    for C in C_values:
        LR, pp = TrainLR(data, labels, dev_data, C)
        predictions = LR.predict(dev_data)
        f1 = metrics.f1_score(dev_labels, predictions, average="weighted")
        # Pass the model's own class list explicitly. Without it, log_loss infers
        # the classes from dev_labels alone, which fails (or misaligns columns)
        # when a class seen in training does not occur in the development split.
        logloss = metrics.log_loss(dev_labels, pp, labels=LR.classes_)

        # Print F1 score and log loss for each value of C
        print("For C =", C, "the F1 score is", round(f1, 6), "and the Log Loss score is", round(logloss, 6))
    print("\n")

In [None]:
# If there are additional changes to make to the data for this model
# that would be easier to do in pandas, uncomment and run this code.
# This model works the same whether the data is in numpy arrays or pandas DataFrames,
# so other models presumably do as well.

#train_data = pd.DataFrame(train_data)
#train_labels = pd.DataFrame(train_labels)
#dev_data = pd.DataFrame(dev_data)
#dev_labels = pd.DataFrame(dev_labels)
#train_data_all = pd.DataFrame(train_data_all)
#train_labels_all = pd.DataFrame(train_labels_all)
#test_data_all = pd.DataFrame(test_data_all)

In [5]:
# Sweep the candidate values of C: fit on the 80% training data and
# score each model on the development data.
C_values = [
    0.0001, 0.001, 0.01, 0.1, 0.5,
    1.0, 2.0, 10.0, 100.0, 1000.0,
]
find_C(
    data=train_data,
    labels=train_labels,
    dev_data=dev_data,
    dev_labels=dev_labels,
    C_values=C_values,
)

  'precision', 'predicted', average, warn_for)


For C = 0.0001 the F1 score is 0.147075 and the Log Loss score is 3.014795
For C = 0.001 the F1 score is 0.150284 and the Log Loss score is 2.638404
For C = 0.01 the F1 score is 0.151366 and the Log Loss score is 2.551881
For C = 0.1 the F1 score is 0.151589 and the Log Loss score is 2.543797
For C = 0.5 the F1 score is 0.151615 and the Log Loss score is 2.543427
For C = 1.0 the F1 score is 0.151579 and the Log Loss score is 2.543396
For C = 2.0 the F1 score is 0.151605 and the Log Loss score is 2.543383
For C = 10.0 the F1 score is 0.151657 and the Log Loss score is 2.543385
For C = 100.0 the F1 score is 0.151619 and the Log Loss score is 2.543447
For C = 1000.0 the F1 score is 0.151616 and the Log Loss score is 2.543544




In [4]:
# Train model with a single value of C with 80% training data and development data
C_value = 0.5
LR, pp = TrainLR(train_data, train_labels, dev_data, C_value)
# Give log_loss the fitted model's class list so the probability columns are
# matched to the right labels even if some class is missing from dev_labels.
logloss = metrics.log_loss(dev_labels, pp, labels=LR.classes_)
print(logloss)

2.54342653545


In [9]:
# Final fit for the Kaggle submission: train on the full training set with the
# chosen value of C and compute class probabilities for the Kaggle test set.
# This rebinds LR and pp, replacing the development-stage model and probabilities.
# NOTE(review): the saved output of this cell contains a warning fragment mentioning
# np.exp(prob, prob), and the saved prediction table shows exact zeros and repeated
# 7.142857e-02 values -- confirm that test_data_all is preprocessed the same way as
# train_data_all before trusting these probabilities.
C_value = 2.0
LR, pp = TrainLR(
    train_data_all,
    train_labels_all,
    test_data_all,
    C_value=C_value,
)

  np.exp(prob, prob)


In [10]:
# Build the Kaggle submission table: one row per test example, one column per category.
# The column names are hard-coded here; pandas raises if their count differs from
# the number of columns in pp.
# NOTE(review): this assumes the header order matches the column order of pp
# (i.e. LR.classes_) -- confirm.
headers = [
    "ARSON",
    "ASSAULT",
    "BAD CHECKS",
    "BRIBERY",
    "BURGLARY",
    "DISORDERLY CONDUCT",
    "DRIVING UNDER THE INFLUENCE",
    "DRUG/NARCOTIC",
    "DRUNKENNESS",
    "EMBEZZLEMENT",
    "EXTORTION",
    "FAMILY OFFENSES",
    "FORGERY/COUNTERFEITING",
    "FRAUD",
    "GAMBLING",
    "KIDNAPPING",
    "LARCENY/THEFT",
    "LIQUOR LAWS",
    "LOITERING",
    "MISSING PERSON",
    "NON-CRIMINAL",
    "OTHER OFFENSES",
    "PORNOGRAPHY/OBSCENE MAT",
    "PROSTITUTION",
    "RECOVERED VEHICLE",
    "ROBBERY",
    "RUNAWAY",
    "SECONDARY CODES",
    "SEX OFFENSES FORCIBLE",
    "SEX OFFENSES NON FORCIBLE",
    "STOLEN PROPERTY",
    "SUICIDE",
    "SUSPICIOUS OCC",
    "TREA",
    "TRESPASS",
    "VANDALISM",
    "VEHICLE THEFT",
    "WARRANTS",
    "WEAPON LAWS",
]
row_ids = list(range(len(test_data_all)))
data = pd.DataFrame(pp, index=row_ids, columns=headers)
# Name the columns axis "Id" so the printed table shows that label in its header corner.
data.columns.name = "Id"
print(data)

Id              ARSON        ASSAULT     BAD CHECKS        BRIBERY  \
0        0.000000e+00  6.794425e-103   0.000000e+00   7.429346e-84   
1        0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
2        0.000000e+00   2.694877e-97   0.000000e+00   9.160959e-79   
3        0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
4        0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
5        0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
6       3.647829e-281   7.142857e-02   0.000000e+00   7.142857e-02   
7        0.000000e+00   1.953106e-40   0.000000e+00   2.904655e-26   
8        0.000000e+00  5.157686e-235   0.000000e+00  1.214094e-205   
9        0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
10       0.000000e+00   7.142857e-02   0.000000e+00   7.142857e-02   
11       0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
12       0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   
13       0.000000e+0

Create zipped csv file for Kaggle
#### Before running, update the filename in every line of the following code
Add something unique after our names to avoid overwriting other submission files

In [12]:
# Write the prediction table to csv; the index is written as a column labelled "Id".
data.to_csv(
    "Williams_Gascoigne_Vignola_Regression1.csv",
    index_label="Id",
)

In [13]:
# Compress the submission csv into a zip archive. The `with` block guarantees the
# archive is closed (and its central directory written) even if write() raises.
with zipfile.ZipFile("Williams_Gascoigne_Vignola_Regression1.zip", "w") as zip_probs:
    zip_probs.write("Williams_Gascoigne_Vignola_Regression1.csv", compress_type=zipfile.ZIP_DEFLATED)

### Results from previous datasets and/or model parameters

Results from dataset as of Saturday 11/18, with weather added, latitude outliers removed, binarized and normalized features:

For C = 0.0001 the F1 score is 0.147075 and the Log Loss score is 3.014795  
For C = 0.001 the F1 score is 0.150284 and the Log Loss score is 2.638404  
For C = 0.01 the F1 score is 0.151366 and the Log Loss score is 2.551881  
For C = 0.1 the F1 score is 0.151589 and the Log Loss score is 2.543797  
For C = 0.5 the F1 score is 0.151615 and the Log Loss score is 2.543427  
For C = 1.0 the F1 score is 0.151579 and the Log Loss score is 2.543396  
**For C = 2.0 the F1 score is 0.151605 and the Log Loss score is 2.543383**  
For C = 10.0 the F1 score is 0.151657 and the Log Loss score is 2.543385  
For C = 100.0 the F1 score is 0.151619 and the Log Loss score is 2.543447  
For C = 1000.0 the F1 score is 0.151616 and the Log Loss score is 2.543544  
