In [12]:
!pip install scikit-learn




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import time
start = time.time()
# import warnings, requests, zipfile, io
# warnings.simplefilter('ignore')
import pandas as pd
# from scipy.io import arff

import os
# import boto3
# import sagemaker
# from sagemaker.image_uris import retrieve
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
%%time

def plot_roc(test_labels, target_predicted_binary):
    """Print confusion-matrix metrics and plot the ROC curve with its thresholds.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth labels. The confusion matrix is unpacked into four
        cells (TN, FP, FN, TP), so exactly two classes must be present.
    target_predicted_binary : array-like
        Predictions for the same samples. The same values are used for the
        confusion matrix, the AUC and the ROC curve.

    Returns
    -------
    None
        Metrics are printed as percentages and the figure is shown.

    Raises
    ------
    ValueError
        If the confusion matrix does not have exactly four cells.
    """
    # Imported locally so the function does not rely on a notebook-level numpy import.
    import numpy as np

    TN, FP, FN, TP = confusion_matrix(test_labels, target_predicted_binary).ravel()
    # Sensitivity, hit rate, recall, or true positive rate
    Sensitivity = float(TP) / (TP + FN) * 100
    # Specificity or true negative rate
    Specificity = float(TN) / (TN + FP) * 100
    # Precision or positive predictive value
    Precision = float(TP) / (TP + FP) * 100
    # Negative predictive value
    NPV = float(TN) / (TN + FN) * 100
    # Fall out or false positive rate
    FPR = float(FP) / (FP + TN) * 100
    # False negative rate
    FNR = float(FN) / (TP + FN) * 100
    # False discovery rate
    FDR = float(FP) / (TP + FP) * 100
    # Overall accuracy
    ACC = float(TP + TN) / (TP + FP + FN + TN) * 100

    print(f"Sensitivity or TPR: {Sensitivity}%")
    print(f"Specificity or TNR: {Specificity}%")
    print(f"Precision: {Precision}%")
    print(f"Negative Predictive Value: {NPV}%")
    print(f"False Positive Rate: {FPR}%")
    print(f"False Negative Rate: {FNR}%")
    print(f"False Discovery Rate: {FDR}%")
    print(f"Accuracy: {ACC}%")

    # Use the labels passed by the caller; the function must not silently
    # replace them with data read from a notebook-level variable.
    print("Validation AUC", roc_auc_score(test_labels, target_predicted_binary))

    fpr, tpr, thresholds = roc_curve(test_labels, target_predicted_binary)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")

    # create the axis of thresholds (scores)
    # Recent scikit-learn releases report the first threshold as +inf, which
    # cannot be used as an axis limit, so only finite thresholds are drawn.
    finite = np.isfinite(thresholds)
    finite_thresholds = thresholds[finite]
    ax2 = plt.gca().twinx()
    ax2.plot(fpr[finite], finite_thresholds, markeredgecolor='r', linestyle='dashed', color='r')
    ax2.set_ylabel('Threshold', color='r')
    if finite_thresholds.size:
        ax2.set_ylim([finite_thresholds.min(), finite_thresholds.max()])
    ax2.set_xlim([fpr[0], fpr[-1]])

    # Render the figure built above instead of opening (and printing) a new empty one.
    plt.show()

def plot_confusion_matrix(test_labels, target_predicted):
    """Show the confusion matrix of the predictions as an annotated heatmap.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth labels (heatmap rows, "True Class").
    target_predicted : array-like
        Predicted labels for the same samples (heatmap columns, "Predicted Class").

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    matrix = confusion_matrix(test_labels, target_predicted)
    df_confusion = pd.DataFrame(matrix)
    colormap = sns.color_palette("BrBG", 10)
    # Cells are sample counts, so annotate them as integers; cbar takes a boolean.
    sns.heatmap(df_confusion, annot=True, fmt='d', cbar=False, cmap=colormap)
    plt.title("Confusion Matrix")
    plt.ylabel("True Class")
    plt.xlabel("Predicted Class")
    # Compute the layout only after the title and axis labels exist,
    # otherwise they are not taken into account and may be clipped.
    plt.tight_layout()
    plt.show()
    

CPU times: total: 0 ns
Wall time: 0 ns


In [15]:
# Read the crop statistics once; `crop_df` is the raw frame and `df` starts out
# as another name for the very same object.
crop_df = pd.read_csv('cropStats.csv', sep=',')
df = crop_df
df


Unnamed: 0,Country,Location,1900,1901,1902,1903,1904,1905,1906,1907,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,Total_Hectares
0,United States,ALABAMA,1064308.4,1044074.4,1084542.4,1088589.2,1052168.0,1072402.0,1048121.2,1011700.0,...,101170.0,101170.0,101170.0,119380.6,119380.6,115333.8,99146.6,127474.2,95099.8,78248520.0
1,United States,ARIZONA,4451.48,4856.16,4046.8,4856.16,4046.8,5260.84,5260.84,6070.2,...,8093.6,8902.96,14163.8,12949.76,20638.68,11331.04,14568.48,20234.0,12949.76,1082924.0
2,United States,ARKANSAS,922670.4,886249.2,906483.2,870062.0,837687.6,821500.4,821500.4,849828.0,...,165918.8,153778.4,210433.6,281252.6,352071.6,214480.4,180082.6,301486.6,240784.6,48255660.0
3,United States,CALIFORNIA,25090.16,25494.84,25899.52,25090.16,23876.12,24685.48,25090.16,24280.8,...,64748.8,72842.4,60702.0,72842.4,72842.4,38444.6,24280.8,40468.0,32374.4,6045919.0
4,United States,COLORADO,38039.92,44110.12,53013.08,57869.24,66367.52,72842.4,78912.6,84982.8,...,400633.2,489662.8,526084.0,408726.8,396586.4,408726.8,384446.0,473475.6,526084.0,32483260.0
5,United States,DELAWARE,78912.6,78103.24,77698.56,80126.64,78912.6,76889.2,76889.2,76889.2,...,65962.84,70009.64,73651.76,72033.04,70414.32,67986.24,66367.52,66367.52,69200.28,7686087.0
6,United States,FLORIDA,232691.0,230667.6,238761.2,240784.6,240784.6,248878.2,246854.8,240784.6,...,14973.16,10117.0,13354.44,16187.2,31565.04,16187.2,20234.0,16187.2,14973.16,18142210.0
7,United States,GEORGIA,1444707.6,1388052.4,1388052.4,1363771.6,1339490.8,1408286.4,1379958.8,1367818.4,...,149731.6,99146.6,109263.6,125450.8,188176.2,125450.8,115333.8,137591.2,99146.6,103387200.0
8,United States,IDAHO,4046.8,4046.8,3642.12,4046.8,3237.44,3642.12,3642.12,4046.8,...,32374.4,44514.8,48561.6,54631.8,46538.2,32374.4,28327.6,40468.0,46538.2,1780997.0
9,United States,ILLINOIS,4232952.8,4330076.0,4390778.0,4269374.0,4249140.0,4249140.0,4249140.0,4208672.0,...,4775224.0,5018032.0,4997798.0,4957330.0,4775224.0,4754990.0,4653820.0,4633586.0,4431246.0,465592400.0


We only need the location and the yearly values, so I will remove the Country column and the Total_Hectares column.


In [16]:

# Positional selection: keep every row, and every column except the first and
# the last one (Country and Total_Hectares in the frame displayed above).
inner_columns = slice(1, -1)
df = crop_df.iloc[slice(None), inner_columns]
df

Unnamed: 0,Location,1900,1901,1902,1903,1904,1905,1906,1907,1908,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,ALABAMA,1064308.4,1044074.4,1084542.4,1088589.2,1052168.0,1072402.0,1048121.2,1011700.0,1027887.2,...,95099.8,101170.0,101170.0,101170.0,119380.6,119380.6,115333.8,99146.6,127474.2,95099.8
1,ARIZONA,4451.48,4856.16,4046.8,4856.16,4046.8,5260.84,5260.84,6070.2,6474.88,...,6070.2,8093.6,8902.96,14163.8,12949.76,20638.68,11331.04,14568.48,20234.0,12949.76
2,ARKANSAS,922670.4,886249.2,906483.2,870062.0,837687.6,821500.4,821500.4,849828.0,866015.2,...,174012.4,165918.8,153778.4,210433.6,281252.6,352071.6,214480.4,180082.6,301486.6,240784.6
3,CALIFORNIA,25090.16,25494.84,25899.52,25090.16,23876.12,24685.48,25090.16,24280.8,23471.44,...,68795.6,64748.8,72842.4,60702.0,72842.4,72842.4,38444.6,24280.8,40468.0,32374.4
4,COLORADO,38039.92,44110.12,53013.08,57869.24,66367.52,72842.4,78912.6,84982.8,105216.8,...,408726.8,400633.2,489662.8,526084.0,408726.8,396586.4,408726.8,384446.0,473475.6,526084.0
5,DELAWARE,78912.6,78103.24,77698.56,80126.64,78912.6,76889.2,76889.2,76889.2,76889.2,...,61511.36,65962.84,70009.64,73651.76,72033.04,70414.32,67986.24,66367.52,66367.52,69200.28
6,FLORIDA,232691.0,230667.6,238761.2,240784.6,240784.6,248878.2,246854.8,240784.6,234714.4,...,14163.8,14973.16,10117.0,13354.44,16187.2,31565.04,16187.2,20234.0,16187.2,14973.16
7,GEORGIA,1444707.6,1388052.4,1388052.4,1363771.6,1339490.8,1408286.4,1379958.8,1367818.4,1355678.0,...,125450.8,149731.6,99146.6,109263.6,125450.8,188176.2,125450.8,115333.8,137591.2,99146.6
8,IDAHO,4046.8,4046.8,3642.12,4046.8,3237.44,3642.12,3642.12,4046.8,4046.8,...,32374.4,32374.4,44514.8,48561.6,54631.8,46538.2,32374.4,28327.6,40468.0,46538.2
9,ILLINOIS,4232952.8,4330076.0,4390778.0,4269374.0,4249140.0,4249140.0,4249140.0,4208672.0,3965864.0,...,4815692.0,4775224.0,5018032.0,4997798.0,4957330.0,4775224.0,4754990.0,4653820.0,4633586.0,4431246.0


In [17]:
# Move the most recent year to the front of the frame (presumably it is the
# target column -- confirm). The column is selected by name rather than by
# rotating the column list, so re-running this cell leaves the order
# unchanged instead of shifting a different column to the front each time.
target_column = '2017'
cols = [target_column] + [col for col in df.columns if col != target_column]
df = df[cols]
df

Unnamed: 0,2017,Location,1900,1901,1902,1903,1904,1905,1906,1907,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,95099.8,ALABAMA,1064308.4,1044074.4,1084542.4,1088589.2,1052168.0,1072402.0,1048121.2,1011700.0,...,113310.4,95099.8,101170.0,101170.0,101170.0,119380.6,119380.6,115333.8,99146.6,127474.2
1,12949.76,ARIZONA,4451.48,4856.16,4046.8,4856.16,4046.8,5260.84,5260.84,6070.2,...,8902.96,6070.2,8093.6,8902.96,14163.8,12949.76,20638.68,11331.04,14568.48,20234.0
2,240784.6,ARKANSAS,922670.4,886249.2,906483.2,870062.0,837687.6,821500.4,821500.4,849828.0,...,238761.2,174012.4,165918.8,153778.4,210433.6,281252.6,352071.6,214480.4,180082.6,301486.6
3,32374.4,CALIFORNIA,25090.16,25494.84,25899.52,25090.16,23876.12,24685.48,25090.16,24280.8,...,76889.2,68795.6,64748.8,72842.4,60702.0,72842.4,72842.4,38444.6,24280.8,40468.0
4,526084.0,COLORADO,38039.92,44110.12,53013.08,57869.24,66367.52,72842.4,78912.6,84982.8,...,428960.8,408726.8,400633.2,489662.8,526084.0,408726.8,396586.4,408726.8,384446.0,473475.6
5,69200.28,DELAWARE,78912.6,78103.24,77698.56,80126.64,78912.6,76889.2,76889.2,76889.2,...,74865.8,61511.36,65962.84,70009.64,73651.76,72033.04,70414.32,67986.24,66367.52,66367.52
6,14973.16,FLORIDA,232691.0,230667.6,238761.2,240784.6,240784.6,248878.2,246854.8,240784.6,...,14163.8,14163.8,14973.16,10117.0,13354.44,16187.2,31565.04,16187.2,20234.0,16187.2
7,99146.6,GEORGIA,1444707.6,1388052.4,1388052.4,1363771.6,1339490.8,1408286.4,1379958.8,1367818.4,...,182106.0,125450.8,149731.6,99146.6,109263.6,125450.8,188176.2,125450.8,115333.8,137591.2
8,46538.2,IDAHO,4046.8,4046.8,3642.12,4046.8,3237.44,3642.12,3642.12,4046.8,...,42491.4,32374.4,32374.4,44514.8,48561.6,54631.8,46538.2,32374.4,28327.6,40468.0
9,4431246.0,ILLINOIS,4232952.8,4330076.0,4390778.0,4269374.0,4249140.0,4249140.0,4249140.0,4208672.0,...,5281074.0,4815692.0,4775224.0,5018032.0,4997798.0,4957330.0,4775224.0,4754990.0,4653820.0,4633586.0


In [23]:
# Hold out 20% of the rows, then cut that hold-out in half to obtain the test
# and validation sets. Both splits use the same fixed seed for reproducibility.
train, test_and_validate = train_test_split(df, random_state=42, test_size=0.2)
test, validate = train_test_split(test_and_validate, random_state=42, test_size=0.5)

prefix = 'lab3'

# NOTE(review): the 'vertebral_' file names look carried over from another
# dataset -- confirm they are intended for the crop data.
train_file, test_file, validate_file = (
    'vertebral_train.csv',
    'vertebral_test.csv',
    'vertebral_validate.csv',
)

In [None]:
# Training hyperparameters, kept as a plain dict.
# NOTE(review): the keys look like XGBoost training parameters, and "silent"
# is an int while the other values are strings -- confirm against the
# estimator that consumes this dict.
hyperparams = {
    "num_round": "42",
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "silent": 1,
}
