In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import os

from filter_data_script import filter_data

In [2]:
species_paths = {
    'American_Crow': './sp_count/American_Crow_2018.pkl',
    'American_Robin': './sp_count/American_Robin_2018.pkl',
    'Turkey_Vulture': './sp_count/Turkey_Vulture_2018.pkl',
    'Mallard': './sp_count/Mallard_2018.pkl',
    'Black-capped_Chickadee': './sp_count/Black-capped_Chickadee_2018.pkl',
    'House_Wren': './sp_count/House_Wren_2018.pkl'
}

checklist_data = pickle.load(open('American_snapshot_10_10_500_each_grid.pkl','rb'))
checklist_data = filter_data(checklist_data, None)

columns = [
    'duration_minutes',
    'Traveling',
    'Stationary',
    'Area',
    'obsvr_species_count',
    'DOY',
    'month',
    'week',
    'year',
    'time_observation_started_minute_of_day',
    'elevation_mean',
    'slope_mean',
    'eastness_mean',
    'northness_mean',
    'elevation_std',
    'slope_std',
    'eastness_std',
    'northness_std',
    'prec',
    'tmax',
    'tmin',
    'bio1',
    'bio2',
    'bio3',
    'bio4',
    'bio5',
    'bio6',
    'bio7',
    'bio8',
    'bio9',
    'bio10',
    'bio11',
    'bio12',
    'bio13',
    'bio14',
    'bio15',
    'bio16',
    'bio17',
    'bio18',
    'bio19',
    'closed_shrublands',
    'closed_shrublands_ed',
    'closed_shrublands_lpi',
    'closed_shrublands_pd',
    'cropland_or_natural_vegetation_mosaics',
    'cropland_or_natural_vegetation_mosaics_ed',
    'cropland_or_natural_vegetation_mosaics_lpi',
    'cropland_or_natural_vegetation_mosaics_pd',
    'croplands',
    'croplands_ed',
    'croplands_lpi',
    'croplands_pd',
    'deciduous_broadleaf_forests',
    'deciduous_broadleaf_forests_ed',
    'deciduous_broadleaf_forests_lpi',
    'deciduous_broadleaf_forests_pd',
    'deciduous_needleleaf_forests',
    'deciduous_needleleaf_forests_ed',
    'deciduous_needleleaf_forests_lpi',
    'deciduous_needleleaf_forests_pd',
    'evergreen_broadleaf_forests',
    'evergreen_broadleaf_forests_ed',
    'evergreen_broadleaf_forests_lpi',
    'evergreen_broadleaf_forests_pd',
    'evergreen_needleleaf_forests',
    'evergreen_needleleaf_forests_ed',
    'evergreen_needleleaf_forests_lpi',
    'evergreen_needleleaf_forests_pd',
    'grasslands',
    'grasslands_ed',
    'grasslands_lpi',
    'grasslands_pd',
    'mixed_forests',
    'mixed_forests_ed',
    'mixed_forests_lpi',
    'mixed_forests_pd',
    'non_vegetated_lands',
    'non_vegetated_lands_ed',
    'non_vegetated_lands_lpi',
    'non_vegetated_lands_pd',
    'open_shrublands',
    'open_shrublands_ed',
    'open_shrublands_lpi',
    'open_shrublands_pd',
    'permanent_wetlands',
    'permanent_wetlands_ed',
    'permanent_wetlands_lpi',
    'permanent_wetlands_pd',
    'savannas',
    'savannas_ed',
    'savannas_lpi',
    'savannas_pd',
    'urban_and_built_up_lands',
    'urban_and_built_up_lands_ed',
    'urban_and_built_up_lands_lpi',
    'urban_and_built_up_lands_pd',
    'water_bodies',
    'water_bodies_ed',
    'water_bodies_lpi',
    'water_bodies_pd',
    'woody_savannas',
    'woody_savannas_ed',
    'woody_savannas_lpi',
    'woody_savannas_pd',
    'entropy'
]

  qresult_copy['week'] = qresult_copy['observation_date'].dt.week


In [3]:
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import recall_score, precision_score, average_precision_score, roc_auc_score, f1_score, cohen_kappa_score

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.metrics import make_scorer


results = {}
for species, species_path in species_paths.items():
    sp_data = pickle.load(open(species_path,'rb'))
    data = pd.merge(checklist_data, sp_data, on='sampling_event_identifier', how='left')
    data['count'][data['count'] > 0] = 1
    data['count'] = data['count'].fillna(0)
    
    X = data[[i for i in data.columns if i in columns]]
    y = data['count']
    
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    """
    
    models = [RandomForestClassifier(), GradientBoostingClassifier(), DecisionTreeClassifier(), LogisticRegression(), XGBClassifier()]
    model_results = {}
    for model in models:
        clf = make_pipeline(StandardScaler(), model)
        
        cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
        scores = cross_validate(
            clf,
            X,
            y,
            cv=cv,
            scoring={
                'recall': make_scorer(recall_score),
                'precision': make_scorer(precision_score),
                'ap': make_scorer(average_precision_score),
                'roc_auc': make_scorer(roc_auc_score),
                'f1': make_scorer(f1_score),
                'cohen_kappa': make_scorer(cohen_kappa_score)
            }
        )

        #model = model.fit(X_train, y_train)
        #preds = model.predict(X_test)

        model_results[model.__class__.__name__] = {
            'recall': {'mean': scores['test_recall'].mean(), 'std': scores['test_recall'].std(), 'n': len(scores['test_recall'])},
            'precision': {'mean': scores['test_precision'].mean(), 'std': scores['test_precision'].std(), 'n': len(scores['test_precision'])},
            'ap': {'mean': scores['test_ap'].mean(), 'std': scores['test_ap'].std(), 'n': len(scores['test_ap'])},
            'roc_auc': {'mean': scores['test_roc_auc'].mean(), 'std': scores['test_roc_auc'].std(), 'n': len(scores['test_roc_auc'])},
            'f1': {'mean': scores['test_f1'].mean(), 'std': scores['test_f1'].std(), 'n': len(scores['test_f1'])},
            'cohen_kappa': {'mean': scores['test_cohen_kappa'].mean(), 'std': scores['test_cohen_kappa'].std(), 'n': len(scores['test_cohen_kappa'])}
        }
        print(f'{species} {model.__class__.__name__}')
    results[species] = model_results

    with open('classifier_results.json', 'w') as fp:
        json.dump(results, fp, indent=2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count'][data['count'] > 0] = 1


American_Crow RandomForestClassifier
American_Crow GradientBoostingClassifier
American_Crow DecisionTreeClassifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

American_Crow LogisticRegression
American_Crow XGBClassifier


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count'][data['count'] > 0] = 1


American_Robin RandomForestClassifier
American_Robin GradientBoostingClassifier
American_Robin DecisionTreeClassifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

American_Robin LogisticRegression
American_Robin XGBClassifier


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count'][data['count'] > 0] = 1


Turkey_Vulture RandomForestClassifier
Turkey_Vulture GradientBoostingClassifier
Turkey_Vulture DecisionTreeClassifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Turkey_Vulture LogisticRegression
Turkey_Vulture XGBClassifier


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count'][data['count'] > 0] = 1


Mallard RandomForestClassifier
Mallard GradientBoostingClassifier
Mallard DecisionTreeClassifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mallard LogisticRegression
Mallard XGBClassifier


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count'][data['count'] > 0] = 1


Black-capped_Chickadee RandomForestClassifier
Black-capped_Chickadee GradientBoostingClassifier
Black-capped_Chickadee DecisionTreeClassifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Black-capped_Chickadee LogisticRegression
Black-capped_Chickadee XGBClassifier


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count'][data['count'] > 0] = 1


House_Wren RandomForestClassifier
House_Wren GradientBoostingClassifier
House_Wren DecisionTreeClassifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

House_Wren LogisticRegression
House_Wren XGBClassifier


        duration_minutes  Traveling  Stationary  Area  effort_distance_km  \
84288              420.0          0           1     0                -1.0   
311933              34.0          0           1     0                -1.0   
485273             120.0          0           1     0                -1.0   
296357             105.0          1           0     0                 4.0   
47213                5.0          0           1     0                -1.0   
...                  ...        ...         ...   ...                 ...   
259178               5.0          0           1     0                -1.0   
365838              35.0          0           1     0                -1.0   
131932             101.0          0           1     0                -1.0   
146867              10.0          0           1     0                -1.0   
121958              10.0          0           1     0                -1.0   

        number_observers  elevation_mean  slope_mean  eastness_mean  \
8428

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'RandomForestClassifier': {'recall': 0.6502959877219908,
  'precision': 0.726115429340841,
  'ap': 0.5376533759690357,
  'roc_auc': 0.7969020808394717},
 'GradientBoostingClassifier': {'recall': 0.387360228020171,
  'precision': 0.6595426971535231,
  'ap': 0.37016473313859943,
  'roc_auc': 0.6706541719536567},
 'DecisionTreeClassifier': {'recall': 0.6266169699627274,
  'precision': 0.6132718201813208,
  'ap': 0.4541825885810646,
  'roc_auc': 0.7678057392892599},
 'LogisticRegression': {'recall': 0.24386099539574654,
  'precision': 0.5239665528206336,
  'ap': 0.2693217068148591,
  'roc_auc': 0.5964176529083762}}