# Plan:

### Phase 2 will utilize the subset of women that were identified as abused in the study and compound extra features that were gleaned in a follow-up survey of these identified women.  These features will be narrowed and modeled to identify risk of being reassaulted.  This work will investigate appropriately on a compounded dataframe of our original features in addition to the new features from the follow-up survey.

### Hypothesis: From domain knowledge of this field, we predict that there will likely be significant identifiers of abuse that can be fed into a machine-learned model.

# ENVIRONMENT

In [1]:
# basic environment and custom scripts
import os
import acquire
import prepare
import explore
import time

# numpy and pandas for dataframe building and manipulation
import pandas as pd
import numpy as np


# matplotlib and seaborn for visualization
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
%matplotlib inline
import seaborn as sns

# import preprocessing for scaling and splitting
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# xgboost for feature selection
import xgboost as xgb

# sklearn machine learning
from sklearn.linear_model import SGDClassifier
# classification reports
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# SVM
from sklearn.svm import SVC
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Dimensionality
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# visualization for tree model:
import graphviz
from io import StringIO
from IPython.display import Image 
import pydotplus

#  balanced bagging classifier
from imblearn.ensemble import BalancedBaggingClassifier

# to explode the DataFrames and avoid truncation
# pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

# ACQUISITION

In [2]:
# utilize acquire script to get primary dataframe for analysis
dfa, dfb = acquire.get_data()

# PREPARATION

## We will push forward and prepare dfa as we did in Phase 1.  Details and checks may be examined in the Phase 1 notebook

In [3]:
# create recidivism column
dfa['RECID'] = dfa.CASEID.apply(prepare.get_repeat_case)
# use prepare function to rename features 
dfa = prepare.rename_columns_all(dfa)
# use prepare function to re-encode data
prepare.replace_nonvals_all(dfa)

In [4]:
#peek at dfb
dfb.head()

Unnamed: 0,CASEID,M5FIRED,M11HIGH,M35SAFE,M41ILLGL,M42DAGRR,M13TALKR,M32OTHER,M27HOW,M30ARRES,...,FORCEDR,MISCARR,RESTRAIN,CHOKED,NDRUNK,RDRUNK,BOTHDRUN,NDRUGS,RDRUGS,BOTHDRUG
0,108395,9999,2,1,2,0,1,99,99999,3,...,0,0,0,0,0,0,0,0,0,0
1,108397,9999,2,1,2,0,1,99,99999,3,...,0,0,0,1,2,0,0,0,0,0
2,108399,9999,2,1,2,0,1,99,99999,3,...,0,0,1,1,0,0,0,0,0,0
3,108443,9999,2,1,2,0,1,13,99999,3,...,0,0,0,1,1,0,0,0,0,0
4,108444,9999,1,1,2,0,1,99,99999,3,...,0,0,0,0,3,0,0,0,0,0


In [5]:
# formally check that there are no nulls in dataframe b
prepare.get_nulls_by_column(dfb)

Empty DataFrame
Columns: [sum_nulls, nulls_by_percent]
Index: []


In [6]:
# check no 2
if not prepare.get_nulls_by_row(dfb):
    print('no nulls!')
else:
    prepare.get_nulls_by_row(dfb)

no nulls!


In [7]:
prepare.summarize_data(dfb)

HEAD
   CASEID  M5FIRED  M11HIGH  M35SAFE  M41ILLGL  M42DAGRR  M13TALKR  M32OTHER  \
0  108395     9999        2        1         2         0         1        99   
1  108397     9999        2        1         2         0         1        99   
2  108399     9999        2        1         2         0         1        99   
3  108443     9999        2        1         2         0         1        13   
4  108444     9999        1        1         2         0         1        99   

   M27HOW  M30ARRES    ...     FORCEDR  MISCARR  RESTRAIN  CHOKED  NDRUNK  \
0   99999         3    ...           0        0         0       0       0   
1   99999         3    ...           0        0         0       1       2   
2   99999         3    ...           0        0         1       1       0   
3   99999         3    ...           0        0         0       1       1   
4   99999         3    ...           0        0         0       0       3   

   RDRUNK  BOTHDRUN  NDRUGS  RDRUGS  BOTHDRUG  
0  

## We can see the cases where reassault happened. There are 350 cases out of the original 705 (before the padding implemented in Phase 1). There were 497 cases of domestic violence (cases where the surveyed was identified as abused).

In [8]:
# make new dataframe out of subset of dfa where we only look at the victims of abuse
dfa_abused = dfa[dfa.abuse_past_year == 1]

In [9]:
# cursory glance at dfb
dfb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 29 columns):
CASEID      497 non-null int64
M5FIRED     497 non-null int64
M11HIGH     497 non-null int64
M35SAFE     497 non-null int64
M41ILLGL    497 non-null int64
M42DAGRR    497 non-null int64
M13TALKR    497 non-null int64
M32OTHER    497 non-null int64
M27HOW      497 non-null int64
M30ARRES    497 non-null int64
M31HOW      497 non-null int64
M38ORDER    497 non-null int64
SEVERER     497 non-null int64
TOTINCR     497 non-null int64
THREATR     497 non-null int64
SLAPR       497 non-null int64
PUNCHR      497 non-null int64
BEATR       497 non-null int64
UWEAPON     497 non-null int64
FORCEDR     497 non-null int64
MISCARR     497 non-null int64
RESTRAIN    497 non-null int64
CHOKED      497 non-null int64
NDRUNK      497 non-null int64
RDRUNK      497 non-null int64
BOTHDRUN    497 non-null int64
NDRUGS      497 non-null int64
RDRUGS      497 non-null int64
BOTHDRUG    497 non-null in

In [10]:
# re-name columns in dfb into more readable features
dfb = prepare.rename_columns_recid(dfb)

In [11]:
# re-encode / clean values from dfb dataframe
prepare.replace_nonvals_recid(dfb)

In [12]:
# merge our dfb data onto our original phase one data set
df = dfa_abused.merge(right=dfb, on='id')

In [13]:
# isolate single abuse cases into 
single_abuse = df[df.reassault == 0]

In [14]:
# create new dataframe of just single abuse cases, reassign them new range of case ids
single_abuse['id'] = range(999999, 999999 + len(single_abuse))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
#append the padded single assault values to our greater dataframe
df = df.append(single_abuse)

In [16]:
# formally check that there are no nulls in dataframe
prepare.get_nulls_by_column(df)

Empty DataFrame
Columns: [sum_nulls, nulls_by_percent]
Index: []


In [17]:
# check no 2
if not prepare.get_nulls_by_row(df):
    print('no nulls!')
else:
    prepare.get_nulls_by_row(df)

no nulls!


In [18]:
df.shape

(644, 73)

In [19]:
df.reassault.value_counts()

1    350
0    294
Name: reassault, dtype: int64

In [20]:
# create list of features
features = [col for col in df]
# drop features from this list that are perceived as primary key or target variable
features.remove('id')
features.remove('abuse_past_year')
features.remove('abuse_status')
features.remove('reassault')

In [21]:
features

['length_relationship',
 'partner_abusive',
 'num_abusers',
 'num_children',
 'pregnant',
 'beaten_while_pregnant',
 'support_score',
 'guns_in_home',
 'jealous_past_year',
 'limit_family_contact',
 'location_tracking',
 'threat_hit',
 'threat_object',
 'push_shove',
 'slap',
 'kick_punch',
 'hit_object',
 'beaten',
 'choked',
 'threat_knife',
 'threat_gun',
 'rape_with_threat',
 'power_scale',
 'harass_scale',
 'id_age',
 'age_disparity',
 'children_not_partner',
 'same_sex_relationship',
 'partner_drug_use',
 'partner_alcohol_use',
 'threat_suicide',
 'partner_reported_child_abuse',
 'partner_arrested',
 'violence_increased',
 'severity_increased',
 'weapon_ever',
 'choked_ever',
 'rape_ever',
 'controlled_ever',
 'jealous',
 'capable_murder',
 'gun_fired',
 'anyone_high',
 'safe_place',
 'forced_illegal',
 'life_danger',
 'talk_about_it',
 'left_or_not',
 'medical_staff_helpful',
 'perp_arrested_ever',
 'police_resp',
 'order_protection',
 'level_severity',
 'num_incidents',
 'num_t

In [22]:
df.to_csv('phase2_df.csv')

In [23]:
# split train and test
X = df[features]
y = df[['reassault']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .15, random_state = 0, stratify=y)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# EXPLORATION

## We will run chi-squared tests on our categorical variables in order to get a grasp of importance

In [24]:
# we will run chi-squared tests on our categorical variables in order to get a grasp of importance
sig_feats, sig_dict = explore.get_chi_squared(train_df, features, 'reassault')

length_relationship
Dependent (reject H0)
-----------------------
num_abusers
Dependent (reject H0)
-----------------------
beaten_while_pregnant
Dependent (reject H0)
-----------------------
support_score
Dependent (reject H0)
-----------------------
jealous_past_year
Dependent (reject H0)
-----------------------
limit_family_contact
Dependent (reject H0)
-----------------------
location_tracking
Dependent (reject H0)
-----------------------
threat_hit
Dependent (reject H0)
-----------------------
threat_object
Dependent (reject H0)
-----------------------
push_shove
Dependent (reject H0)
-----------------------
slap
Dependent (reject H0)
-----------------------
kick_punch
Dependent (reject H0)
-----------------------
hit_object
Dependent (reject H0)
-----------------------
beaten
Dependent (reject H0)
-----------------------
choked
Dependent (reject H0)
-----------------------
threat_knife
Dependent (reject H0)
-----------------------
threat_gun
Dependent (reject H0)
----------------

In [25]:
# scale continuous variables
scaler = MinMaxScaler()

scaler.fit(train_df[['id_age']])

train_df[['id_age']] = scaler.transform(train_df[['id_age']])
test_df[['id_age']] = scaler.transform(test_df[['id_age']])

  return self.partial_fit(X, y)


In [26]:
continuous_vars = ['id_age']

In [27]:
sig_feats_c, sig_dict_c = explore.get_significant_t_tests(train_df, ['id_age'], 'reassault')

In [28]:
# call function that combines significant features from dictionaries created in statistical tests if both generated results
# otherwise make list of features just from whichever (t test or chi squared) produced significant results
if sig_feats_c and sig_feats:
    features = explore.combine_significants(sig_dict, sig_dict_c)
elif sig_feats_c:
    features = [item[0] for item in explore.sort_sigs(sig_dict_c)]
elif sig_feats:
    features = [item[0] for item in explore.sort_sigs(sig_dict)]

In [29]:
# explore.make_bars(train_df, 'reassault', features)

In [30]:
# explore.swarrrm(train_df, 'reassault', ['id_age'])

In [31]:
# for feature in continuous_vars:
#    sns.lineplot(x=feature, y='reassault', data=train_df)
#    plt.show()

In [32]:
# histograms
# normalization(?)
# explore.plot_hist(train_df)

In [33]:
# explore.make_rel(train_df, 'id_age', 'age_disparity', hue='reassault')

In [34]:
# function to remove any "significant features" that automatically denote target
prepare.remove_phase_2_features(features)

In [35]:
# create some subsets of significant features to assess model performance
top_5 = features[0:5]
top_10 = features[0:10]
top_15 = features[0:15]

# Modeling:

### We will attempt several different tactics and models as a means to find the best fit for our data set.  We will try with various features and par down with what we found as being statistically significant through chi-squared testing.

# Naive Bayes

In [36]:
import model
gnb_model, y_pred, y_pred_proba = model.naive_bayes(features, X_train, y_train)

  y = column_or_1d(y, warn=True)


In [37]:
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb_model.score(X_train[features], y_train)))

Accuracy of GNB classifier on training set: 0.70


In [38]:
# print raw confusion matrix:
# ~~~~~ edit this to display cleaner ~~~~~
print(confusion_matrix(y_train, y_pred))

[[101 149]
 [ 15 282]]


In [39]:
# classification report
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.40      0.55       250
           1       0.65      0.95      0.77       297

   micro avg       0.70      0.70      0.70       547
   macro avg       0.76      0.68      0.66       547
weighted avg       0.75      0.70      0.67       547



In [40]:
gnb, y_pred, y_pred_proba = model.naive_bayes(top_10, X_train, y_train)

  y = column_or_1d(y, warn=True)


In [41]:
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train[top_10], y_train)))

print(confusion_matrix(y_train, y_pred))


print(classification_report(y_train, y_pred))

Accuracy of GNB classifier on training set: 0.74
[[186  64]
 [ 78 219]]
              precision    recall  f1-score   support

           0       0.70      0.74      0.72       250
           1       0.77      0.74      0.76       297

   micro avg       0.74      0.74      0.74       547
   macro avg       0.74      0.74      0.74       547
weighted avg       0.74      0.74      0.74       547



## Logistic Regression w/ Cross Validation: 

In [42]:
clf, y_pred, y_pred_proba = model.log_reg(top_10, X_train, y_train)

  y = column_or_1d(y, warn=True)


In [43]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(clf.score(X_train[top_10], y_train)))

print(confusion_matrix(y_train, y_pred))

print(classification_report(y_train, y_pred))

Accuracy of Logistic Regression classifier on training set: 0.77
[[182  68]
 [ 57 240]]
              precision    recall  f1-score   support

           0       0.76      0.73      0.74       250
           1       0.78      0.81      0.79       297

   micro avg       0.77      0.77      0.77       547
   macro avg       0.77      0.77      0.77       547
weighted avg       0.77      0.77      0.77       547



## Decision Tree:

In [44]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

# # for classificaiton you can change the algorithm as gini or entropy (information gain).  Default is gini.
# dtc = DecisionTreeClassifier(criterion='entropy', max_depth=4, max_features=3, random_state=0)

# grid = GridSearchCV(dtc, params, cv=3, iid=True)

# grid.fit(X_train[top_5], y_train)

# results = grid.cv_results_
# results.keys()

# test_scores = results['mean_test_score']
# test_scores

# params = results['params']
# params

# for p, s in zip(params, test_scores):
#     p['score'] = s

# pd.DataFrame(params).sort_values(by='score')

# dtc.fit(X_train[top_5], y_train)

In [45]:
# cross_val_score(dtc, X_train[top_5], y_train, cv=5)

In [46]:
# y_pred = dtc.predict(X_train[top_5])
# y_pred[0:5]

In [47]:
# y_pred_proba = dtc.predict_proba(X_train[top_5])
# y_pred_proba

In [48]:
dtc, y_pred, y_pred_proba = model.decision_tree(params, top_10, X_train, y_train)

[0.7        0.68181818 0.74311927 0.70642202 0.75229358]


In [57]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train[top_10], y_train)))

print(confusion_matrix(y_train, y_pred))

print(classification_report(y_train, y_pred))

Accuracy of Decision Tree classifier on training set: 0.77
[[169  81]
 [ 50 247]]
              precision    recall  f1-score   support

           0       0.77      0.68      0.72       250
           1       0.75      0.83      0.79       297

   micro avg       0.76      0.76      0.76       547
   macro avg       0.76      0.75      0.76       547
weighted avg       0.76      0.76      0.76       547



In [58]:
confusion_matrix(y_train, y_pred)

array([[169,  81],
       [ 50, 247]])

In [59]:
sorted(y_train.reassault.unique())

[0, 1]

In [60]:
y_train.reassault.value_counts()

1    297
0    250
Name: reassault, dtype: int64

In [61]:
labels = sorted(y_train.reassault.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,169,81
1,50,247


In [62]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.68      0.72       250
           1       0.75      0.83      0.79       297

   micro avg       0.76      0.76      0.76       547
   macro avg       0.76      0.75      0.76       547
weighted avg       0.76      0.76      0.76       547



In [55]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

In [56]:
# for classificaiton you can change the algorithm as gini or entropy (information gain).  Default is gini.
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=4, max_features=3, random_state=0)

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
grid = GridSearchCV(dtc, params, cv=3, iid=True)

In [None]:
grid.fit(X_train, y_train)

In [None]:
results = grid.cv_results_
results.keys()

In [None]:
test_scores = results['mean_test_score']
test_scores

In [None]:
params = results['params']
params

In [None]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

In [None]:
dtc.fit(X_train[top_10], y_train)

In [None]:
cross_val_score(dtc, X_train[top_10], y_train, cv=5)

In [None]:
y_pred = dtc.predict(X_train[top_10])
y_pred[0:5]

In [None]:
y_pred_proba = dtc.predict_proba(X_train[top_10])
y_pred_proba

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(dtc.score(X_train[top_10], y_train)))

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
labels = sorted(y_train.reassault.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
import graphviz
from io import StringIO
from IPython.display import Image 
import pydotplus
from sklearn import preprocessing
from sklearn import tree

In [None]:
# conda install pydotplus
def plot_decision_tree(clf,feature_name,target_name):
    '''This function creates a visualization of a decision tree in png format. 
    Takes a decision tree object, feature names of the training set and a target variable for the target name.'''
    dot_data = StringIO()  
    tree.export_graphviz(clf, out_file=dot_data,  
                         feature_names=feature_name,  
                         class_names=target_name,  
                         filled=True, rounded=True,  
                         special_characters=True)  
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    return Image(graph.create_png())

In [None]:
plot_decision_tree(dtc, X_train.columns, df.columns[1])

##  Random Forest:

In [None]:
r_params = {'max_depth': [2, 3, 4]}

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='entropy',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=0)

In [None]:
r_grid = GridSearchCV(rf, r_params, cv=3, iid=True)

In [None]:
r_grid.fit(X_train, y_train)

In [None]:
r_results = r_grid.cv_results_
r_results.keys()

In [None]:
r_test_scores = r_results['mean_test_score']
r_test_scores

In [None]:
r_params = r_results['params']
r_params

In [None]:
for p, s in zip(r_params, r_test_scores):
    p['score'] = s

pd.DataFrame(r_params).sort_values(by='score')

In [None]:
rf.fit(X_train, y_train)

In [None]:
cross_val_score(rf, X_train, y_train, cv=5)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)
y_pred

In [None]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))