# PLAN - PHASE 2

0. [x] Get the data.
0. [x] Prepare the data for exploration.
0. [x] Split the data into train and test sets.
0. [ ] Explore the data.
0. [x] Do cross-validation with the data.
0. [x] Code and evaluate several models.
0. [ ] Evaluate out of sample data (test set).

# ENVIRONMENT

In [1]:
# basic ewnvironment and custom scripts
import os
import acquire
import prepare
import explore

# numpy and pandas for dataframe building and manipulation
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import operator

# matplotlib and seaborn for visualization
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

# import preprocessing for scaling and splitting
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# sklearn machine learning
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# conda install -c glemaitre imbalanced-learn
from imblearn.ensemble import BalancedBaggingClassifier

# to explode the DataFrames and avoid truncation
pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

  from numpy.core.umath_tests import inner1d


# ACQUISITION

In [2]:
# utilize acquire script to get primary dataframe for analysis
dfa, dfb = acquire.get_data()

## _'dfa' : this is the dataframe with the NAW (Not Abused Woman) & AW (Abused Woman) cases._

In [3]:
dfa.shape

(705, 44)

## _'dfb' : this is the dataframe with the AW (Abused Woman) cases_

In [4]:
dfb.shape

(497, 29)

In [5]:
# placeholder cell: perhaps going to utilize these in a later phase

# df9 & 10 are cases of AW (Abused Woman)
df9 = acquire.read_data('data09.csv')
df10 = acquire.read_data('data10.csv')

# df 13, 14 & 15 are cases of HOMICIDE
df13 = acquire.read_data('data13.csv')
df14 = acquire.read_data('data14.csv')
df15 = acquire.read_data('data15.csv')

# PREPARATION

## _Examining the dataframes, dfa & dfb, for any missing or inconsistent values._

In [6]:
dfa.head(8).T

Unnamed: 0,0,1,2,3,4,5,6,7
CASEID,108395,108397,108399,108443,108444,108456,108460,108525
ABUSED,1,1,1,1,1,1,1,1
SCRSTATR,1,1,1,1,1,1,1,1
LENGTHC1,1,2,5,2,2,2,3,4
C1SITUAT,1,1,1,1,1,3,1,4
PABUSE,1,1,1,1,1,1,1,3
D3RCHILT,1,0,6,0,1,1,1,2
E13PRGNT,3,3,3,1,2,3,888,3
N7PREGNT,2,3,2,2,2,2,1,2
TOTSUPRT,9,11,9,12,11,2,10,11


In [7]:
dfb.head(8).T

Unnamed: 0,0,1,2,3,4,5,6,7
CASEID,108395,108397,108399,108443,108444,108456,108460,108525
M5FIRED,9999,9999,9999,9999,9999,9999,9999,1
M11HIGH,2,2,2,2,1,1,2,2
M35SAFE,1,1,1,1,1,1,1,1
M41ILLGL,2,2,2,2,2,2,2,1
M42DAGRR,0,0,0,0,0,1,0,1
M13TALKR,1,1,1,1,1,1,1,1
M32OTHER,99,99,99,13,99,11,11,41
M27HOW,99999,99999,99999,99999,99999,11,99999,21
M30ARRES,3,3,3,3,3,0,3,1


In [8]:
prepare.get_nulls_by_column(dfa)

Empty DataFrame
Columns: [sum_nulls, nulls_by_percent]
Index: []


In [9]:
prepare.get_nulls_by_column(dfb)

Empty DataFrame
Columns: [sum_nulls, nulls_by_percent]
Index: []


In [10]:
prepare.get_nulls_by_row(dfa)

In [11]:
prepare.get_nulls_by_row(dfb)

In [12]:
prepare.summarize_data(dfa)

HEAD
   CASEID  ABUSED  SCRSTATR  LENGTHC1  C1SITUAT  PABUSE  D3RCHILT  E13PRGNT  \
0  108395       1         1         1         1       1         1         3   
1  108397       1         1         2         1       1         0         3   
2  108399       1         1         5         1       1         6         3   
3  108443       1         1         2         1       1         0         1   
4  108444       1         1         2         1       1         1         2   

   N7PREGNT  TOTSUPRT    ...     N16CHILD  N17ARRST  N1FRQNCY  N2SVRITY  \
0         2         9    ...            2         2         2         2   
1         3        11    ...            2         1         2         1   
2         2         9    ...            2         2         2         2   
3         2        12    ...            2         1         2         2   
4         2        11    ...            2         1         1         1   

   N3WEAPON  N4CHOKE  N5SEX  N6CONTRL  N8JEALUS  N10CPBLE  
0        

In [13]:
prepare.summarize_data(dfb)

HEAD
   CASEID  M5FIRED  M11HIGH  M35SAFE  M41ILLGL  M42DAGRR  M13TALKR  M32OTHER  \
0  108395     9999        2        1         2         0         1        99   
1  108397     9999        2        1         2         0         1        99   
2  108399     9999        2        1         2         0         1        99   
3  108443     9999        2        1         2         0         1        13   
4  108444     9999        1        1         2         0         1        99   

   M27HOW  M30ARRES    ...     FORCEDR  MISCARR  RESTRAIN  CHOKED  NDRUNK  \
0   99999         3    ...           0        0         0       0       0   
1   99999         3    ...           0        0         0       1       2   
2   99999         3    ...           0        0         1       1       0   
3   99999         3    ...           0        0         0       1       1   
4   99999         3    ...           0        0         0       0       3   

   RDRUNK  BOTHDRUN  NDRUGS  RDRUGS  BOTHDRUG  
0  

## _We will utilize the first dataframe's Case IDs to group our incident data from set 10 to group by incident and identify surveyed individuals that were victim to more than one instance of abuse_

In [14]:
dfa['RECID'] = dfa.CASEID.apply(prepare.get_repeat_case)

## _This new column, RECID, is a boolean value that tells us whether the cases were re-assault cases or not._

In [15]:
# look at unaltered value counts
prepare.value_counts(dfa)

CASEID

-------------------------------------------------------------

ABUSED
1    497
2    208
Name: ABUSED, dtype: int64

-------------------------------------------------------------

SCRSTATR
1      455
2      118
3      115
999     17
Name: SCRSTATR, dtype: int64

-------------------------------------------------------------

LENGTHC1
1    159
2    153
3     83
4    105
5    147
6     58
Name: LENGTHC1, dtype: int64

-------------------------------------------------------------

C1SITUAT
1    395
2     18
3     74
4     10
9    208
Name: C1SITUAT, dtype: int64

-------------------------------------------------------------

PABUSE
1    464
2     27
3      6
9    208
Name: PABUSE, dtype: int64

-------------------------------------------------------------

D3RCHILT
0      339
1      136
2      113
3       57
4       32
5        9
6        9
7        5
12       1
555      1
666      1
999      2
Name: D3RCHILT, dtype: int64

-----------------------------------------------------------

In [16]:
# change dfa to df to match conventions of prepare.py
df = dfa
#use prepare function to rename features 
dfa = prepare.rename_columns_all(df)
# dfa = df

In [17]:
dfa.head()

Unnamed: 0,id,abuse_past_year,abuse_status,length_relationship,partner_abusive,num_abusers,num_children,pregnant,beaten_while_pregnant,support_score,...,partner_arrested,violence_increased,severity_increased,weapon_ever,choked_ever,rape_ever,controlled_ever,jealous,capable_murder,reassault
0,108395,1,1,1,1,1,1,3,2,9,...,2,2,2,2,2,2,2,2,2,0
1,108397,1,1,2,1,1,0,3,3,11,...,1,2,1,1,1,1,2,1,1,1
2,108399,1,1,5,1,1,6,3,2,9,...,2,2,2,2,1,2,2,2,2,1
3,108443,1,1,2,1,1,0,1,2,12,...,1,2,2,1,1,2,1,2,2,0
4,108444,1,1,2,1,1,1,2,2,11,...,1,1,1,2,2,2,2,2,2,1


In [18]:
#use prepare function to rename features 
prepare.rename_columns_recid(dfb)

Unnamed: 0,id,gun_fired,anyone_high,safe_place,forced_illegal,life_danger,talk_about_it,left_or_not,medical_staff_helpful,perp_arrested_ever,...,num_forced_sex,miscarriage_resulted,restrained_by_perp,num_choked,num_perp_drunk,num_woman_drunk,num_both_drunk,num_perp_drugs,num_woman_drugs,num_both_drugs
0,108395,9999,2,1,2,0,1,99,99999,3,...,0,0,0,0,0,0,0,0,0,0
1,108397,9999,2,1,2,0,1,99,99999,3,...,0,0,0,1,2,0,0,0,0,0
2,108399,9999,2,1,2,0,1,99,99999,3,...,0,0,1,1,0,0,0,0,0,0
3,108443,9999,2,1,2,0,1,13,99999,3,...,0,0,0,1,1,0,0,0,0,0
4,108444,9999,1,1,2,0,1,99,99999,3,...,0,0,0,0,3,0,0,0,0,0
5,108456,9999,1,1,2,1,1,11,11,0,...,0,0,0,0,0,0,0,0,0,0
6,108460,9999,2,1,2,0,1,11,99999,3,...,0,0,0,0,0,0,0,0,0,0
7,108525,1,2,1,1,1,1,41,21,1,...,0,0,0,0,0,0,0,0,0,0
8,108562,9999,1,1,2,0,1,99,99999,3,...,0,0,0,0,2,0,0,0,0,0
9,118051,9999,999,999,999,999,999,99,99999,999,...,0,0,0,0,0,0,0,0,0,0


In [19]:
dfb.head()

Unnamed: 0,CASEID,M5FIRED,M11HIGH,M35SAFE,M41ILLGL,M42DAGRR,M13TALKR,M32OTHER,M27HOW,M30ARRES,...,FORCEDR,MISCARR,RESTRAIN,CHOKED,NDRUNK,RDRUNK,BOTHDRUN,NDRUGS,RDRUGS,BOTHDRUG
0,108395,9999,2,1,2,0,1,99,99999,3,...,0,0,0,0,0,0,0,0,0,0
1,108397,9999,2,1,2,0,1,99,99999,3,...,0,0,0,1,2,0,0,0,0,0
2,108399,9999,2,1,2,0,1,99,99999,3,...,0,0,1,1,0,0,0,0,0,0
3,108443,9999,2,1,2,0,1,13,99999,3,...,0,0,0,1,1,0,0,0,0,0
4,108444,9999,1,1,2,0,1,99,99999,3,...,0,0,0,0,3,0,0,0,0,0


In [20]:
# run function to replace nonvalues and change values to conventionally boolean archetypes
prepare.replace_nonvals_all(dfa)

In [21]:
# run function to replace nonvalues and change values to conventionally boolean archetypes
prepare.replace_nonvals_recid(dfb)

In [22]:
dfa

Unnamed: 0,id,abuse_past_year,abuse_status,length_relationship,partner_abusive,num_abusers,num_children,pregnant,beaten_while_pregnant,support_score,...,partner_arrested,violence_increased,severity_increased,weapon_ever,choked_ever,rape_ever,controlled_ever,jealous,capable_murder,reassault
0,108395,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108397,1,1,2,1,1,0,0,0,11,...,1,0,1,1,1,1,0,1,1,1
2,108399,1,1,5,1,1,2,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,108443,1,1,2,1,1,0,1,0,12,...,1,0,0,1,1,0,1,0,0,0
4,108444,1,1,2,1,1,1,0,0,11,...,1,1,1,0,0,0,0,0,0,1
5,108456,1,1,2,0,1,1,0,0,0,...,1,0,0,0,1,1,0,1,1,1
6,108460,1,1,3,1,1,1,0,1,10,...,0,0,0,0,0,0,0,1,1,1
7,108525,1,1,4,1,2,2,0,0,11,...,1,1,1,1,1,1,1,1,1,1
8,108562,1,1,5,1,1,2,0,0,11,...,0,0,0,1,0,0,1,1,1,1
9,109304,0,0,5,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0


In [23]:
dfa = dfa[dfa.abuse_past_year == 1]

In [24]:
dfa.columns

Index(['id', 'abuse_past_year', 'abuse_status', 'length_relationship',
       'partner_abusive', 'num_abusers', 'num_children', 'pregnant',
       'beaten_while_pregnant', 'support_score', 'guns_in_home',
       'jealous_past_year', 'limit_family_contact', 'location_tracking',
       'threat_hit', 'threat_object', 'push_shove', 'slap', 'kick_punch',
       'hit_object', 'beaten', 'choked', 'threat_knife', 'threat_gun',
       'rape_with_threat', 'power_scale', 'harass_scale', 'id_age',
       'age_disparity', 'children_not_partner', 'same_sex_relationship',
       'partner_drug_use', 'partner_alcohol_use', 'threat_suicide',
       'partner_reported_child_abuse', 'partner_arrested',
       'violence_increased', 'severity_increased', 'weapon_ever',
       'choked_ever', 'rape_ever', 'controlled_ever', 'jealous',
       'capable_murder', 'reassault'],
      dtype='object')

In [25]:
dfa = dfa.drop(columns=['guns_in_home',
                        'threat_hit',
                        'beaten',
                        'choked',
                        'threat_knife',
                        'threat_gun',
                        'rape_with_threat',
                        'partner_drug_use',
                        'partner_alcohol_use',
                        'weapon_ever',
                        'choked_ever',
                        'jealous_past_year'
                       ])

In [26]:
dfa.shape

(497, 33)

In [27]:
dfb.columns

Index(['CASEID', 'M5FIRED', 'M11HIGH', 'M35SAFE', 'M41ILLGL', 'M42DAGRR',
       'M13TALKR', 'M32OTHER', 'M27HOW', 'M30ARRES', 'M31HOW', 'M38ORDER',
       'SEVERER', 'TOTINCR', 'THREATR', 'SLAPR', 'PUNCHR', 'BEATR', 'UWEAPON',
       'FORCEDR', 'MISCARR', 'RESTRAIN', 'CHOKED', 'NDRUNK', 'RDRUNK',
       'BOTHDRUN', 'NDRUGS', 'RDRUGS', 'BOTHDRUG'],
      dtype='object')

In [28]:
dfb = dfb.drop(columns=['police_resp',
                        'order_protection',
                        'num_woman_drunk',
                        'num_perp_drunk',
                        'num_woman_drugs',
                        'num_perp_drugs',
                       ])

KeyError: "['police_resp' 'order_protection' 'num_woman_drunk' 'num_perp_drunk'\n 'num_woman_drugs' 'num_perp_drugs'] not found in axis"

In [None]:
dfb.shape

In [None]:
dfa.reassault.value_counts()

## _Merge dfa and dfb on ID._

In [None]:
dfb = dfa.merge(right=dfb, on='id')

In [None]:
dfb.shape

In [None]:
dfb.head()

In [None]:
dfb.columns

In [None]:
dfb.head(5).T

In [None]:
dfb.reassault.value_counts()

In [None]:
prepare.get_nulls_by_column(dfb)

## Train / Test Split

In [None]:
X = dfb.drop(columns=['reassault'])
y = dfb[['reassault']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .15, random_state = 0, stratify=y)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [None]:
train_df.head()

In [None]:
test_df.head()

# EXPLORATION

In [None]:
def plot_hist(df):
    """
    Plots the distribution of the dataframe's variables.
    """
    df.hist(figsize=(24, 20), bins=20)

In [None]:
df = dfb.copy()

In [None]:
plot_hist(df)

# ---------- DO NOT TOUCH ABOVE ----------

In [None]:
df.reassault.value_counts()

In [None]:
df.columns

In [None]:
# we will refer to this list of variables as 'keep'
keep = list(df.columns)


In [None]:
train_df.info()


In [None]:
keep

In [None]:
keep.remove('id')
keep.remove('length_relationship')
keep.remove('id_age')
keep.remove('reassault')

In [None]:
# add the continuous variables into their own list
continuous_vars = ['length_relationship', 'id_age']

In [None]:
keep

In [None]:
# we will run chi-squared tests on our categorical variables in order to get a grasp of importance
sig_feats, sig_dict = explore.get_chi_squared(train_df, keep, 'reassault')

In [None]:
df.shape

In [None]:
features = [item[0] for item in explore.sort_sigs(sig_dict)]

In [None]:
explore.make_bars(train_df, 'reassault', features)

In [None]:
explore.swarrrm(train_df, 'reassault', continuous_vars)

In [None]:
for feature in continuous_vars:
    sns.lineplot(x=feature, y='abuse_past_year', data=train_df)
    plt.show()

In [None]:
sig_feats

In [None]:
sig_dict

In [None]:
import operator
val_list = []
for key in sig_dict:
    val_list.append(sig_dict[key])
sorted_vals = sorted(sig_dict.items(), key=operator.itemgetter(1), reverse=True) 
sorted_vals

In [None]:
plt.figure(figsize=(8,4))
sns.heatmap(df.corr(), cmap='Blues', annot=True)

# ---------- DO NOT TOUCH BELOW ----------

# MODELING

In [None]:
df = dfb.copy()

In [None]:
df.shape

In [None]:
df.head(8).T

In [None]:
X_train = X_train.drop(columns=['limit_family_contact',
             'slap',
             'kick_punch',
             'hit_object',
             'power_scale',
             'harass_scale',
             'violence_increased',
             'severity_increased',
             'rape_ever',
             'jealous',
             'life_danger',
             'perp_arrested_ever',
             'level_severity',
             'num_incidents',
             'num_threats',
             'num_slapping',
             'num_punching',
             'num_beating',
             'num_forced_sex',
             'num_choked'
                               ])
X_test = X_test.drop(columns=['limit_family_contact',
             'slap',
             'kick_punch',
             'hit_object',
             'power_scale',
             'harass_scale',
             'violence_increased',
             'severity_increased',
             'rape_ever',
             'jealous',
             'life_danger',
             'perp_arrested_ever',
             'level_severity',
             'num_incidents',
             'num_threats',
             'num_slapping',
             'num_punching',
             'num_beating',
             'num_forced_sex',
             'num_choked'
                               ])

## 1. Logistic Regression with Cross Validation

In [None]:
lr = LogisticRegressionCV(cv=5,
                           random_state=0,
                          ).fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_train)

In [None]:
y_pred_proba = lr.predict_proba(X_train)

In [None]:
y_pred_proba

In [None]:
lr.score(X_train, y_train) 

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(lr.score(X_train, y_train)))

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
print(classification_report(y_train, y_pred))

## 2. Decision Tree

In [None]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 2, 3]}

In [None]:
# for classificaiton you can change the algorithm as gini or entropy (information gain).  Default is gini.
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features=1, random_state=0)

In [None]:
grid = GridSearchCV(dtc, params, cv=3, iid=True)

In [None]:
grid.fit(X_train, y_train)

In [None]:
results = grid.cv_results_
results.keys()

In [None]:
test_scores = results['mean_test_score']
test_scores

In [None]:
params = results['params']
params

In [None]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

In [None]:
dtc.fit(X_train, y_train)

In [None]:
cross_val_score(dtc, X_train, y_train, cv=5)

In [None]:
y_pred = dtc.predict(X_train)
y_pred[0:5]

In [None]:
_pred_proba = dtc.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(dtc.score(X_train, y_train)))

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
sorted(y_train.reassault.unique())

In [None]:
y_train.reassault.value_counts()

In [None]:
labels = sorted(y_train.reassault.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

## 3. Random Forest

In [None]:
r_params = {'max_depth': [2, 3, 4]}

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='entropy',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=0)

In [None]:
r_grid = GridSearchCV(rf, r_params, cv=3, iid=True)

In [None]:
r_grid.fit(X_train, y_train)

In [None]:
r_results = r_grid.cv_results_
r_results.keys()

In [None]:
r_test_scores = r_results['mean_test_score']
r_test_scores

In [None]:
r_params = r_results['params']
r_params

In [None]:
for p, s in zip(r_params, r_test_scores):
    p['score'] = s

pd.DataFrame(r_params).sort_values(by='score')

In [None]:
rf.fit(X_train, y_train)

In [None]:
cross_val_score(rf, X_train, y_train, cv=5)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)
y_pred

In [None]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

## Out of Sample Performance

In [None]:
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))