# PLAN - PHASE 2

0. [x] Get the data.
0. [x] Prepare the data for exploration.
0. [ ] Split the data into train and test sets.
0. [ ] Explore the data.
0. [ ] Do cross-validation with the data.
0. [ ] Code and evaluate several models.
0. [ ] Evaluate out of sample data (test set).

# ENVIRONMENT

In [1]:
import os
import acquire
import prepare
import explore as exp

import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import operator

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

# conda install -c glemaitre imbalanced-learn
from imblearn.ensemble import BalancedBaggingClassifier

# to explode the DataFrames and avoid truncation
pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

  from numpy.core.umath_tests import inner1d


# ACQUISITION

In [2]:
df1 = acquire.read_data('data01.csv')
df2 = acquire.read_data('data02.csv')
df3 = acquire.read_data('data03.csv')
df4 = acquire.read_data('data04.csv')
df5 = acquire.read_data('data05.csv')
df7 = acquire.read_data('data07.csv')

In [3]:
# Specify columns to take from dataframes 1, 2, 3, 4, 5, and 7:


one_cols = ['CASEID',
            'ABUSED',
            'SCRSTATR',
            'LENGTHC1',
            'C1SITUAT',
            'PABUSE',
             ]


two_cols = ['CASEID',
            'D3RCHILT',
           ]

three_cols = ['CASEID',
              'E13PRGNT',
              'N7PREGNT',
              'TOTSUPRT'
             ]

four_cols = ['CASEID',
             'G1NUMBER', 
             'H1JEALUS', 
             'H2LIMIT',
             'H3KNOWNG',
             'J1HIT',
             'J2THROWN',
             'J3PUSH',
             'J4SLAP',
             'J5KICK',
             'J6OBJECT',
             'J7BEAT',
             'J8CHOKE',
             'J9KNIFE',
             'J10GUN',
             'J11SEX',
             'POWER',
             'HARASS',
            ]

five_cols = ['CASEID', 
             'B1AGE',
             'AGEDISP',
             'STDETAI',
            ]

seven_cols = ['CASEID',
              'SAMESEXR',
              'N11DRUGS',
              'N12ALCHL',
              'N13SUHIM',
              'N16CHILD',
              'N17ARRST',
              'N1FRQNCY',
              'N2SVRITY',
              'N3WEAPON',
              'N4CHOKE',
              'N5SEX',
              'N6CONTRL',
              'N8JEALUS',
              'N10CPBLE',
             ]

In [4]:
# drop dataframes down to desired features
df1 = df1[one_cols]
df2 = df2[two_cols]
df3 = df3[three_cols]
df4 = df4[four_cols]
df5 = df5[five_cols]
df7 = df7[seven_cols]

In [5]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)
print(df5.shape)
print(df7.shape)

(705, 6)
(705, 2)
(705, 4)
(705, 18)
(705, 4)
(705, 15)


In [6]:
dfa = df1.merge(right=df2, on='CASEID')
dfa = dfa.merge(right=df3, on='CASEID')
dfa = dfa.merge(right=df4, on='CASEID')
dfa = dfa.merge(right=df5, on='CASEID')
dfa = dfa.merge(right=df7, on='CASEID')

In [7]:
dfa.shape

(705, 44)

## _'dfa' : this is the dataframe with the NAW (Not Abused Woman) & AW (Abused Woman) cases._

In [8]:
df6 = acquire.read_data('data06.csv')
# df8 = acquire.read_data('data08.csv') # NO USEFUL COLUMNS
df11 = acquire.read_data('data11.csv')
# df12 = acquire.read_data('data12.csv') # NO USEFUL COLUMNS

In [9]:
print(df6.head())
# print(df8.head())
print(df11.head())
# print(df12.head())

   CASEID  INVIEWN  ABUSED  SAMESEXR  RACER  M4GUN  M5FIRED  M6MSCRRG  \
0  108395        1       1      9999      1      2     9999         2   
1  108397        1       1      9999      9      2     9999         2   
2  108399        1       1      9999      1      2     9999         2   
3  108443        1       1      9999      1      2     9999         2   
4  108444        1       1      9999      1      2     9999         2   

   M7TIEDUP  M8PRSNT    ...     M40CNSL3  M40CNSL4  M44ADV1  M44ADV2  M44ADV3  \
0         2        2    ...         9999      9999        1     9999     9999   
1         2        1    ...         9999      9999        1     9999     9999   
2         1        1    ...         9999      9999        2        1     9999   
3         2        1    ...         9999      9999       26     9999     9999   
4         2        1    ...         9999      9999        1     9999     9999   

   M22NOINJ  M22NONED  M22PRIV  M22EMBAR  HELPTYPE  
0         1         0

In [10]:
# Specify columns to take from dataframes 6 and 11:


six_cols = ['CASEID',
            'M5FIRED',
            'M11HIGH',
            'M35SAFE',
            'M41ILLGL',
            'M42DAGRR',
            'M13TALKR',
            'M32OTHER',
            'M27HOW',
            'M30ARRES',
            'M31HOW',
            'M38ORDER',
           ]

eleven_cols = ['CASEID',
               'SEVERER',
               'TOTINCR',
               'THREATR',
               'SLAPR',
               'PUNCHR',
               'BEATR',
               'UWEAPON',
               'FORCEDR',
               'MISCARR',
               'RESTRAIN',
               'CHOKED',
               'NDRUNK',
               'RDRUNK',
               'BOTHDRUN',
               'NDRUGS',
               'RDRUGS',
               'BOTHDRUG',
              ]

In [11]:
# drop dataframes down to desired features
df6 = df6[six_cols]
df11 = df11[eleven_cols]

In [12]:
print(df6.shape)
print(df11.shape)

(497, 12)
(497, 18)


In [13]:
dfb = df6.merge(right=df11, on='CASEID')

## _'dfb' : this is the dataframe with the AW (Abused Woman) cases_

In [14]:
# df9 & 10 are cases of AW (Abused Woman)
df9 = acquire.read_data('data09.csv')
df10 = acquire.read_data('data10.csv')

# df 13, 14 & 15 are cases of HOMICIDE
df13 = acquire.read_data('data13.csv')
df14 = acquire.read_data('data14.csv')
df15 = acquire.read_data('data15.csv')

# PREPARATION

In [15]:
def missing_values_col(df):
    """
    Thids function returns the total missing values
    and the percent missing values by column.
    """
    null_count = df.isnull().sum()
    null_percentage = (null_count / df.shape[0]) * 100
    empty_count = pd.Series(((df == ' ') | (df == '')).sum())
    empty_percentage = (empty_count / df.shape[0]) * 100
    nan_count = pd.Series(((df == 'nan') | (df == 'NaN')).sum())
    nan_percentage = (nan_count / df.shape[0]) * 100
    return pd.DataFrame({'num_missing': null_count, 'missing_percentage': null_percentage,
                         'num_empty': empty_count, 'empty_percentage': empty_percentage,
                         'nan_count': nan_count, 'nan_percentage': nan_percentage})


def missing_values_row(df):
    """
    Write or use a previously written function to return the
    total missing values and the percent missing values by row.
    """
    null_count = df.isnull().sum(axis=1)
    null_percentage = (null_count / df.shape[1]) * 100
    return pd.DataFrame({'num_missing': null_count, 'percentage': null_percentage})


def handle_missing_threshold(df, prop_required_column = .3, prop_required_row = .9):
    """
    Removes columns and rows whose count of missing values exceeds threshold.
    """
    threshold = int(round(prop_required_column*len(df.index),0))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df


def count_values(column):
    return df[column].value_counts(dropna=False)

def summarize_data(df):
    
    df_head = df.head()
    print(f'HEAD\n{df_head}', end='\n\n')
   
    df_tail = df.tail()
    print(f'TAIL\n{df_tail}', end='\n\n')

    shape_tuple = df.shape
    print(f'SHAPE: {shape_tuple}', end='\n\n')
    
    df_describe = df.describe()
    print(f'DESCRIPTION\n{df_describe}', end='\n\n')
    
    df.info()
    print(f'INFORMATION')    

    print(f'VALUE COUNTS', end='\n\n')
    for col in df.columns:
        n = df[col].unique().shape[0]
        col_bins = min(n, 10)
        print(f'{col}:')
        if df[col].dtype in ['int64', 'float64'] and n > 10:
            print(df[col].value_counts(bins=col_bins, sort=False, dropna=False))
        else:
            print(df[col].value_counts(dropna=False))
        print('\n')

In [16]:
missing_values_col(dfa)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
CASEID,0,0.0,0,0.0,0,0.0
ABUSED,0,0.0,0,0.0,0,0.0
SCRSTATR,0,0.0,0,0.0,0,0.0
LENGTHC1,0,0.0,0,0.0,0,0.0
C1SITUAT,0,0.0,0,0.0,0,0.0
PABUSE,0,0.0,0,0.0,0,0.0
D3RCHILT,0,0.0,0,0.0,0,0.0
E13PRGNT,0,0.0,0,0.0,0,0.0
N7PREGNT,0,0.0,0,0.0,0,0.0
TOTSUPRT,0,0.0,0,0.0,0,0.0


In [17]:
missing_values_col(dfb)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
CASEID,0,0.0,0,0.0,0,0.0
M5FIRED,0,0.0,0,0.0,0,0.0
M11HIGH,0,0.0,0,0.0,0,0.0
M35SAFE,0,0.0,0,0.0,0,0.0
M41ILLGL,0,0.0,0,0.0,0,0.0
M42DAGRR,0,0.0,0,0.0,0,0.0
M13TALKR,0,0.0,0,0.0,0,0.0
M32OTHER,0,0.0,0,0.0,0,0.0
M27HOW,0,0.0,0,0.0,0,0.0
M30ARRES,0,0.0,0,0.0,0,0.0


In [18]:
missing_values_row(dfa)

Unnamed: 0,num_missing,percentage
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
5,0,0.0
6,0,0.0
7,0,0.0
8,0,0.0
9,0,0.0


In [19]:
missing_values_row(dfb)

Unnamed: 0,num_missing,percentage
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
5,0,0.0
6,0,0.0
7,0,0.0
8,0,0.0
9,0,0.0


In [20]:
summarize_data(dfa)

HEAD
   CASEID  ABUSED  SCRSTATR  LENGTHC1  C1SITUAT  PABUSE  D3RCHILT  E13PRGNT  \
0  108395       1         1         1         1       1         1         3   
1  108397       1         1         2         1       1         0         3   
2  108399       1         1         5         1       1         6         3   
3  108443       1         1         2         1       1         0         1   
4  108444       1         1         2         1       1         1         2   

   N7PREGNT  TOTSUPRT    ...     N16CHILD  N17ARRST  N1FRQNCY  N2SVRITY  \
0         2         9    ...            2         2         2         2   
1         3        11    ...            2         1         2         1   
2         2         9    ...            2         2         2         2   
3         2        12    ...            2         1         2         2   
4         2        11    ...            2         1         1         1   

   N3WEAPON  N4CHOKE  N5SEX  N6CONTRL  N8JEALUS  N10CPBLE  
0        

In [21]:
summarize_data(dfb)

HEAD
   CASEID  M5FIRED  M11HIGH  M35SAFE  M41ILLGL  M42DAGRR  M13TALKR  M32OTHER  \
0  108395     9999        2        1         2         0         1        99   
1  108397     9999        2        1         2         0         1        99   
2  108399     9999        2        1         2         0         1        99   
3  108443     9999        2        1         2         0         1        13   
4  108444     9999        1        1         2         0         1        99   

   M27HOW  M30ARRES    ...     FORCEDR  MISCARR  RESTRAIN  CHOKED  NDRUNK  \
0   99999         3    ...           0        0         0       0       0   
1   99999         3    ...           0        0         0       1       2   
2   99999         3    ...           0        0         1       1       0   
3   99999         3    ...           0        0         0       1       1   
4   99999         3    ...           0        0         0       0       3   

   RDRUNK  BOTHDRUN  NDRUGS  RDRUGS  BOTHDRUG  
0  

In [22]:
def make_repeat_series(df10):
    '''takes a dataframe with a caseid columns and returns a series with offense numbers using a groupby'''
    repeat_series = df10.groupby('CASEID').INCIDENT.count()
    return repeat_series

def over_1(repeat_series):
    '''takes a pandas series and tests for a value to put in a list of caseIDs that are repeat offenses'''
    repeat_cases = []
    for case, inc_num in enumerate(repeat_series):
        if inc_num > 1:
            repeat_cases.append(repeat_series.index[case])
    return repeat_cases

def get_repeat_case(val):
    '''takes a value and establishes if it meets criteria to be in repeat offenses'''
    repeat_cases = over_1(make_repeat_series(df10))
    if val in repeat_cases:
        return 1
    else:
        return 0

dfa['RECID'] = dfb.CASEID.apply(get_repeat_case)

In [23]:
dfa.head()

Unnamed: 0,CASEID,ABUSED,SCRSTATR,LENGTHC1,C1SITUAT,PABUSE,D3RCHILT,E13PRGNT,N7PREGNT,TOTSUPRT,...,N17ARRST,N1FRQNCY,N2SVRITY,N3WEAPON,N4CHOKE,N5SEX,N6CONTRL,N8JEALUS,N10CPBLE,RECID
0,108395,1,1,1,1,1,1,3,2,9,...,2,2,2,2,2,2,2,2,2,0.0
1,108397,1,1,2,1,1,0,3,3,11,...,1,2,1,1,1,1,2,1,1,1.0
2,108399,1,1,5,1,1,6,3,2,9,...,2,2,2,2,1,2,2,2,2,1.0
3,108443,1,1,2,1,1,0,1,2,12,...,1,2,2,1,1,2,1,2,2,0.0
4,108444,1,1,2,1,1,1,2,2,11,...,1,1,1,2,2,2,2,2,2,1.0


In [24]:
dfb.head()

Unnamed: 0,CASEID,M5FIRED,M11HIGH,M35SAFE,M41ILLGL,M42DAGRR,M13TALKR,M32OTHER,M27HOW,M30ARRES,...,FORCEDR,MISCARR,RESTRAIN,CHOKED,NDRUNK,RDRUNK,BOTHDRUN,NDRUGS,RDRUGS,BOTHDRUG
0,108395,9999,2,1,2,0,1,99,99999,3,...,0,0,0,0,0,0,0,0,0,0
1,108397,9999,2,1,2,0,1,99,99999,3,...,0,0,0,1,2,0,0,0,0,0
2,108399,9999,2,1,2,0,1,99,99999,3,...,0,0,1,1,0,0,0,0,0,0
3,108443,9999,2,1,2,0,1,13,99999,3,...,0,0,0,1,1,0,0,0,0,0
4,108444,9999,1,1,2,0,1,99,99999,3,...,0,0,0,0,3,0,0,0,0,0


In [25]:
# look at unaltered value counts
prepare.value_counts(dfa)

CASEID

-------------------------------------------------------------

ABUSED
1    497
2    208
Name: ABUSED, dtype: int64

-------------------------------------------------------------

SCRSTATR
1      455
2      118
3      115
999     17
Name: SCRSTATR, dtype: int64

-------------------------------------------------------------

LENGTHC1
1    159
2    153
3     83
4    105
5    147
6     58
Name: LENGTHC1, dtype: int64

-------------------------------------------------------------

C1SITUAT
1    395
2     18
3     74
4     10
9    208
Name: C1SITUAT, dtype: int64

-------------------------------------------------------------

PABUSE
1    464
2     27
3      6
9    208
Name: PABUSE, dtype: int64

-------------------------------------------------------------

D3RCHILT
0      339
1      136
2      113
3       57
4       32
5        9
6        9
7        5
12       1
555      1
666      1
999      2
Name: D3RCHILT, dtype: int64

-----------------------------------------------------------

In [26]:
# change dfa to df to match conventions of prepare.py
df = dfa
#use prepare function to rename features 
prepare.rename_columns_all(df)
dfa = df

In [27]:
dfa.head()

Unnamed: 0,id,abuse_past_year,abuse_status,length_relationship,partner_abusive,num_abusers,num_children,pregnant,beaten_while_pregnant,support_score,...,partner_arrested,violence_increased,severity_increased,weapon_ever,choked_ever,rape_ever,controlled_ever,jealous,capable_murder,reassault
0,108395,1,1,1,1,1,1,3,2,9,...,2,2,2,2,2,2,2,2,2,0.0
1,108397,1,1,2,1,1,0,3,3,11,...,1,2,1,1,1,1,2,1,1,1.0
2,108399,1,1,5,1,1,6,3,2,9,...,2,2,2,2,1,2,2,2,2,1.0
3,108443,1,1,2,1,1,0,1,2,12,...,1,2,2,1,1,2,1,2,2,0.0
4,108444,1,1,2,1,1,1,2,2,11,...,1,1,1,2,2,2,2,2,2,1.0


In [28]:
#use prepare function to rename features 
prepare.rename_columns_recid(dfb)

In [29]:
dfb.head()

Unnamed: 0,id,gun_fired,anyone_high,safe_place,forced_illegal,life_danger,talk_about_it,left_or_not,medical_staff_helpful,perp_arrested_ever,...,num_forced_sex,miscarriage_resulted,restrained_by_perp,num_choked,num_perp_drunk,num_woman_drunk,num_both_drunk,num_perp_drugs,num_woman_drugs,num_both_drugs
0,108395,9999,2,1,2,0,1,99,99999,3,...,0,0,0,0,0,0,0,0,0,0
1,108397,9999,2,1,2,0,1,99,99999,3,...,0,0,0,1,2,0,0,0,0,0
2,108399,9999,2,1,2,0,1,99,99999,3,...,0,0,1,1,0,0,0,0,0,0
3,108443,9999,2,1,2,0,1,13,99999,3,...,0,0,0,1,1,0,0,0,0,0
4,108444,9999,1,1,2,0,1,99,99999,3,...,0,0,0,0,3,0,0,0,0,0


In [30]:
# run function to replace nonvalues and change values to conventionally boolean archetypes
prepare.replace_nonvals_all(dfa)

In [31]:
# run function to replace nonvalues and change values to conventionally boolean archetypes
prepare.replace_nonvals_recid(dfb)

In [32]:
dfa = dfa[dfa.abuse_past_year == 1]

In [33]:
dfa.columns

Index(['id', 'abuse_past_year', 'abuse_status', 'length_relationship',
       'partner_abusive', 'num_abusers', 'num_children', 'pregnant',
       'beaten_while_pregnant', 'support_score', 'guns_in_home',
       'jealous_past_year', 'limit_family_contact', 'location_tracking',
       'threat_hit', 'thrown_object', 'push_shove', 'slap', 'kick_punch',
       'hit_object', 'beaten', 'choked', 'threat_knife', 'threat_gun',
       'rape_with_threat', 'power_scale', 'harass_scale', 'id_age',
       'age_disparity', 'children_not_partner', 'same_sex_relationship',
       'partner_drug_use', 'partner_alcohol_use', 'threat_suicide',
       'partner_reported_child_abuse', 'partner_arrested',
       'violence_increased', 'severity_increased', 'weapon_ever',
       'choked_ever', 'rape_ever', 'controlled_ever', 'jealous',
       'capable_murder', 'reassault'],
      dtype='object')

In [34]:
dfa = dfa.drop(columns=['guns_in_home',
                        'threat_hit',
                        'beaten',
                        'choked',
                        'threat_knife',
                        'threat_gun',
                        'rape_with_threat',
                        'partner_drug_use',
                        'partner_alcohol_use',
                        'weapon_ever',
                        'choked_ever',
                        'jealous_past_year'
                       ])

In [35]:
dfa.shape

(497, 33)

In [36]:
dfb.columns

Index(['id', 'gun_fired', 'anyone_high', 'safe_place', 'forced_illegal',
       'life_danger', 'talk_about_it', 'left_or_not', 'medical_staff_helpful',
       'perp_arrested_ever', 'police_resp', 'order_protection',
       'level_severity', 'num_incidents', 'num_threats', 'num_slapping',
       'num_punching', 'num_beating', 'num_weapon', 'num_forced_sex',
       'miscarriage_resulted', 'restrained_by_perp', 'num_choked',
       'num_perp_drunk', 'num_woman_drunk', 'num_both_drunk', 'num_perp_drugs',
       'num_woman_drugs', 'num_both_drugs'],
      dtype='object')

In [37]:
# dfb = dfb.drop(columns=['M5FIRED',
#                         'M27HOW',
#                         'police_resp',
#                         'order_protection',
#                         'num_woman_drunk',
#                         'num_perp_drunk',
#                         'num_woman_drugs',
#                         'num_perp_drugs',
#                        ])

In [38]:
dfb.shape

(497, 29)

In [39]:
dfa.reassault.value_counts()

1.0    235
0.0    111
Name: reassault, dtype: int64

## _Merge dfa and dfb on ID._

In [40]:
dfb = dfa.merge(right=dfb, on='id')

In [41]:
dfb.shape

(497, 61)

In [42]:
dfb.head()

Unnamed: 0,id,abuse_past_year,abuse_status,length_relationship,partner_abusive,num_abusers,num_children,pregnant,beaten_while_pregnant,support_score,...,num_forced_sex,miscarriage_resulted,restrained_by_perp,num_choked,num_perp_drunk,num_woman_drunk,num_both_drunk,num_perp_drugs,num_woman_drugs,num_both_drugs
0,108395,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108397,1,1,2,1,1,0,0,0,11,...,0,0,0,1,2,0,0,0,0,0
2,108399,1,1,5,1,1,2,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,108443,1,1,2,1,1,0,1,0,12,...,0,0,0,1,1,0,0,0,0,0
4,108444,1,1,2,1,1,1,0,0,11,...,0,0,0,0,3,0,0,0,0,0


In [43]:
dfb.columns

Index(['id', 'abuse_past_year', 'abuse_status', 'length_relationship',
       'partner_abusive', 'num_abusers', 'num_children', 'pregnant',
       'beaten_while_pregnant', 'support_score', 'limit_family_contact',
       'location_tracking', 'thrown_object', 'push_shove', 'slap',
       'kick_punch', 'hit_object', 'power_scale', 'harass_scale', 'id_age',
       'age_disparity', 'children_not_partner', 'same_sex_relationship',
       'threat_suicide', 'partner_reported_child_abuse', 'partner_arrested',
       'violence_increased', 'severity_increased', 'rape_ever',
       'controlled_ever', 'jealous', 'capable_murder', 'reassault',
       'gun_fired', 'anyone_high', 'safe_place', 'forced_illegal',
       'life_danger', 'talk_about_it', 'left_or_not', 'medical_staff_helpful',
       'perp_arrested_ever', 'police_resp', 'order_protection',
       'level_severity', 'num_incidents', 'num_threats', 'num_slapping',
       'num_punching', 'num_beating', 'num_weapon', 'num_forced_sex',
       'm

In [44]:
dfb.head(5).T

Unnamed: 0,0,1,2,3,4
id,108395.0,108397.0,108399.0,108443.0,108444.0
abuse_past_year,1.0,1.0,1.0,1.0,1.0
abuse_status,1.0,1.0,1.0,1.0,1.0
length_relationship,1.0,2.0,5.0,2.0,2.0
partner_abusive,1.0,1.0,1.0,1.0,1.0
num_abusers,1.0,1.0,1.0,1.0,1.0
num_children,1.0,0.0,2.0,0.0,1.0
pregnant,0.0,0.0,0.0,1.0,0.0
beaten_while_pregnant,0.0,0.0,0.0,0.0,0.0
support_score,0.0,11.0,0.0,12.0,11.0


In [45]:
missing_values_col(dfb)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
id,0,0.0,0,0.0,0,0.0
abuse_past_year,0,0.0,0,0.0,0,0.0
abuse_status,0,0.0,0,0.0,0,0.0
length_relationship,0,0.0,0,0.0,0,0.0
partner_abusive,0,0.0,0,0.0,0,0.0
num_abusers,0,0.0,0,0.0,0,0.0
num_children,0,0.0,0,0.0,0,0.0
pregnant,0,0.0,0,0.0,0,0.0
beaten_while_pregnant,0,0.0,0,0.0,0,0.0
support_score,0,0.0,0,0.0,0,0.0


In [46]:
dfb.reassault.value_counts()

1.0    235
0.0    111
Name: reassault, dtype: int64

## Train / Test Split

In [49]:
np.any(np.isnan(dfb)) # True
# np.all(np.isfinite(dfb)) # False

True

## _151 NaNs in reassault column?!_

In [48]:
# b_columns = ['',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#              '',
#             ]

# X = dfb.drop(columns=['reassault'])
# y = dfb[['reassault']]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .15, random_state = 0, stratify=y)

# train_df = pd.concat([X_train, y_train], axis=1)
# test_df = pd.concat([X_test, y_test], axis=1)

# EXPLORATION