# PLAN

0. [x] Get the data.
0. [x] Prepare the data for exploration.
0. [x] Split the data into train and test sets.
0. [ ] Explore the data.
0. [ ] Do cross-validation with the data.
0. [ ] Code and evaluate several models.
0. [ ] Evaluate the models on out-of-sample data (test set).

# ENVIRONMENT

In [2]:
import os
import acquire
import prepare

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

# conda install -c glemaitre imbalanced-learn
from imblearn.ensemble import BalancedBaggingClassifier

# ACQUISITION

In [3]:
# Load the raw CSV files into one DataFrame each, in file-number order.
# There is no df6: only files 01-05 and 07 are read.
df1, df2, df3, df4, df5, df7 = (
    acquire.read_data('data{:02d}.csv'.format(file_number))
    for file_number in (1, 2, 3, 4, 5, 7)
)

In [4]:
# list of dataframes before merge
# NOTE(review): this list holds the frames exactly as read. The later
# column-subsetting cell rebinds df1..df7 to new objects, so `dfs` keeps
# pointing at the full, unfiltered frames. It is not used in the cells
# visible here -- confirm it is still needed.
dfs = [df1, df2, df3, df4, df5, df7]

In [5]:
# Columns to keep from dataframes 1, 2, 3, 4, 5, and 7.
# Every list leads with CASEID, the key the merge cell joins on.

one_cols = [
    'CASEID',
    'ABUSED', 'SCRSTATR', 'LENGTHC1', 'C1SITUAT', 'PABUSE',
]

two_cols = ['CASEID', 'D3RCHILT']

three_cols = ['CASEID', 'E13PRGNT', 'N7PREGNT', 'TOTSUPRT']

four_cols = [
    'CASEID',
    'G1NUMBER',
    'H1JEALUS', 'H2LIMIT', 'H3KNOWNG',
    'J1HIT', 'J2THROWN', 'J3PUSH', 'J4SLAP', 'J5KICK', 'J6OBJECT',
    'J7BEAT', 'J8CHOKE', 'J9KNIFE', 'J10GUN', 'J11SEX',
    'POWER', 'HARASS',
]

five_cols = ['CASEID', 'B1AGE', 'AGEDISP', 'STDETAI']

seven_cols = [
    'CASEID',
    'SAMESEXR',
    'N11DRUGS', 'N12ALCHL', 'N13SUHIM', 'N16CHILD', 'N17ARRST',
    'N1FRQNCY', 'N2SVRITY', 'N3WEAPON', 'N4CHOKE', 'N5SEX',
    'N6CONTRL', 'N8JEALUS', 'N10CPBLE',
]

In [6]:
# Trim each frame to its selected columns (CASEID plus the chosen features).
df1, df2, df3 = df1[one_cols], df2[two_cols], df3[three_cols]
df4, df5, df7 = df4[four_cols], df5[five_cols], df7[seven_cols]

In [7]:
# Print each frame's shape, one per line, to confirm the row counts agree
# before merging on CASEID.
print(*(frame.shape for frame in (df1, df2, df3, df4, df5, df7)), sep='\n')

(705, 6)
(705, 2)
(705, 4)
(705, 18)
(705, 4)
(705, 15)


In [9]:
# do the merges
# Join the six column subsets on CASEID, one after another.
# validate='one_to_one' makes pandas raise MergeError if CASEID is duplicated
# on either side of a step, instead of silently multiplying rows. The join is
# still the default inner join, so the row-count check in the next cell
# remains the guard against CASEIDs that are missing from one of the files.
dfa = (
    df1.merge(right=df2, on='CASEID', validate='one_to_one')
       .merge(right=df3, on='CASEID', validate='one_to_one')
       .merge(right=df4, on='CASEID', validate='one_to_one')
       .merge(right=df5, on='CASEID', validate='one_to_one')
       .merge(right=df7, on='CASEID', validate='one_to_one')
)

In [10]:
# establish merged shape: expect the same 705 rows as each input frame and
# 44 columns (the 49 selected columns minus the 5 repeated CASEID keys)
dfa.shape

(705, 44)

In [11]:
# look at unaltered value counts
# Judging by the output below, this prints the value counts of each feature
# column, still in the raw survey coding (including large codes such as
# 999 and 9999 that are handled in the cleaning step further down).
prepare.value_counts(dfa)

1    497
2    208
Name: ABUSED, dtype: int64
1      455
2      118
3      115
999     17
Name: SCRSTATR, dtype: int64
1    159
2    153
3     83
4    105
5    147
6     58
Name: LENGTHC1, dtype: int64
1    395
2     18
3     74
4     10
9    208
Name: C1SITUAT, dtype: int64
1    464
2     27
3      6
9    208
Name: PABUSE, dtype: int64
0      339
1      136
2      113
3       57
4       32
5        9
6        9
7        5
12       1
555      1
666      1
999      2
Name: D3RCHILT, dtype: int64
1       74
2      127
3      492
888      9
999      3
Name: E13PRGNT, dtype: int64
1      130
2      303
3       52
9      208
777      5
999      7
Name: N7PREGNT, dtype: int64
0        9
1       10
2       16
3       27
4       20
5       28
6       35
7       47
8       57
9       68
10      98
11     134
12     154
999      2
Name: TOTSUPRT, dtype: int64
1        22
2         6
3         2
7         1
15        1
9999    654
888       3
999      16
Name: G1NUMBER, dtype: int64
1      487
2  

In [12]:
# change dfa to df to match conventions of prepare.py
# NOTE: this binds a second name to the same DataFrame; it is not a copy,
# so any in-place change made through `df` is also visible through `dfa`.
df = dfa
# use prepare function to rename features
# NOTE(review): the return value is discarded, so this relies on
# rename_columns modifying `df` in place -- confirm in prepare.py.
prepare.rename_columns(df)

In [13]:
# ensure renaming of features worked: the column listing should now show the
# descriptive lowercase names instead of the raw survey codes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 705 entries, 0 to 704
Data columns (total 44 columns):
id                              705 non-null int64
abuse_past_year                 705 non-null int64
abuse_status                    705 non-null int64
length_relationship             705 non-null int64
partner_abusive                 705 non-null int64
num_abusers                     705 non-null int64
num_children                    705 non-null int64
pregnant                        705 non-null int64
beaten_while_pregnant           705 non-null int64
support_score                   705 non-null int64
guns_in_home                    705 non-null int64
jealous_past_year               705 non-null int64
limit_fam_contact               705 non-null int64
location_tracking               705 non-null int64
threat_hit                      705 non-null int64
thrown_obj                      705 non-null int64
push_shove                      705 non-null int64
slap                        

In [14]:
# Replace placeholder (non-value) codes and recode yes/no-style features to
# conventional 0/1 booleans.
# NOTE(review): the return value is discarded, so this relies on
# replace_nonvals modifying `df` in place -- confirm in prepare.py.
prepare.replace_nonvals(df)

In [15]:
# check value counts to establish things worked
# NOTE(review): in the captured output below, num_children still contains
# 555, 666 and 999, and support_score no longer contains 2, 3, 4 or 9. The
# counts are consistent with those scores having been folded into 0 and 1,
# which looks unintended for an ordinal score. Verify replace_nonvals
# against these two columns.
prepare.value_counts(df)

0    208
1    497
Name: abuse_past_year, dtype: int64
0    250
1    455
Name: abuse_status, dtype: int64
1    159
2    153
3     83
4    105
5    147
6     58
Name: length_relationship, dtype: int64
0    300
1    405
Name: partner_abusive, dtype: int64
1    672
2     33
Name: num_abusers, dtype: int64
0      339
1      136
2      226
555      1
666      1
999      2
Name: num_children, dtype: int64
0    631
1     74
Name: pregnant, dtype: int64
0    575
1    130
Name: beaten_while_pregnant, dtype: int64
0     122
1      30
5      28
6      35
7      47
8      57
10     98
11    134
12    154
Name: support_score, dtype: int64
0    681
1     24
Name: guns_in_home, dtype: int64
0    217
1    488
Name: jealous_past_year, dtype: int64
0    368
1    337
Name: limit_fam_contact, dtype: int64
0    242
1    463
Name: location_tracking, dtype: int64
0    330
1    375
Name: threat_hit, dtype: int64
0    469
1    236
Name: thrown_obj, dtype: int64
0    268
1    437
Name: push_shove, dtype: int64
0