# USA State Homelessness Data Analysis

<p> Homelessness data containing estimates of homelessness in each U.S. state in 2018.</p>
<p> The individuals column is the number of homeless individuals who are not part of a family with children.</p>
<p> The family_members column is the number of homeless individuals who are part of a family with children. </p>
<p> The state_pop column is the state's total population.</p>

# Importing Library

In [1]:
import pandas as pd

# Data Loading

In [7]:
# Read the state-level homelessness workbook; the path is relative, so the
# file must sit in the notebook's working directory.
homedata = pd.read_excel(io='homelessness.xlsx')

# Data Manipulation

In [8]:
homedata.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570,864,4887681
1,Pacific,Alaska,1434,582,735139
2,Mountain,Arizona,7259,2606,7158024
3,West South Central,Arkansas,2280,432,3009733
4,Pacific,California,109008,20964,39461588


In [9]:
homedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   region          51 non-null     object
 1   state           51 non-null     object
 2   individuals     51 non-null     int64 
 3   family_members  51 non-null     int64 
 4   state_pop       51 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 2.1+ KB


In [10]:
homedata.shape

(51, 5)

In [11]:
homedata.describe()

Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0
mean,7225.784314,3504.882353,6405637.0
std,15991.025083,7805.411811,7327258.0
min,434.0,75.0,577601.0
25%,1446.5,592.0,1777414.0
50%,3082.0,1482.0,4461153.0
75%,6781.5,3196.0,7340946.0
max,109008.0,52070.0,39461590.0


# 2D NumPy array of the data


In [14]:
# to_numpy() is the accessor the pandas docs recommend over the legacy
# .values attribute; it yields the same 2D array for this frame.
homedata.to_numpy()

array([['East South Central', 'Alabama', 2570, 864, 4887681],
       ['Pacific', 'Alaska', 1434, 582, 735139],
       ['Mountain', 'Arizona', 7259, 2606, 7158024],
       ['West South Central', 'Arkansas', 2280, 432, 3009733],
       ['Pacific', 'California', 109008, 20964, 39461588],
       ['Mountain', 'Colorado', 7607, 3250, 5691287],
       ['New England', 'Connecticut', 2280, 1696, 3571520],
       ['South Atlantic', 'Delaware', 708, 374, 965479],
       ['South Atlantic', 'District of Columbia', 3770, 3134, 701547],
       ['South Atlantic', 'Florida', 21443, 9587, 21244317],
       ['South Atlantic', 'Georgia', 6943, 2556, 10511131],
       ['Pacific', 'Hawaii', 4131, 2399, 1420593],
       ['Mountain', 'Idaho', 1297, 715, 1750536],
       ['East North Central', 'Illinois', 6752, 3891, 12723071],
       ['East North Central', 'Indiana', 3776, 1482, 6695497],
       ['West North Central', 'Iowa', 1711, 1038, 3148618],
       ['West North Central', 'Kansas', 1443, 773, 2911359],
 

# Checking for index and columns

In [15]:
homedata.index

RangeIndex(start=0, stop=51, step=1)

In [16]:
homedata.columns

Index(['region', 'state', 'individuals', 'family_members', 'state_pop'], dtype='object')

# SORTING AND SUBSETTING

# Sorting homelessness data by the number of homeless individuals, from smallest to largest

In [17]:
# sort_values orders ascending by default, i.e. smallest count first
homedata_ind = homedata.sort_values(by='individuals')

In [19]:
homedata_ind.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
50,Mountain,Wyoming,434,205,577601
34,West North Central,North Dakota,467,75,758080
7,South Atlantic,Delaware,708,374,965479
39,New England,Rhode Island,747,354,1058287
45,New England,Vermont,780,511,624358


# Sorting homelessness data by the number of homeless family_members in descending order.

In [20]:
# Largest family_members count first
homedata_fam = homedata.sort_values(
    by='family_members',
    ascending=False,
)

In [21]:
homedata_fam.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
32,Mid-Atlantic,New York,39827,52070,19530351
4,Pacific,California,109008,20964,39461588
21,New England,Massachusetts,6811,13257,6882635
9,South Atlantic,Florida,21443,9587,21244317
43,West South Central,Texas,19199,6111,28628666


# Sorting homelessness first by region (ascending), and then by number of family members (descending).

In [22]:
# Each entry of `ascending` pairs with the sort key at the same position:
# region A-Z, then family_members from largest to smallest within a region.
homedata_reg_fam = homedata.sort_values(
    by=['region', 'family_members'],
    ascending=[True, False],
)

In [23]:
homedata_reg_fam.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
13,East North Central,Illinois,6752,3891,12723071
35,East North Central,Ohio,6929,3320,11676341
22,East North Central,Michigan,5209,3142,9984072
49,East North Central,Wisconsin,2740,2167,5807406
14,East North Central,Indiana,3776,1482,6695497


# Creating a Series called individuals that contains only the individuals column of homelessness.

In [24]:
# Selecting a single column label yields a pandas Series, not a DataFrame
individuals = homedata.loc[:, 'individuals']

In [26]:
individuals.head()

0      2570
1      1434
2      7259
3      2280
4    109008
Name: individuals, dtype: int64

# Creating a DataFrame called state_fam that contains only the state and family_members columns of homelessness, in that order.

In [27]:
# A list of labels keeps the result two-dimensional, in the order listed
state_fam = homedata.loc[:, ['state', 'family_members']]

In [28]:
state_fam.head()

Unnamed: 0,state,family_members
0,Alabama,864
1,Alaska,582
2,Arizona,2606
3,Arkansas,432
4,California,20964


# Creating a DataFrame called ind_state that contains the individuals and state columns of homelessness, in that order.

In [29]:
# Column order in the result follows the order of the label list
ind_state = homedata.loc[:, ['individuals', 'state']]

In [30]:
ind_state.head()

Unnamed: 0,individuals,state
0,2570,Alabama
1,1434,Alaska
2,7259,Arizona
3,2280,Arkansas
4,109008,California


# FILTERING OR SELECTING ROWS

### Filter homelessness for cases where the number of individuals is greater than ten thousand.

In [33]:
# Boolean mask: keep rows with strictly more than 10,000 homeless individuals
homedata_ind_10k = homedata.loc[homedata['individuals'].gt(10_000)]

In [34]:
homedata_ind_10k.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008,20964,39461588
9,South Atlantic,Florida,21443,9587,21244317
32,Mid-Atlantic,New York,39827,52070,19530351
37,Pacific,Oregon,11139,3337,4181886
43,West South Central,Texas,19199,6111,28628666


### Filter homelessness for cases where the USA Census region is "Mountain"

In [35]:
# Keep only the rows whose region label is exactly 'Mountain'
homedata_mount = homedata.loc[homedata['region'].eq('Mountain')]

In [36]:
homedata_mount.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
2,Mountain,Arizona,7259,2606,7158024
5,Mountain,Colorado,7607,3250,5691287
12,Mountain,Idaho,1297,715,1750536
26,Mountain,Montana,983,422,1060665
28,Mountain,Nevada,7058,486,3027341


### Filter homelessness for cases where the number of family_members is less than one thousand and the region is "Pacific".

In [38]:
# Both conditions must hold: fewer than 1,000 family members AND Pacific region
homedata_fam_1k_reg = homedata.loc[
    homedata['family_members'].lt(1000) & homedata['region'].eq('Pacific')
]

In [39]:
homedata_fam_1k_reg.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
1,Pacific,Alaska,1434,582,735139


# Subsetting rows by categorical variables

### Filter homelessness for cases where the USA census region is "South Atlantic" or it is "Mid-Atlantic".

In [40]:
# Membership test: region is either 'South Atlantic' or 'Mid-Atlantic'
homedata_atlan_mid = homedata[
    homedata['region'].isin(['South Atlantic', 'Mid-Atlantic'])
]

In [41]:
homedata_atlan_mid.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
7,South Atlantic,Delaware,708,374,965479
8,South Atlantic,District of Columbia,3770,3134,701547
9,South Atlantic,Florida,21443,9587,21244317
10,South Atlantic,Georgia,6943,2556,10511131
20,South Atlantic,Maryland,4914,2230,6035802


### Filter homelessness for cases where the state is in canu, the list of Mojave Desert states.

In [45]:
# States to keep; rows are matched on the `state` column
canu = [
    "California",
    "Arizona",
    "Nevada",
    "Utah",
]
homedata_mojave = homedata.loc[homedata["state"].isin(canu)]


In [46]:
homedata_mojave.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
2,Mountain,Arizona,7259,2606,7158024
4,Pacific,California,109008,20964,39461588
28,Mountain,Nevada,7058,486,3027341
44,Mountain,Utah,1904,972,3153550


# ADDING NEW COLUMN (Mutating, Feature Engineering or Transforming)

### Add a new column to homelessness, named total, containing the sum of the individuals and family_members columns.

In [48]:
# Row-wise sum of the two homeless counts; adds a `total` column to homedata
homedata['total'] = homedata['individuals'].add(homedata['family_members'])

In [49]:
homedata.head()

Unnamed: 0,region,state,individuals,family_members,state_pop,total
0,East South Central,Alabama,2570,864,4887681,3434
1,Pacific,Alaska,1434,582,735139,2016
2,Mountain,Arizona,7259,2606,7158024,9865
3,West South Central,Arkansas,2280,432,3009733,2712
4,Pacific,California,109008,20964,39461588,129972


### Add another column to homelessness, named p_individuals, containing the proportion of homeless people in each state who are individuals.

In [50]:
# Share of each state's homeless total made up of individuals.
# Relies on the `total` column already being present on homedata.
homedata['p_individuals'] = homedata['individuals'].div(homedata['total'])

In [51]:
homedata.head()

Unnamed: 0,region,state,individuals,family_members,state_pop,total,p_individuals
0,East South Central,Alabama,2570,864,4887681,3434,0.748398
1,Pacific,Alaska,1434,582,735139,2016,0.71131
2,Mountain,Arizona,7259,2606,7158024,9865,0.735834
3,West South Central,Arkansas,2280,432,3009733,2712,0.840708
4,Pacific,California,109008,20964,39461588,129972,0.838704


### Which state has the highest number of homeless individuals per 10,000 people in the state?

In [52]:
# Homeless individuals per 10,000 residents of each state
# (scale first, then divide by the state population)
homedata["indiv_per_10k"] = (
    homedata["individuals"].mul(10000).div(homedata["state_pop"])
)

# Keep only the states whose rate is above 20 per 10,000 residents
high_homedata = homedata.loc[homedata["indiv_per_10k"].gt(20)]

# Order high_homedata from the highest rate to the lowest
high_homedata_srt = high_homedata.sort_values(by="indiv_per_10k", ascending=False)

# From high_homedata_srt, keep just the state and indiv_per_10k columns
result = high_homedata_srt.loc[:, ["state", "indiv_per_10k"]]


In [53]:
result

Unnamed: 0,state,indiv_per_10k
8,District of Columbia,53.738381
11,Hawaii,29.079406
4,California,27.623825
37,Oregon,26.636307
28,Nevada,23.314189
47,Washington,21.829195
32,New York,20.392363
