### Inspecting a DataFrame

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Read the homelessness CSV, using its first column as the row index.
homelessness = pd.read_csv(
    filepath_or_buffer="datasets/homelessness.csv",
    index_col=0,
)

In [14]:
# Preview the first few rows of the frame.
homelessness.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


In [15]:
# Print the column names, non-null counts, dtypes and memory usage.
homelessness.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   region          51 non-null     object 
 1   state           51 non-null     object 
 2   individuals     51 non-null     float64
 3   family_members  51 non-null     float64
 4   state_pop       51 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 2.4+ KB


In [16]:
# (number of rows, number of columns)
homelessness.shape

(51, 5)

In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
homelessness.describe()

Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0
mean,7225.784314,3504.882353,6405637.0
std,15991.025083,7805.411811,7327258.0
min,434.0,75.0,577601.0
25%,1446.5,592.0,1777414.0
50%,3082.0,1482.0,4461153.0
75%,6781.5,3196.0,7340946.0
max,109008.0,52070.0,39461590.0


### Parts of a DataFrame

In [21]:
# The underlying data as a two-dimensional NumPy array (no row or column labels).
# `to_numpy()` is the accessor the pandas documentation recommends over `.values`;
# both return the same array here.
homelessness.to_numpy()

array([['East South Central', 'Alabama', 2570.0, 864.0, 4887681],
       ['Pacific', 'Alaska', 1434.0, 582.0, 735139],
       ['Mountain', 'Arizona', 7259.0, 2606.0, 7158024],
       ['West South Central', 'Arkansas', 2280.0, 432.0, 3009733],
       ['Pacific', 'California', 109008.0, 20964.0, 39461588],
       ['Mountain', 'Colorado', 7607.0, 3250.0, 5691287],
       ['New England', 'Connecticut', 2280.0, 1696.0, 3571520],
       ['South Atlantic', 'Delaware', 708.0, 374.0, 965479],
       ['South Atlantic', 'District of Columbia', 3770.0, 3134.0, 701547],
       ['South Atlantic', 'Florida', 21443.0, 9587.0, 21244317],
       ['South Atlantic', 'Georgia', 6943.0, 2556.0, 10511131],
       ['Pacific', 'Hawaii', 4131.0, 2399.0, 1420593],
       ['Mountain', 'Idaho', 1297.0, 715.0, 1750536],
       ['East North Central', 'Illinois', 6752.0, 3891.0, 12723071],
       ['East North Central', 'Indiana', 3776.0, 1482.0, 6695497],
       ['West North Central', 'Iowa', 1711.0, 1038.0, 3148618]

In [23]:
# Index object holding the column labels.
homelessness.columns

Index(['region', 'state', 'individuals', 'family_members', 'state_pop'], dtype='object')

In [24]:
# Index object holding the row labels.
homelessness.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
            50],
           dtype='int64')

### Sorting Rows

In [26]:
# Order rows from the fewest to the most homeless individuals, then preview.
homelessness_ind = homelessness.sort_values("individuals", ascending=True)
homelessness_ind.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
50,Mountain,Wyoming,434.0,205.0,577601
34,West North Central,North Dakota,467.0,75.0,758080
7,South Atlantic,Delaware,708.0,374.0,965479
39,New England,Rhode Island,747.0,354.0,1058287
45,New England,Vermont,780.0,511.0,624358


In [27]:
# Largest family_members counts first.
homelessness_fam = homelessness.sort_values(
    "family_members",
    ascending=False,
)
homelessness_fam.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
4,Pacific,California,109008.0,20964.0,39461588
21,New England,Massachusetts,6811.0,13257.0,6882635
9,South Atlantic,Florida,21443.0,9587.0,21244317
43,West South Central,Texas,19199.0,6111.0,28628666


In [29]:
# Sort by region ascending; within each region, by family_members descending.
homelessness_reg_fam = homelessness.sort_values(
    by=["region", "family_members"],
    ascending=[True, False],
)
homelessness_reg_fam.head(n=10)

Unnamed: 0,region,state,individuals,family_members,state_pop
13,East North Central,Illinois,6752.0,3891.0,12723071
35,East North Central,Ohio,6929.0,3320.0,11676341
22,East North Central,Michigan,5209.0,3142.0,9984072
49,East North Central,Wisconsin,2740.0,2167.0,5807406
14,East North Central,Indiana,3776.0,1482.0,6695497
42,East South Central,Tennessee,6139.0,1744.0,6771631
17,East South Central,Kentucky,2735.0,953.0,4461153
0,East South Central,Alabama,2570.0,864.0,4887681
24,East South Central,Mississippi,1024.0,328.0,2981020
32,Mid-Atlantic,New York,39827.0,52070.0,19530351


### Subsetting Columns

In [30]:
# Select a single column; the result is a Series.
individuals = homelessness.loc[:, "individuals"]
individuals.head()

0      2570.0
1      1434.0
2      7259.0
3      2280.0
4    109008.0
Name: individuals, dtype: float64

In [32]:
# Select two columns (in this order); the result is a DataFrame.
state_fam = homelessness.loc[:, ["state", "family_members"]]
state_fam.head()

Unnamed: 0,state,family_members
0,Alabama,864.0
1,Alaska,582.0
2,Arizona,2606.0
3,Arkansas,432.0
4,California,20964.0


In [33]:
# Column order in the result follows the order of the requested labels.
ind_state = homelessness.loc[:, ["individuals", "state"]]
ind_state.head()

Unnamed: 0,individuals,state
0,2570.0,Alabama
1,1434.0,Alaska
2,7259.0,Arizona
3,2280.0,Arkansas
4,109008.0,California


### Subsetting Rows

In [36]:
# Keep only the rows with more than 10,000 homeless individuals.
homelessness.loc[homelessness["individuals"].gt(10000)]

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588
9,South Atlantic,Florida,21443.0,9587.0,21244317
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
37,Pacific,Oregon,11139.0,3337.0,4181886
43,West South Central,Texas,19199.0,6111.0,28628666
47,Pacific,Washington,16424.0,5880.0,7523869


In [38]:
# Keep only the rows whose region is exactly "Mountain".
homelessness.loc[homelessness["region"].eq("Mountain")]

Unnamed: 0,region,state,individuals,family_members,state_pop
2,Mountain,Arizona,7259.0,2606.0,7158024
5,Mountain,Colorado,7607.0,3250.0,5691287
12,Mountain,Idaho,1297.0,715.0,1750536
26,Mountain,Montana,983.0,422.0,1060665
28,Mountain,Nevada,7058.0,486.0,3027341
31,Mountain,New Mexico,1949.0,602.0,2092741
44,Mountain,Utah,1904.0,972.0,3153550
50,Mountain,Wyoming,434.0,205.0,577601


In [39]:
# Both conditions must hold: Pacific region AND fewer than 1,000 family members.
homelessness.loc[
    homelessness["region"].eq("Pacific")
    & homelessness["family_members"].lt(1000)
]

Unnamed: 0,region,state,individuals,family_members,state_pop
1,Pacific,Alaska,1434.0,582.0,735139


### Subsetting Rows by Categorical Variables

In [42]:
# Keep rows whose region is any one of the listed values.
homelessness.loc[
    homelessness["region"].isin(["South Atlantic", "Mid-Atlantic"])
]

Unnamed: 0,region,state,individuals,family_members,state_pop
7,South Atlantic,Delaware,708.0,374.0,965479
8,South Atlantic,District of Columbia,3770.0,3134.0,701547
9,South Atlantic,Florida,21443.0,9587.0,21244317
10,South Atlantic,Georgia,6943.0,2556.0,10511131
20,South Atlantic,Maryland,4914.0,2230.0,6035802
30,Mid-Atlantic,New Jersey,6048.0,3350.0,8886025
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
33,South Atlantic,North Carolina,6451.0,2817.0,10381615
38,Mid-Atlantic,Pennsylvania,8163.0,5349.0,12800922
40,South Atlantic,South Carolina,3082.0,851.0,5084156


In [45]:
# States to keep.
state_list = [
    "California",
    "Arizona",
    "Nevada",
    "Utah",
]
# Keep rows whose state appears in state_list.
homelessness.loc[homelessness["state"].isin(state_list)]

Unnamed: 0,region,state,individuals,family_members,state_pop
2,Mountain,Arizona,7259.0,2606.0,7158024
4,Pacific,California,109008.0,20964.0,39461588
28,Mountain,Nevada,7058.0,486.0,3027341
44,Mountain,Utah,1904.0,972.0,3153550


### Adding New Columns

In [49]:
# total = individuals + family_members, computed element-wise per row.
homelessness["total"] = homelessness["individuals"].add(
    homelessness["family_members"]
)
homelessness["total"].head()

0      3434.0
1      2016.0
2      9865.0
3      2712.0
4    129972.0
Name: total, dtype: float64

In [50]:
# Proportion of each row's total that is individuals.
homelessness["p_individuals"] = homelessness["individuals"].div(
    homelessness["total"]
)
homelessness["p_individuals"].head()

0    0.748398
1    0.711310
2    0.735834
3    0.840708
4    0.838704
Name: p_individuals, dtype: float64

In [54]:
# Homeless individuals per 10,000 residents: scale first, then divide by state_pop.
homelessness["indiv_per_10k"] = (
    homelessness["individuals"].mul(10000).div(homelessness["state_pop"])
)
homelessness["indiv_per_10k"].head()

0     5.258117
1    19.506515
2    10.141067
3     7.575423
4    27.623825
Name: indiv_per_10k, dtype: float64

In [55]:
# Rows where indiv_per_10k is strictly greater than 20.
high_homelessness = homelessness.loc[homelessness["indiv_per_10k"].gt(20)]
high_homelessness.head()

Unnamed: 0,region,state,individuals,family_members,state_pop,total,p_individuals,indiv_per_10k
4,Pacific,California,109008.0,20964.0,39461588,129972.0,0.838704,27.623825
8,South Atlantic,District of Columbia,3770.0,3134.0,701547,6904.0,0.54606,53.738381
11,Pacific,Hawaii,4131.0,2399.0,1420593,6530.0,0.632619,29.079406
28,Mountain,Nevada,7058.0,486.0,3027341,7544.0,0.935578,23.314189
32,Mid-Atlantic,New York,39827.0,52070.0,19530351,91897.0,0.433387,20.392363


In [56]:
# Highest indiv_per_10k first.
high_homelessness_srt = high_homelessness.sort_values(
    "indiv_per_10k",
    ascending=False,
)
high_homelessness_srt

Unnamed: 0,region,state,individuals,family_members,state_pop,total,p_individuals,indiv_per_10k
8,South Atlantic,District of Columbia,3770.0,3134.0,701547,6904.0,0.54606,53.738381
11,Pacific,Hawaii,4131.0,2399.0,1420593,6530.0,0.632619,29.079406
4,Pacific,California,109008.0,20964.0,39461588,129972.0,0.838704,27.623825
37,Pacific,Oregon,11139.0,3337.0,4181886,14476.0,0.769481,26.636307
28,Mountain,Nevada,7058.0,486.0,3027341,7544.0,0.935578,23.314189
47,Pacific,Washington,16424.0,5880.0,7523869,22304.0,0.73637,21.829195
32,Mid-Atlantic,New York,39827.0,52070.0,19530351,91897.0,0.433387,20.392363


In [57]:
# Narrow the sorted frame down to the state name and its indiv_per_10k value.
result = high_homelessness_srt.loc[:, ["state", "indiv_per_10k"]]
result

Unnamed: 0,state,indiv_per_10k
8,District of Columbia,53.738381
11,Hawaii,29.079406
4,California,27.623825
37,Oregon,26.636307
28,Nevada,23.314189
47,Washington,21.829195
32,New York,20.392363
