In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory that holds the three US-states CSV files. Change this single
# constant to point at your local copy of the dataset.
DATA_DIR = Path(r"C:\Users\gdg13\OneDrive\Desktop\data_science\data-USstates-master")

# Build every file path from DATA_DIR so the folder is spelled out only once.
filepath = DATA_DIR / "state-population.csv"
pop = pd.read_csv(filepath)
filepath1 = DATA_DIR / "state-areas.csv"
areas = pd.read_csv(filepath1)
filepath2 = DATA_DIR / "state-abbrevs.csv"
abbrevs = pd.read_csv(filepath2)

In [3]:
# Print the first five rows of each raw table, in the order they were loaded.
for frame in (pop, areas, abbrevs):
    print(frame.head())

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


Given this information, say we want to compute a relatively straightforward result: rank US states and territories by their 2010 population density. We clearly have the data here to find this result, but we’ll have to combine the datasets to get it.

We’ll start with a many-to-one merge that will give us the full state name within the population DataFrame. We want to merge based on the state/region column of pop, and the abbreviation column of abbrevs. We’ll use how='outer' to make sure no data is thrown away due to mismatched labels.

In [4]:
# Attach the full state name to every population row. An outer join keeps the
# rows of both tables even when a key has no partner; the unmatched columns
# are filled with NaN so mismatches can be inspected afterwards.
merged = pop.merge(
    abbrevs,
    how='outer',
    left_on='state/region',
    right_on='abbreviation',
)

In [5]:
# Show the first five rows of the merged table.
merged.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


In [6]:
# After the merge, 'abbreviation' repeats the information in 'state/region',
# so drop it. errors='ignore' keeps the cell safe to re-run: once the column
# is gone, a second run is a no-op instead of raising KeyError.
merged = merged.drop(columns='abbreviation', errors='ignore')
merged.head()  # preview the first five rows

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


Let’s double-check whether there were any mismatches here, which we can do by
looking for rows with nulls:

In [7]:
# Per-column flag: True when the column contains at least one missing value.
merged.isna().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [8]:
# First five rows whose population value is missing.
merged.loc[merged['population'].isna()].head()

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,


In [9]:
# Take the first five entries of the 'population' column and test each one
# for a missing value: True marks a null entry, False a present one.
merged['population'].head().isna()

0    False
1    False
2    False
3    False
4    False
Name: population, dtype: bool

It appears that all the null population values are from Puerto Rico prior to the year
2000; this is likely because the data was not available from the original source.

More importantly, we also see that some of the new state entries are null, which
means that there was no corresponding entry in the abbrevs key! Let’s figure out
which regions lack this match:

In [10]:
# Rows that received no full state name from the merge (PR and USA entries).
merged[merged['state'].isna()]

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,
...,...,...,...,...,...
2539,USA,total,2010,309326295.0,
2540,USA,under18,2011,73902222.0,
2541,USA,total,2011,311582564.0,
2542,USA,under18,2012,73708179.0,


In [11]:
# Distinct 'state/region' codes among the rows whose 'state' is missing.
merged['state/region'][merged['state'].isna()].unique()

array(['PR', 'USA'], dtype=object)

In [12]:
# Fill in the full names for the two region codes absent from abbrevs.
region_names = {'PR': 'Puerto Rico', 'USA': 'United States'}
for code, full_name in region_names.items():
    merged.loc[merged['state/region'] == code, 'state'] = full_name
merged

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama
...,...,...,...,...,...
2539,USA,total,2010,309326295.0,United States
2540,USA,under18,2011,73902222.0,United States
2541,USA,total,2011,311582564.0,United States
2542,USA,under18,2012,73708179.0,United States


In [13]:
# Re-check for nulls: only the 'population' column should still have gaps.
merged.isna().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [18]:
# Which states have rows with a missing population? (Only Puerto Rico.)
merged['state'][merged['population'].isna()].unique()

array(['Puerto Rico'], dtype=object)

Now we can merge the result with the area data using a similar procedure. Examining
our results, we will want to join on the state column in both DataFrames.

In [21]:
# Add each state's area. A left join keeps every population row; states with
# no entry in `areas` get NaN in the area column.
final = merged.merge(areas, how='left', on='state')
final

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0
...,...,...,...,...,...,...
2539,USA,total,2010,309326295.0,United States,
2540,USA,under18,2011,73902222.0,United States,
2541,USA,total,2011,311582564.0,United States,
2542,USA,under18,2012,73708179.0,United States,


In [22]:
# Interactive IPython help lookup: shows the docstring of pd.merge.
# NOTE(review): leftover exploration aid; consider removing before sharing.
pd.merge?

In [23]:
# Per-column flag: True when the column of `final` has any missing value.
final.isna().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

There are nulls in the area column; we can take a look to see which regions were
ignored here:

In [24]:
# Distinct state names among the rows that have no area value.
final.loc[final['area (sq. mi)'].isna(), 'state'].unique()

array(['United States'], dtype=object)

In [25]:
# Drop every row that still has a missing value in any column: the Puerto Rico
# rows without a population and the United States rows without an area.
# Rebinding the result instead of using inplace=True avoids mutating the frame
# that earlier cells displayed.
final = final.dropna()
final

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0
...,...,...,...,...,...,...
2491,PR,under18,2010,896945.0,Puerto Rico,3515.0
2492,PR,under18,2011,869327.0,Puerto Rico,3515.0
2493,PR,total,2011,3686580.0,Puerto Rico,3515.0
2494,PR,under18,2012,841740.0,Puerto Rico,3515.0


In [26]:
# Keep only the total-population rows for the year 2010.
data2010 = final.query("(year == 2010) and (ages == 'total')")
data2010.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [27]:
# Same selection as the query() version, written with explicit boolean masks.
is_2010 = final['year'] == 2010
is_total = final['ages'] == 'total'
data2010_1 = final[is_2010 & is_total]
data2010_1.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [28]:
# Index the 2010 totals by state name so the density Series is labelled by
# state. The guard keeps this cell re-runnable: after the first run 'state' is
# the index and no longer a column, so calling set_index('state') again would
# raise KeyError.
if data2010.index.name != 'state':
    data2010 = data2010.set_index('state')

# Population density: people per square mile.
density = data2010['population'] / data2010['area (sq. mi)']
density.head()

state
Alabama        91.287603
Alaska          1.087509
Arizona        56.214497
Arkansas       54.948667
California    228.051342
dtype: float64

In [29]:
# Rank from most to least dense. Rebinding the sorted result (rather than
# sorting with inplace=True) avoids mutating the Series shown by the
# previous cell.
density = density.sort_values(ascending=False)
density.head()


state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [30]:
# The five least densely populated entries (end of the descending ranking).
density.tail(n=5)

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64