In [26]:
# Import libraries
import pandas as pd

# Wikipedia page that lists GDP figures for U.S. states and territories
url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_GDP#GDP_by_state'

# Download the page and parse the HTML tables found on it into a list of DataFrames
tables = pd.read_html(io=url)

In [27]:
# Build the working DataFrame from the first table parsed from the page.
# pd.read_html already returns DataFrames, so an explicit copy is taken here:
# the cleaning steps below modify wiki_data, and those changes must not leak
# back into the parsed original kept in `tables`. That way re-running this
# cell always restores the raw table.
wiki_data = tables[0].copy()

wiki_data.head()

Unnamed: 0_level_0,State or federal district,Nominal GDPat current prices 2022(millions ofU.S. dollars)[1],Nominal GDPat current prices 2022(millions ofU.S. dollars)[1],Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1],Real GDP growthrate (2021–2022)[1],Nominal GDP per capita 2022[1][3],Nominal GDP per capita 2022[1][3],% of national[1],% of national[1]
Unnamed: 0_level_1,State or federal district,2022,2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],2022,2023,2022,2021
0,,,,,,,,,,
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222",14.69%,14.49%
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130",8.69%,8.55%
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380",8.11%,8.31%
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390",5.37%,5.34%


In [28]:
# Inspect the column labels of the scraped table
wiki_data.columns

MultiIndex([(                                    'State or federal district', ...),
            ('Nominal GDPat current prices 2022(millions ofU.S. dollars)[1]', ...),
            ('Nominal GDPat current prices 2022(millions ofU.S. dollars)[1]', ...),
            (            'Annual GDP changeat current prices 2022(21–22)[1]', ...),
            (            'Annual GDP changeat current prices 2022(21–22)[1]', ...),
            (                           'Real GDP growthrate (2021–2022)[1]', ...),
            (                            'Nominal GDP per capita 2022[1][3]', ...),
            (                            'Nominal GDP per capita 2022[1][3]', ...),
            (                                             '% of national[1]', ...),
            (                                             '% of national[1]', ...)],
           )

**Step 1**

In [29]:
"""
As seen above, the columns are in a multilevel index. Joining the data to the other data sets will be easier
if they have the same number of levels.
To do that, we will use droplevel and pass 0 so that the first (top) level is removed.
"""
# Strip the outer (level 0) header so only the inner labels remain
flattened_columns = wiki_data.columns.droplevel(level=0)
wiki_data.columns = flattened_columns
wiki_data.head()

Unnamed: 0,State or federal district,2022,2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],2022.1,2023.1,2022.2,2021
0,,,,,,,,,,
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222",14.69%,14.49%
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130",8.69%,8.55%
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380",8.11%,8.31%
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390",5.37%,5.34%


In [30]:
"""
With that, the columns are all in one level and we can rename the columns as needed. 
"""
# Show the column labels now that only one level is left
wiki_data.columns

Index(['State or federal district', '2022', '2023',
       'Annual GDP changeat current prices 2022(21–22)[1]',
       'Annual GDP changeat current prices 2022(21–22)[1].1',
       'Real GDP growthrate (2021–2022)[1]', '2022', '2023', '2022', '2021'],
      dtype='object')

**Step 2**

In [31]:
"""
Now that we have removed the top index layer, the first row is all NaN values.
We can use the dropna function to drop every row that contains nulls from the data. 
"""
# Discard every row that has at least one missing value (dropna's default)
rows_without_nulls = wiki_data.dropna()
wiki_data = rows_without_nulls
wiki_data.head()

Unnamed: 0,State or federal district,2022,2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],2022.1,2023.1,2022.2,2021
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222",14.69%,14.49%
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130",8.69%,8.55%
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380",8.11%,8.31%
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390",5.37%,5.34%
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,"$82,126","$85,111",4.11%,4.13%


**Step 3**

In [32]:
"""
The last 2 columns of this data provide % of national which will not be needed for this analysis. 
These two columns can be dropped using iloc.
"""
# Keep all rows, but slice off the two right-most columns by position
n_trailing = 2
wiki_data = wiki_data.iloc[:, :-n_trailing]
wiki_data.head()

Unnamed: 0,State or federal district,2022,2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],2022.1,2023.1
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222"
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130"
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380"
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390"
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,"$82,126","$85,111"


**Step 4**

In [33]:
"""
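Since the top index level was dropped in a previous step, the columns labelled only '2022' no longer say what they measure. Every column labelled '2022' is renamed to 'PerCapitaGDP_2022' here; the nominal GDP column that also receives this name is corrected in Step 5. The '2021' column was already removed in Step 3.
This can be completed with the rename function in pandas.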
"""
# Rename every column labelled '2022' to 'PerCapitaGDP_2022'.
# The label is duplicated at this point, so the nominal-GDP column is renamed
# as well; it is corrected by position in the next step.
# The former '2021' entry was removed from the mapping: that column was
# already dropped in Step 3, so the entry never matched anything.
wiki_data = wiki_data.rename(columns={'2022': 'PerCapitaGDP_2022'})
wiki_data.head()

Unnamed: 0,State or federal district,PerCapitaGDP_2022,2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],PerCapitaGDP_2022.1,2023.1
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222"
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130"
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380"
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390"
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,"$82,126","$85,111"


**Step 5**

In [34]:
"""
After the previous rename, two columns share the name 'PerCapitaGDP_2022'. The second column (position 1 when counting from zero) holds nominal GDP and needs to be renamed to 'GDP_2022'. 
This will be done by position so that the other column with the same name is not changed as well. 
"""
# Relabel the second column (position 1) as 'GDP_2022'.
# A new Index is assigned instead of writing into wiki_data.columns.values:
# pandas treats an Index as immutable, and mutating its backing array in
# place is unsupported.
column_labels = wiki_data.columns.tolist()
column_labels[1] = 'GDP_2022'
wiki_data.columns = pd.Index(column_labels, name=wiki_data.columns.name)
wiki_data.head()

Unnamed: 0,State or federal district,GDP_2022,2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],PerCapitaGDP_2022,2023.1
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222"
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130"
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380"
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390"
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,"$82,126","$85,111"


**Step 5 (continued)**

In [35]:
"""
Next, the third column, 2023, had another level above it which stated it was GDP for 2023. 
This column will need to be renamed to display the accurate information.
"""
# Relabel the third column (position 2) as 'GDP_2023'.
# A new Index is assigned instead of writing into wiki_data.columns.values:
# pandas treats an Index as immutable, and mutating its backing array in
# place is unsupported.
column_labels = wiki_data.columns.tolist()
column_labels[2] = 'GDP_2023'
wiki_data.columns = pd.Index(column_labels, name=wiki_data.columns.name)
wiki_data.head()

Unnamed: 0,State or federal district,GDP_2022,GDP_2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],PerCapitaGDP_2022,2023
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222"
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130"
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380"
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390"
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,"$82,126","$85,111"


In [36]:
# Relabel '2023' as 'PerCapitaGDP_2023'
per_capita_2023_label = {'2023': 'PerCapitaGDP_2023'}
wiki_data = wiki_data.rename(columns=per_capita_2023_label)
wiki_data.head()

Unnamed: 0,State or federal district,GDP_2022,GDP_2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],PerCapitaGDP_2022,PerCapitaGDP_2023
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,"$92,190","$96,222"
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,"$78,456","$81,130"
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,"$104,344","$108,380"
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,"$62,446","$65,390"
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,"$82,126","$85,111"


**Step 6**

In [37]:
"""
The last two columns have dollar signs and the others do not. 
To keep this consistent with the current table and the tables from the other datasets the $ and , will be removed.
This can be completed with str.replace function.
"""
# Strip the currency formatting ('$' and ',') from both per-capita columns.
# regex=False makes each pattern a literal string: as a regular expression
# '$' is an end-of-string anchor, and the default for `regex` differs between
# pandas versions, so the mode is stated explicitly.
for per_capita_col in ('PerCapitaGDP_2022', 'PerCapitaGDP_2023'):
    wiki_data[per_capita_col] = (
        wiki_data[per_capita_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
wiki_data.head()

  wiki_data['PerCapitaGDP_2022'] = wiki_data['PerCapitaGDP_2022'].str.replace('$', '')
  wiki_data['PerCapitaGDP_2023'] = wiki_data['PerCapitaGDP_2023'].str.replace('$', '')


Unnamed: 0,State or federal district,GDP_2022,GDP_2023,Annual GDP changeat current prices 2022(21–22)[1],Annual GDP changeat current prices 2022(21–22)[1].1,Real GDP growthrate (2021–2022)[1],PerCapitaGDP_2022,PerCapitaGDP_2023
1,California *,3598103.0,3755487.0,224862.0,11.6%,7.8%,92190,96222
2,Texas *,2355960.0,2436346.0,304191.0,32.6%,5.6%,78456,81130
3,New York *,2053180.0,2135672.0,151883.0,7.5%,5.0%,104344,108380
4,Florida *,1389070.0,1468015.0,133482.0,10.9%,6.9%,62446,65390
5,Illinois *,1033310.0,1071552.0,87636.0,9.3%,5.0%,82126,85111


**Step 7**

In [38]:
"""
The middle three columns provide information that will not be used in this study and can be omitted.
They can be removed using column indexing.
"""
# Pick the labels at positions 3-5 and remove those columns
unused_columns = wiki_data.columns[3:6]
wiki_data = wiki_data.drop(columns=unused_columns)
wiki_data.head()

Unnamed: 0,State or federal district,GDP_2022,GDP_2023,PerCapitaGDP_2022,PerCapitaGDP_2023
1,California *,3598103.0,3755487.0,92190,96222
2,Texas *,2355960.0,2436346.0,78456,81130
3,New York *,2053180.0,2135672.0,104344,108380
4,Florida *,1389070.0,1468015.0,62446,65390
5,Illinois *,1033310.0,1071552.0,82126,85111


**Step 8**

In [40]:
"""
To keep the data consistent, we will convert all the numeric columns into floats.
This can be done by using astype to convert the strings into floats.
"""
# Cast both per-capita columns to float
for per_capita_col in ('PerCapitaGDP_2022', 'PerCapitaGDP_2023'):
    wiki_data[per_capita_col] = wiki_data[per_capita_col].astype(float)
wiki_data.head()

Unnamed: 0,State or federal district,GDP_2022,GDP_2023,PerCapitaGDP_2022,PerCapitaGDP_2023
1,California *,3598103.0,3755487.0,92190.0,96222.0
2,Texas *,2355960.0,2436346.0,78456.0,81130.0
3,New York *,2053180.0,2135672.0,104344.0,108380.0
4,Florida *,1389070.0,1468015.0,62446.0,65390.0
5,Illinois *,1033310.0,1071552.0,82126.0,85111.0


In [None]:
# Check the dtype of every column
wiki_data.dtypes

**Step 9**

In [41]:
"""
For easier reading, we could add commas to each numeric value. 
This would be done by using applymap on all the columns after the first. The code for this step is currently commented out, so the values stay as plain floats. 
"""
# Adds comma separators to last 4 columns
# NOTE: the two lines below are commented out, so no formatting is applied.
# '{:,}'.format(x) returns a string, so enabling them would replace the
# numeric values with text.
# NOTE(review): `.loc[:, 1:]` uses a positional slice with the label-based
# indexer; if this is re-enabled, `.iloc[:, 1:]` on the left-hand side is
# probably what was meant — confirm.
# wiki_data.loc[:,1:] = wiki_data.iloc[:,1:].applymap(lambda x: '{:,}'.format(x))
# wiki_data.head()

  wiki_data.loc[:,1:] = wiki_data.iloc[:,1:].applymap(lambda x: '{:,}'.format(x))


Unnamed: 0,State or federal district,GDP_2022,GDP_2023,PerCapitaGDP_2022,PerCapitaGDP_2023
1,California *,3598103.0,3755487.0,92190.0,96222.0
2,Texas *,2355960.0,2436346.0,78456.0,81130.0
3,New York *,2053180.0,2135672.0,104344.0,108380.0
4,Florida *,1389070.0,1468015.0,62446.0,65390.0
5,Illinois *,1033310.0,1071552.0,82126.0,85111.0


**Step 10**

In [55]:
"""
The next item that will be changed will be the names of the states. 
Since some of them contain asterisks, it will not be possible to join them to another dataset. 
This can be completed with str.replace function.
"""
# Remove the asterisk marker from the State or federal district column.
# regex=False treats '*' as a literal character (it is a quantifier in a
# regular expression), and str.strip() removes the space that preceded the
# marker (e.g. 'California *' -> 'California') so the names match exactly
# when joining.
# NOTE: run this cell before the Step 11 rename; once the column is called
# 'StateName' this lookup raises KeyError.
state_names = wiki_data['State or federal district']
wiki_data['State or federal district'] = (
    state_names.str.replace('*', '', regex=False).str.strip()
)
wiki_data.head()

KeyError: 'State or federal district'

**Step 11**

In [43]:
"""
The last item that will be changed is the name of the first column. 
When joining, the same column for state will be needed. 
"""
# Relabel the state column as 'StateName'
state_column_mapping = {'State or federal district': 'StateName'}
wiki_data = wiki_data.rename(columns=state_column_mapping)
wiki_data.head()

Unnamed: 0,StateName,GDP_2022,GDP_2023,PerCapitaGDP_2022,PerCapitaGDP_2023
1,California,3598103.0,3755487.0,92190.0,96222.0
2,Texas,2355960.0,2436346.0,78456.0,81130.0
3,New York,2053180.0,2135672.0,104344.0,108380.0
4,Florida,1389070.0,1468015.0,62446.0,65390.0
5,Illinois,1033310.0,1071552.0,82126.0,85111.0


With this data, the ethical implications involve the use of large-scale data in place of smaller-scale data. Since GDP by city can be hard to obtain, state GDP can be used instead. However, state-level figures give only an average and lack the granularity that city GDP can offer. 