In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
import plotly.express as px
import plotly.graph_objects as go

In [2]:
nba_finals = pd.read_html('https://en.wikipedia.org/wiki/NBA_Finals')


In [3]:
print(f'Total tables: {len(nba_finals)}')


Total tables: 27


In [4]:
# Narrowing down the tables with match parameter
nba_finals = pd.read_html('https://en.wikipedia.org/wiki/NBA_Finals', match='Finals appearances')
print(f'Total tables: {len(nba_finals)}')


Total tables: 2


In [5]:
df = nba_finals[1]
df.head()

Unnamed: 0,P,Team,W,L,Win,Notes
0,185,Los Angeles Lakers,93,92,0.503,Holds the record for the most games in a Final...
1,129,Boston Celtics,77,52,0.597,Recorded the first ever sweep in the Finals. L...
2,53,Philadelphia 76ers,24,29,0.453,"Includes a record of 9–11 while in Syracuse, a..."
3,59,Golden State Warriors,34,25,0.576,Includes records of 10–6 while in Philadelphia...
4,48,New York Knicks,20,28,0.417,Lost 4–1 in their last Finals appearance in 1999.


In [6]:
hnl = pd.read_html('https://hr.wikipedia.org/wiki/Prva_hrvatska_nogometna_liga', match='Redni broj', header=0, index_col=0, converters={'Redni broj': str})
hnl[0]

Unnamed: 0_level_0,Klub,Prvak,Drugi,Treći,Ukupno
Redni broj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,Dinamo,22,4,2,28
2.0,Hajduk,6,12,6,24
3.0,Rijeka,1,7,5,13
4.0,Zagreb,1,2,3,6
5.0,Lokomotiva Zagreb,0,2,0,2
6.0,Osijek,0,1,7,8
7.0,Slaven Belupo,0,1,1,2
8.0,Inter Zaprešić,0,1,0,1
9.0,Varaždin,0,0,3,3
10.0,Hrvatski Dragovoljac Zagreb,0,0,1,1


In [7]:
hnl[0].info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 12 entries, 1.0 to 12.0
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Klub    12 non-null     object
 1   Prvak   12 non-null     int64 
 2   Drugi   12 non-null     int64 
 3   Treći   12 non-null     int64 
 4   Ukupno  12 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 576.0+ bytes


In [8]:
hnl_df = pd.DataFrame(hnl[0])

In [9]:
hnl_df.shape

(12, 5)

In [13]:
fig = px.bar(hnl_df, x='Klub', y='Ukupno', color='Prvak')
fig.show()


In [72]:
import requests
from bs4 import BeautifulSoup

In [86]:
url = 'https://en.wikipedia.org/wiki/Triple_jump_world_record_progression'

column_names = []
data = []

# Create a handle, page, to handle the contents of the website
page = requests.get(url)

# Store the html in the soup object
soup = BeautifulSoup(page.content, 'html.parser')

# Gets the table html
table = soup.find_all('table')[0]

# Gets the table header
thead = table.find_all('th')

# Puts the header into the column names list. This will be used for the dict keys later
for th in thead:
  column_names.append(th.get_text())

# Gets all the rows of the table
rows = table.find_all('tr')
# Not using first row as it's header
for row in rows[1:]:
  # Creates a list with each index being a different entry in the row
  values = [r for r in row]
  # Gets each values that we care about
  mark = values[1].get_text()
  athlete = values[5].get_text()
  date = values[7].get_text()
  location = values[9].get_text()
  temp_list = [mark, athlete, date, location]
  # Creates a dictionary with keys being the column names and values being temp_list
  data.append(dict(zip(column_names, temp_list)))

triple_jump_df = pd.DataFrame(data)
triple_jump_df



Unnamed: 0,Mark,Wind,Athlete,Date
0,15.52 m (50 ft 11 in),Dan Ahearn (USA),1911-05-3030 May 1911,"New York City, U.S.[1]\n"
1,15.52 m (50 ft 11 in),Nick Winter (AUS),1924-07-1212 July 1924,"Paris, France[1]\n"
2,15.58 m (51 ft 1+1⁄4 in),Mikio Oda (JPN),1931-10-2727 October 1931,"Tokyo, Japan[1]\n"
3,15.72 m (51 ft 6+3⁄4 in),Chuhei Nambu (JPN),1932-08-1414 August 1932,"Los Angeles, U.S.[1]\n"
4,15.78 m (51 ft 9+1⁄4 in),Jack Metcalfe (AUS),1935-12-1414 December 1935,"Sydney, Australia[1]\n"
5,16.00 m (52 ft 5+3⁄4 in),Naoto Tajima (JPN),1936-08-066 August 1936,"Berlin, Germany[1]\n"
6,16.00 m (52 ft 5+3⁄4 in),Adhemar da Silva (BRA),1950-12-033 December 1950,"São Paulo, Brazil[1]\n"
7,16.01 m (52 ft 6+1⁄4 in),Adhemar da Silva (BRA),1951-09-3030 September 1951,"Rio de Janeiro, Brazil[1]\n"
8,16.12 m (52 ft 10+1⁄2 in),Adhemar da Silva (BRA),1952-07-2323 July 1952,"Helsinki, Finland[1]\n"
9,16.22 m (53 ft 2+1⁄2 in),Adhemar da Silva (BRA),1952-07-2323 July 1952,"Helsinki, Finland[1]\n"


In [88]:
# Insert country column using Athlete string slice
triple_jump_df.insert(3, 'Country', triple_jump_df['Athlete'].str.slice(start=-4, stop=-1, step=1))

ValueError: cannot insert Country, already exists

In [91]:
# Slicing strings to get only numbers
triple_jump_df['Mark'] = triple_jump_df['Mark'].str.slice(0,5)

In [99]:
# Renaming some columns
triple_jump_df = triple_jump_df.rename(columns={'Wind': 'Athlete', 'Athlete': 'Date', 'Date': 'Location'})

In [100]:
# Slicing to get date value
triple_jump_df['Date']= triple_jump_df['Date'].str.slice(stop=10)

In [106]:
# Slicing the location value
triple_jump_df['Location'] = triple_jump_df['Location'].str.slice(stop=-4)

In [102]:
# Changing the index to date
triple_jump_df.set_index('Date')

Unnamed: 0_level_0,Mark,Athlete,Country,Location
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1911-05-30,15.52,Dan Ahearn (USA),USA,"New York City, U.S.[1]\n"
1924-07-12,15.52,Nick Winter (AUS),AUS,"Paris, France[1]\n"
1931-10-27,15.58,Mikio Oda (JPN),JPN,"Tokyo, Japan[1]\n"
1932-08-14,15.72,Chuhei Nambu (JPN),JPN,"Los Angeles, U.S.[1]\n"
1935-12-14,15.78,Jack Metcalfe (AUS),AUS,"Sydney, Australia[1]\n"
1936-08-06,16.0,Naoto Tajima (JPN),JPN,"Berlin, Germany[1]\n"
1950-12-03,16.0,Adhemar da Silva (BRA),BRA,"São Paulo, Brazil[1]\n"
1951-09-30,16.01,Adhemar da Silva (BRA),BRA,"Rio de Janeiro, Brazil[1]\n"
1952-07-23,16.12,Adhemar da Silva (BRA),BRA,"Helsinki, Finland[1]\n"
1952-07-23,16.22,Adhemar da Silva (BRA),BRA,"Helsinki, Finland[1]\n"


In [111]:
triple_jump_df['Date'] = pd.to_datetime(triple_jump_df["Date"]).dt.strftime('%Y-%m-%d')

In [112]:
triple_jump_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Mark      27 non-null     object
 1   Athlete   27 non-null     object
 2   Date      27 non-null     object
 3   Country   27 non-null     object
 4   Location  27 non-null     object
dtypes: object(5)
memory usage: 1.2+ KB


In [142]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=triple_jump_df.Date, y=triple_jump_df.Mark, mode='lines+markers', text=triple_jump_df['Mark'] + ' m' + triple_jump_df['Athlete']))
fig.update_layout(title='Triple jump progression', xaxis_title='Year', yaxis_title='Height (m)')

fig.update_layout(
  xaxis=dict(showline=True, showgrid=True, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', 
  tickfont=dict(family='Arial', size=12, color='rgb(82, 82, 82)')),
  autosize=False,
  margin=dict(autoexpand=False, l=40, r=20, t=30),
  plot_bgcolor='rgb(230, 230, 230)')
fig.show()


In [143]:
triple_jump_df.to_csv('triple-jump.csv')