In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

Con el siguiente código, extraemos información de internet relacionada con el índice S&P500 Dónde se exploran los sectores que lo componen y su información en general.

**List of S&P 500 companies**

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
page = requests.get(url)
# Analizar el contenido HTML de la respuesta con BeautifulSoup
mysoup = BeautifulSoup(page.text, 'html.parser')# Ingresamos la URL a la que se va a hacer web scraping

# Hacemos la solicitud a la URL y obtener la respuesta


In [5]:
# Buscar la tabla que contiene los datos que necesitas
table = mysoup.find('table', {'class': 'wikitable sortable'})

In [6]:
# Obtener los datos de la tabla
rows = table.find_all('tr')
data = []
for row in rows:
    cells = row.find_all('td')
    if len(cells) > 0:
        data.append({'Security': cells[0].text.strip(), 'Symbol': cells[1].text.strip(), 'GICS Sector': cells[2].text.strip(), 'GICS Sub-Industry': cells[3].text.strip(), 'Headquarters Location': cells[4].text.strip(), 'Date added': cells[5].text.strip(), 'CIK': cells[6].text.strip(), 'Founded': cells[7].text.strip() })

# Crear un DataFrame con los datos obtenidos
df = pd.DataFrame(data)


In [7]:
df

Unnamed: 0,Security,Symbol,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,0000066740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,0000091142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,0000001800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,0001551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,0001467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,0001041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,0000877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,0001136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,0000109380,1873


In [8]:
df['Date added'] = pd.to_datetime(df['Date added'], errors='coerce')
df['CIK'] = df['CIK'].astype(int)

In [9]:
count_df = df['GICS Sector'].value_counts().reset_index()
count_df

Unnamed: 0,index,GICS Sector
0,Industrials,73
1,Financials,73
2,Information Technology,66
3,Health Care,65
4,Consumer Discretionary,53
5,Consumer Staples,37
6,Utilities,30
7,Real Estate,30
8,Materials,29
9,Communication Services,24


In [10]:
count_df = df['GICS Sub-Industry'].value_counts().reset_index()
count_df

Unnamed: 0,index,GICS Sub-Industry
0,Health Care Equipment,19
1,Semiconductors,15
2,Industrial Machinery & Supplies & Components,14
3,Application Software,14
4,Electric Utilities,13
...,...,...
121,Consumer Electronics,1
122,Health Care Technology,1
123,Leisure Products,1
124,Real Estate Services,1


In [11]:
df_filtrado = df.loc[df['GICS Sector'] == "Information Technology"]

In [12]:
df_filtrado

Unnamed: 0,Security,Symbol,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
7,ADBE,Adobe Inc.,Information Technology,Application Software,"San Jose, California",1997-05-05,796343,1982
14,AKAM,Akamai,Information Technology,Internet Services & Infrastructure,"Cambridge, Massachusetts",2007-07-12,1086222,1998
27,AMD,AMD,Information Technology,Semiconductors,"Santa Clara, California",2017-03-20,2488,1969
39,APH,Amphenol,Information Technology,Electronic Components,"Wallingford, Connecticut",2008-09-30,820313,1932
...,...,...,...,...,...,...,...,...
454,TRMB,Trimble Inc.,Information Technology,Electronic Equipment & Instruments,"Westminster, Colorado",2021-01-21,864749,1978
456,TYL,Tyler Technologies,Information Technology,Application Software,"Plano, Texas",2020-06-22,860731,1966
469,VRSN,Verisign,Information Technology,Internet Services & Infrastructure,"Dulles, Virginia",2006-02-01,1014473,1995
488,WDC,Western Digital,Information Technology,"Technology Hardware, Storage & Peripherals","San Jose, California",2009-07-01,106040,1970


In [13]:
df_filtrado = df_filtrado['GICS Sub-Industry'].value_counts().reset_index()
df_filtrado

Unnamed: 0,index,GICS Sub-Industry
0,Semiconductors,15
1,Application Software,14
2,IT Consulting & Other Services,6
3,"Technology Hardware, Storage & Peripherals",6
4,Semiconductor Materials & Equipment,5
5,Communications Equipment,5
6,Electronic Equipment & Instruments,5
7,Electronic Components,3
8,Systems Software,3
9,Internet Services & Infrastructure,2


In [14]:
df_filtrado1 = df.loc[df['GICS Sub-Industry'] == "IT Consulting & Other Services"]
df_filtrado1

Unnamed: 0,Security,Symbol,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
119,CTSH,Cognizant,Information Technology,IT Consulting & Other Services,"Teaneck, New Jersey",2006-11-17,1058290,1994
162,DXC,DXC Technology,Information Technology,IT Consulting & Other Services,"Tysons Corner, Virginia",2017-04-04,1688568,2017
176,EPAM,EPAM Systems,Information Technology,IT Consulting & Other Services,"Newtown, Pennsylvania",2021-12-14,1352010,1993
214,IT,Gartner,Information Technology,IT Consulting & Other Services,"Stamford, Connecticut",2017-04-05,749251,1979
247,IBM,IBM,Information Technology,IT Consulting & Other Services,"Armonk, New York",1957-03-04,51143,1911


In [15]:
df_filtrado.to_excel('S&P_Information Technology.xlsx', index=False)