In [5]:
# Day 20 of 30 Days of Python

# Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'
import requests

# Find the 10 most repeated words in the romeo_and_juliet.txt
import re
from collections import Counter

def find_most_repeated_words(url, num_words):

    response = requests.get(url)
    text = response.text
    # Tokenize the text into words
    words = re.findall(r'\b\w+\b', text)

    # Count the frequency of each word
    word_counts = Counter(words)

    # Get most common words and their frequencies
    most_repeated_words = word_counts.most_common(num_words)

    return most_repeated_words

# Function call to return most repeated words
result = find_most_repeated_words('https://www.gutenberg.org/files/1513/1513-h/1513-h.htm', 10)
print("Most repeated words in Romeo and Juliet text are:", result)



Most repeated words in Romeo and Juliet text are: [('br', 2893), ('p', 2119), ('class', 1217), ('drama', 874), ('the', 782), ('I', 583), ('a', 580), ('and', 551), ('to', 535), ('of', 479)]


In [22]:
# Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find :
# (i) the min, max, mean, median, standard deviation of cats' weight in metric units.
# (ii) the min, max, mean, median, standard deviation of cats' lifespan in years.
# (iii) Create a frequency table of country and breed of cats

import requests
import numpy as np

# Define the Cat API URL
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Fetch data from the Cat API
response = requests.get(cats_api)
breeds_data = response.json()

# Extract weights from the API response in metric unit
weights = []

for breed in breeds_data:
    weight_str = breed.get('weight', {}).get('metric', '')
    
    # Handle weight ranges (e.g., '3 - 5' kg)
    if '-' in weight_str:
        # Take the average of the range
        weight_values = [float(value.strip()) for value in weight_str.split('-')]
        avg_weight = sum(weight_values) / len(weight_values)
        weights.append(avg_weight)
    else:
        # Convert single values to float
        weights.append(float(weight_str) if weight_str else 0.0)
        
# Calculate statistics
min_weight = min(weights)
max_weight = max(weights)
mean_weight = np.mean(weights).round(2)
median_weight = np.median(weights)
std_dev_weight = np.std(weights).round(2)

# Display the results
print("SUMMARY STATISTICS FOR CAT WEIGHT")
print(f"Min Weight: {min_weight} kg")
print(f"Max Weight: {max_weight} kg")
print(f"Mean Weight: {mean_weight} kg")
print(f"Median Weight: {median_weight} kg")
print(f"Standard Deviation of Weight: {std_dev_weight} kg")

# Extract lifespan from the API response
lifespan = []

for breed in breeds_data:
    lifespan_str = breed.get('life_span','')
    
    # Handle lifespan ranges (e.g., '3 - 5' years)
    if '-' in lifespan_str:
        # Take the average of the range
        lifespan_values = [float(value.strip()) for value in lifespan_str.split('-')]
        avg_lifespan = sum(lifespan_values) / len(lifespan_values)
        lifespan.append(avg_lifespan)
    else:
        # Convert single values to float
        lifespan.append(float(lifespan_str) if lifespan_str else 0.0)
        
# Calculate statistics
min_lifespan = min(lifespan)
max_lifespan = max(lifespan)
mean_lifespan = np.mean(lifespan).round(2)
median_lifespan = np.median(lifespan)
std_dev_lifespan = np.std(lifespan).round(2)

# Display the results
print("\nSUMMARY STATISTICS FOR CAT LIFESPAN")
print(f"Min Lifespan: {min_lifespan} years")
print(f"Max Lifespan: {max_lifespan} years")
print(f"Mean Lifespan: {mean_lifespan} years")
print(f"Median Lifespan: {median_lifespan} years")
print(f"Standard Deviation of Lifespan: {std_dev_lifespan} kg")

# Create a frequency table for country and breed
frequency_table = {}

for breed in breeds_data:
    country = breed.get('origin', 'Unknown')
    breed_name = breed.get('name', 'Unknown')

    # Increment the count in the frequency table
    key = f"{country} - {breed_name}"
    frequency_table[key] = frequency_table.get(key, 0) + 1

# Display the frequency table
print("\nFREQUENCY TABLE")
for key, count in frequency_table.items():
    print(f"{key}: {count} times")

SUMMARY STATISTICS FOR CAT WEIGHT
Min Weight: 3.0 kg
Max Weight: 7.5 kg
Mean Weight: 4.71 kg
Median Weight: 4.5 kg
Standard Deviation of Weight: 1.06 kg

SUMMARY STATISTICS FOR CAT LIFESPAN
Min Lifespan: 10.5 years
Max Lifespan: 19.0 years
Mean Lifespan: 13.75 years
Median Lifespan: 13.5 years
Standard Deviation of Lifespan: 1.57 kg

FREQUENCY TABLE
Egypt - Abyssinian: 1 times
Greece - Aegean: 1 times
United States - American Bobtail: 1 times
United States - American Curl: 1 times
United States - American Shorthair: 1 times
United States - American Wirehair: 1 times
United Arab Emirates - Arabian Mau: 1 times
Australia - Australian Mist: 1 times
United States - Balinese: 1 times
United States - Bambino: 1 times
United States - Bengal: 1 times
France - Birman: 1 times
United States - Bombay: 1 times
United Kingdom - British Longhair: 1 times
United Kingdom - British Shorthair: 1 times
Burma - Burmese: 1 times
United Kingdom - Burmilla: 1 times
United States - California Spangled: 1 time

In [45]:
# Read the countries from https://restcountries.com/v3.1/all and find
# (i) the 10 largest countries
# (ii) the 10 most spoken languages
# (iii) the total number of languages in the countries API

# Define the URL for the Restcountries API
restcountries_api = 'https://restcountries.com/v3.1/all'

# Fetch data from the Restcountries API
response = requests.get(restcountries_api)
countries_data = response.json()

# Extract relevant information (name and area) from the API response
countries_info = [(country.get('name', {}).get('common', 'Unknown'), country.get('area', 0)) for country in countries_data]

# Sort countries based on area in descending order
sorted_countries = sorted(countries_info, key=lambda x: x[1], reverse=True)

# Display the 10 largest countries
print("Top 10 Largest Countries:")
for i, (country_name, area) in enumerate(sorted_countries[:10], start=1):
    print(f"{i}. {country_name}: {area} square kilometers")

# Extract language information from each country
all_languages = [country.get('languages', {}) for country in countries_data]

# Flatten the list of languages
all_language_names = [language for languages in all_languages for language in languages.values()]

# Count the occurrences of each language
language_counts = {language: all_language_names.count(language) for language in set(all_language_names)}

# Sort languages based on the number of occurrences in descending order
sorted_languages = sorted(language_counts.items(), key=lambda x: x[1], reverse=True)

# Display the 10 most spoken languages
print("\nTop 10 Most Spoken Languages:")
for i, (language_name, count) in enumerate(sorted_languages[:10], start=1):
    print(f"{i}. {language_name}: {count} countries")

# Collect all unique languages from the countries
all_languages = set()

for country in countries_data:
    languages = country.get('languages', [])
    all_languages.update(languages)

# Print the total number of unique languages
total_languages = len(all_languages)
print(f"\nTotal number of languages: {total_languages}")

Top 10 Largest Countries:
1. Russia: 17098242.0 square kilometers
2. Antarctica: 14000000.0 square kilometers
3. Canada: 9984670.0 square kilometers
4. China: 9706961.0 square kilometers
5. United States: 9372610.0 square kilometers
6. Brazil: 8515767.0 square kilometers
7. Australia: 7692024.0 square kilometers
8. India: 3287590.0 square kilometers
9. Argentina: 2780400.0 square kilometers
10. Kazakhstan: 2724900.0 square kilometers

Top 10 Most Spoken Languages:
1. English: 91 countries
2. French: 46 countries
3. Arabic: 25 countries
4. Spanish: 24 countries
5. Portuguese: 10 countries
6. Dutch: 7 countries
7. Russian: 7 countries
8. German: 6 countries
9. Chinese: 5 countries
10. Swahili: 4 countries

Total number of languages: 155


In [47]:
import requests
from bs4 import BeautifulSoup

# Define the URL for UCI Machine Learning Repository
uci_url = 'https://archive.ics.uci.edu/ml/datasets'

# Make an HTTP request to the UCI website
response = requests.get(uci_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Print the title of the page
    title = soup.title.string.strip()
    print(f"Title of the page: {title}\n")

    # Print the content of the page (this will be a large output)
    print(soup.get_text())
else:
    print(f"Failed to retrieve the content. Status code: {response.status_code}")


Title of the page: UCI Machine Learning Repository












UCI Machine Learning Repository

Datasets - UCI Machine Learning Repository




       Datasets Contribute Dataset Donate New Link External About Us Who We Are Citation Metadata Contact Information           Login    Filters            Keywords     Data Type      Subject Area      Task      # Features      # Instances      Feature Type      Python    Browse Datasets   Filters  Sort by # Views, desc # Views   Name  # Instances  # Features  Date Donated  Relevance        Expand All Collapse All    Iris A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.
  Classification  Tabular  150 Instances  4 Features      Heart Disease 4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach  Classification  Multivariate  303 Instances  13 Features      Adult Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. 