1

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

# Make sure the NLTK resources used below are available locally.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Load the entire document as a single string.
file_path = 'text_doc.txt'
with open(file_path) as file:
    text = file.read()

# Tokenize, then discard every token whose lowercase form is an English
# stop word (the token itself keeps its original casing).
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
filtered_words = []
for word in tokens:
    if word.lower() in stop_words:
        continue
    filtered_words.append(word)

# Reduce each surviving token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
stemmed_words



2

In [None]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Load the processed Cleveland heart-disease table; the file has no header
# row, so the column names are supplied explicitly.
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
           'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
    names=columns,
)

# '?' marks a missing value: turn it into NaN and drop the incomplete rows,
# then make sure the continuous measurements are numeric.
data = data.replace('?', np.nan)
data = data.dropna()
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col])

# Binarize the risk factors: 1 when the measurement exceeds the cut-off, else 0.
cutoffs = {
    'age_high': ('age', 60),
    'bp_high': ('trestbps', 140),
    'chol_high': ('chol', 240),
}
for flag, (source, cutoff) in cutoffs.items():
    data[flag] = (data[source] > cutoff).astype(int)
# Collapse the diagnosis to 0 (value 0) versus 1 (any positive value).
data['target'] = (data['target'] > 0).astype(int)

# Keep only the discrete variables used by the network.
features = ['age_high', 'bp_high', 'chol_high', 'cp', 'exang', 'target']
data = data[features].astype(int)

# Every feature other than 'target' is a direct parent of 'target'.
model = BayesianNetwork(
    [(parent, 'target') for parent in features if parent != 'target']
)

model.fit(data, estimator=MaximumLikelihoodEstimator)
inference = VariableElimination(model)

# Evidence for the query: age <= 60, blood pressure > 140, cholesterol > 240,
# chest pain type 1, no exercise-induced angina.
evidence = dict(age_high=0, bp_high=1, chol_high=1, cp=1, exang=0)

result = inference.query(variables=['target'], evidence=evidence)
# Index 1 of the resulting factor is read as the probability of target == 1.
print(f"Probability of Heart Disease: {result.values[1]:.2%}")

# Show how often each value occurs in every modelled column.
print("\nData Distribution:")
for col in features:
    counts = data[col].value_counts().sort_index()
    print(f"\n{col}:")
    print(counts)

3

In [None]:

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram,linkage

# Load the iris measurements; only the feature matrix is used.
iris = load_iris()
X = iris.data

# Standardize the features before the projection.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
X_scaled[:2]

# Project the standardized data onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
X_pca[:2]

# Dendrogram of the Ward-linkage hierarchy built on the 2-D projection.
Z = linkage(X_pca, method='ward')
plt.figure(figsize=(10, 6))
dendrogram(Z)
plt.show()

# Cut the data into three agglomerative clusters and colour the projection
# by cluster label.
clustering = AgglomerativeClustering(n_clusters=3)
clustering.fit(X_pca)

first_component, second_component = X_pca[:, 0], X_pca[:, 1]
plt.scatter(first_component, second_component, c=clustering.labels_, cmap='rainbow')



4

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import matplotlib.pyplot as plt

def get_links(url, timeout=10):
    """Return the absolute links on the page at *url* that start with *url*.

    Every ``<a href=...>`` on the page is resolved against *url*; only the
    resolved links whose text begins with *url* are kept.

    Args:
        url: Address of the page to fetch; also the prefix used as filter.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        A set of absolute URLs. The set is empty when the request fails
        (connection error, timeout, invalid URL, ...).
    """
    try:
        response = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout
        )
    except requests.RequestException:
        # Best effort: an unreachable site simply contributes no links.
        return set()

    soup = BeautifulSoup(response.content, 'html.parser')
    links = set()
    for anchor in soup.find_all('a', href=True):
        try:
            absolute = urljoin(url, anchor['href'])
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link
            # only, instead of discarding every link found on the page.
            continue
        if absolute.startswith(url):
            links.add(absolute)
    return links

# Websites to analyze
websites = [
    'https://www.ted.com',
    'https://www.goodreads.com',
    'https://www.airbnb.com',
    'https://www.khanacademy.org'
]

# Fetch the link set of each site's front page once.
link_map = {}
for site in websites:
    link_map[site] = get_links(site)

# PageRank set-up: damping factor and a uniform starting score.
d = 0.85
site_count = len(websites)
pagerank = dict.fromkeys(websites, 1 / site_count)

# Twenty rounds of the PageRank update.
for _ in range(20):
    new_rank = {}
    for page in websites:
        # A source passes an equal share of its score to each of its links;
        # only sources whose link set contains `page` contribute.
        incoming_pr = 0
        for src, links in link_map.items():
            if page in links:
                incoming_pr += pagerank[src] / len(links)
        new_rank[page] = (1 - d) / site_count + d * incoming_pr
    pagerank = new_rank

# Rescale the scores so that they add up to 1.
total = sum(pagerank.values())
pagerank = {url: score / total for url, score in pagerank.items()}

# Horizontal bar chart, highest score first; labels are the bare site names.
plt.figure(figsize=(10, 5))
sorted_pr = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)
sites, scores = zip(*sorted_pr)
labels = []
for site in sites:
    short_name = site.replace('https://www.', '')
    short_name = short_name.replace('.com', '').replace('.org', '')
    labels.append(short_name)
plt.barh(labels, scores)
plt.xlabel('PageRank Score')
plt.title('Website PageRank Scores')
plt.tight_layout()
plt.show()

# Text report in the same order as the chart.
for url, score in sorted_pr:
    print(f'{url}: {score:.6f}')

5

In [None]:
import requests
from bs4 import BeautifulSoup

# URL template for the paginated catalogue of the e-commerce site; the `{}`
# placeholder is filled with the page number via `str.format`.
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

def scrape_books(url, timeout=10):
    """Scrape the products listed on a single catalogue page.

    Args:
        url: Address of the catalogue page to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the scrape indefinitely.

    Returns:
        A list with one dict per ``article.product_pod`` element, in page
        order, holding the keys ``'title'``, ``'price'`` (text as shown on
        the page) and ``'link'`` (absolute URL of the product page).

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: Any other request failure, including a
            timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions
    soup = BeautifulSoup(response.content, 'html.parser')

    books = []
    for product in soup.select('article.product_pod'):
        anchor = product.h3.a
        books.append({
            'title': anchor['title'],
            'price': product.select_one('.price_color').get_text(strip=True),
            # Product hrefs are relative to the page they appear on.
            'link': requests.compat.urljoin(url, anchor['href']),
        })
    return books

# Loop through multiple pages
all_books = []
for page_num in range(1, 6):  # Adjust the range based on the number of pages
    url = base_url.format(page_num)
    try:
        books_data = scrape_books(url)
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError and
        # Timeout, so every request failure is reported the same way and the
        # books collected so far are still printed below.
        print(f"Failed to scrape page {page_num}: {e}")
        break
    if not books_data:  # Stop if no books are found on the page (end of pagination)
        break
    all_books.extend(books_data)
    print(f"Page {page_num} scraped successfully.")

# Print all book data collected
for book in all_books:
    print(f"Title: {book['title']}")
    print(f"Price: {book['price']}")
    print(f"Link: {book['link']}")
    print('-' * 40)


b1

In [None]:
# Read the FASTA file once; the raw lines are echoed for inspection and then
# reused to build the sequence.
with open(fasta_file) as file:
    lines = file.readlines()

print(lines)

# Skip the first line (the FASTA header) and join the remaining lines,
# stripped of surrounding whitespace, into one upper-case sequence string.
sequence = ''.join(line.strip() for line in lines[1:]).upper()

total_count = len(sequence)

# AT and GC content as percentages of the full sequence length.
# NOTE: both divisions raise ZeroDivisionError when the sequence is empty.
a_count = sequence.count('A')
t_count = sequence.count('T')
at_percent = ((a_count + t_count) * 100) / total_count

g_count = sequence.count('G')
c_count = sequence.count('C')
gc_percent = ((g_count + c_count) * 100) / total_count

# AT/GC ratio; raises ZeroDivisionError when the sequence has no G or C.
at_percent / gc_percent

# Collect one candidate coding region per start codon: the stretch from the
# 'ATG' up to and including the first stop codon in the same reading frame.
start_codon = 'ATG'
stop_codons = ['TAA', 'TAG', 'TGA']

coding_region = []

start_index = sequence.find(start_codon)

while start_index != -1:

    # Walk codon by codon in the frame set by this start codon. Stopping at
    # the first in-frame stop guarantees the region contains no earlier
    # in-frame stop of any type, and out-of-frame occurrences of a stop
    # triplet can no longer hide a later in-frame one.
    for stop_index in range(start_index + 3, len(sequence) - 2, 3):
        if sequence[stop_index:stop_index + 3] in stop_codons:
            coding_seq = sequence[start_index:stop_index + 3]
            coding_region.append(coding_seq)
            break

    # Move on to the next start codon (overlapping occurrences included).
    start_index = sequence.find(start_codon, start_index + 1)


# Report every position (0-based, overlapping matches included) at which the
# motif occurs in the sequence.
motif = 'TATAA'

search_from = 0
while True:
    start_index = sequence.find(motif, search_from)
    if start_index == -1:
        break
    print(f"Motif '{motif}' found at positions {start_index}")
    # Resume one character past this hit so overlapping matches are found.
    search_from = start_index + 1

b4