# Modeling

In [4]:
import wrangle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

**Get data**

In [8]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os
from pprint import pprint
from datetime import datetime as dt


def get_news_articles():
    url = "https://inshorts.com/en/read"
    # set different categories to perse through
    category = ["business", "sports", "technology", "entertainment"]

    articles = {}
    df_setup = []
    for cat in category:
        # read the url
        res = get(url + "/" + category[0])
        print(res)

        # create a beautiful soup object
        soup_parser = BeautifulSoup(res.content, 'html.parser').body

        soup = soup_parser.find_all("span", itemprop="mainEntityOfPage")
        for i in range(len(soup)):
            link = soup[i]["itemid"]

            article = get(link)
            article_soup = BeautifulSoup(article.content,"html.parser").body

            article_title = article_soup.find('span', itemprop='headline').text
            article_body = article_soup.find('div', itemprop='articleBody').text
            # articles[f"article {cat} {i}"] = [article_title, cat ,link ,article_body]

            article_instance = {
                'title': article_title,
                'content': article_body,
                'category': cat,
            }

            df_setup.append(article_instance)
    return pd.DataFrame(df_setup)

In [18]:
rating = get_news_articles()

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [20]:
train, val, test = wrangle.prep_readmes(rating, "content")
train.shape, val.shape, test.shape

((24, 5), (8, 5), (8, 5))

**variables**

- df.stars = The stars column
- Random state = 95

In [22]:
# Generate random numbers between 1 and 5
random_numbers = np.random.randint(1, 6, size=len(train))
train["stars"] = random_numbers
train.head()

Unnamed: 0,title,content,category,cleaned_content,lemmatized,stars
4,"HC asks govt, Delhi to take action on illegal ...",Delhi High Court directed the Centre and the D...,business,delhi high court directed the centre and the d...,delhi high court directed centre delhi governm...,1
11,Remove duplication of PMJDY accounts: FM to Re...,Finance Minister Nirmala Sitharaman asked Regi...,sports,finance minister nirmala sitharaman asked regi...,finance minister nirmala sitharaman asked regi...,3
33,US GDP growth revised down to 2.1% annual rate...,The US economy grew at 2.1% annual pace in the...,entertainment,the us economy grew at 21 annual pace in the s...,u economy grew 21 annual pace second quarter i...,4
3,US GDP growth revised down to 2.1% annual rate...,The US economy grew at 2.1% annual pace in the...,business,the us economy grew at 21 annual pace in the s...,u economy grew 21 annual pace second quarter i...,3
39,Titagarh Rail bags ₹350-cr Gujarat Metro contr...,"Titagarh Rail Systems, formerly Titagarh Wagon...",entertainment,titagarh rail systems formerly titagarh wagons...,titagarh rail system formerly titagarh wagon r...,3


## Baseline

In [25]:
df = train.copy()

most_common_star = df["stars"].value_counts().idxmax()
most_common_freq = df["stars"].value_counts().max()

baseline_acc = most_common_freq / len(df)

print(f"Most Common Star: {most_common_star}")
print(f"Baseline Accuracy: {baseline_acc:.2f}")

Most Common Star: 2
Baseline Accuracy: 0.33


**Significant Words**

In [None]:
significant_words = ['learning', 'test', 'library', 'create', 'line']

for word in significant_words:
    df[word] = df["lemmatized"].apply(lambda x: x.count(word))
    
for word in significant_words:
    validate[word] = validate["lemmatized"].apply(lambda x: x.count(word))
    
for word in significant_words:
    test[word] = test["lemmatized"].apply(lambda x: x.count(word))
    
X_train = df[significant_words]
y_train = df["lemmatized"]

X_val = validate[significant_words]
y_val = validate["lemmatized"]

X_test = test[significant_words]
y_test = test["lemmatized"]

## KNN

In [None]:
k = 3  
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

y_train_pred = knn.predict(X_train)
y_val_pred = knn.predict(X_val)

train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")

## Decision Tree

In [None]:
tree = DecisionTreeClassifier(random_state=95)
tree.fit(X_train, y_train)

y_train_pred = tree.predict(X_train)
y_val_pred = tree.predict(X_val)

train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")

## Random Forest

In [None]:
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

y_train_pred = forest.predict(X_train)
y_val_pred = forest.predict(X_val)

train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")


## Logistic Regression

In [None]:
logReg = LogisticRegression(random_state=42)
logReg.fit(X_train, y_train)

y_train_pred = logReg.predict(X_train)
y_val_pred = logReg.predict(X_val)

train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")

## Test