In [22]:
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python3 -m spacy download pl_core_news_sm

In [None]:
import pandas as pd
import requests
from time import sleep
import html
import unicodedata
from bs4 import BeautifulSoup
import spacy
import pl_core_news_sm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 

In [3]:
genre = 'polityka-obronna'
page_number = 2
http_address = f'https://defence24.pl/zobacz-wiecej-polityka-obronna?page={page_number}'

response = requests.get(http_address)
content = response.content.decode('utf-8')
content = html.unescape(content)
content = unicodedata.normalize('NFKD',content)

In [5]:
soup = BeautifulSoup(content, 'html.parser')
a = soup.find_all('a', {'class': 'tile__title stretched-link'})
a[0].get_text().strip()

'Warszawa proponuje nowe podejście do broni atomowej. Berlin zaakceptuje? [KOMENTARZ]'

In [7]:
def get_data(genre, number_of_pages):
    
    labels = []
    all_titles = []


    for page_number in range(1, number_of_pages): # iteracja po numerze strony
            print(f'Getting data about {genre} from page number {page_number}')
            
            http_address = f'https://defence24.pl/{genre}?page={page_number}' # genre - zakładka film
            response = requests.get(http_address)
            content = response.content.decode("utf-8")
            content = html.unescape(content)
            content = unicodedata.normalize('NFKD',content)

            soup = BeautifulSoup(content, 'html.parser')
            a = soup.find_all('a', {'class': 'tile__title stretched-link'})
            titles = [x.get_text().strip() for x in a]

            all_titles.extend(titles)

            
            labels.extend([genre.replace("zobacz-wiecej-","")] * len(titles))
            
            sleep(5)
        
    return {'title': all_titles, 'label':labels}

In [23]:
    # websites_catalogs = ["zobacz-wiecej-polityka-obronna","zobacz-wiecej-przemysl","zobacz-wiecej-geopolityka","zobacz-wiecej-sily-zbrojne"]
    # lista = []
    # for catalogs in websites_catalogs:
    #  x = get_data(catalogs,10)
    #  lista.append(x)

In [9]:
df1 = pd.DataFrame(lista[0])
df2 = pd.DataFrame(lista[1])
df3 = pd.DataFrame(lista[2])
df4 = pd.DataFrame(lista[3])

In [10]:
nlp = spacy.load('pl_core_news_sm') 

In [11]:
df_all = pd.concat([df1,df2,df3,df4]).sample(frac=1).reset_index(drop=True)

title_list_lem = []
for title in df_all["title"]:
  doc = nlp(title)
  lista1 = []
  for token in doc:
    lista1.append(token.lemma_)
  title_list_lem.append(" ".join(lista1))

In [12]:
title_list_stop = []
for title in df_all["title"]:
  doc = nlp(title)
  lista1 = []
  for token in doc:
    if token.is_stop == False:
      lista1.append(token.lemma_)
  title_list_stop.append(" ".join(lista1))

In [13]:
df_all["token_stop"] = title_list_stop
df_all["token_lem"] = title_list_lem

In [14]:
X1 = df_all['title']
y = df_all['label'] 

X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=0, stratify=y)

X2 = df_all['token_lem']
y = df_all['label'] 

X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state=0, stratify=y)

X3 = df_all['token_stop']
y = df_all['label'] 

X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.3, random_state=0, stratify=y)

In [15]:
tfidf1 = TfidfVectorizer()
tfidf1.fit(X1_train) 

tfidf2 = TfidfVectorizer()
tfidf2.fit(X2_train)

tfidf3 = TfidfVectorizer()
tfidf3.fit(X3_train) 

In [16]:
X1_train_tf = tfidf1.transform(X1_train)
X1_test_tf = tfidf1.transform(X1_test)

X2_train_tf = tfidf2.transform(X2_train)
X2_test_tf = tfidf2.transform(X2_test)

X3_train_tf = tfidf3.transform(X3_train)
X3_test_tf = tfidf3.transform(X3_test)

In [17]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression()
lr1.fit(X1_train_tf, y_train)


lr2 = LogisticRegression()
lr2.fit(X2_train_tf, y_train)


lr3 = LogisticRegression()
lr3.fit(X3_train_tf, y_train)

In [18]:
y1_pred = lr1.predict(X1_test_tf)

y2_pred = lr2.predict(X2_test_tf)

y3_pred = lr3.predict(X3_test_tf)

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y1_pred))

print(classification_report(y_test, y2_pred))

print(classification_report(y_test, y3_pred))

                  precision    recall  f1-score   support

     geopolityka       0.41      0.44      0.42        25
polityka-obronna       0.50      0.48      0.49        25
        przemysl       0.44      0.50      0.47        24
    sily-zbrojne       0.25      0.21      0.23        24

        accuracy                           0.41        98
       macro avg       0.40      0.41      0.40        98
    weighted avg       0.40      0.41      0.40        98

                  precision    recall  f1-score   support

     geopolityka       0.62      0.52      0.57        25
polityka-obronna       0.59      0.52      0.55        25
        przemysl       0.45      0.54      0.49        24
    sily-zbrojne       0.27      0.29      0.28        24

        accuracy                           0.47        98
       macro avg       0.48      0.47      0.47        98
    weighted avg       0.48      0.47      0.47        98

                  precision    recall  f1-score   support

     ge

In [20]:
sentence = ["czolg"]
sentence_trans = tfidf1.transform(sentence)
lr1.predict(sentence_trans)


array(['sily-zbrojne'], dtype=object)

In [21]:
print(f"The word/sentence: {sentence} matches to catalog called {lr1.predict(sentence_trans)}")

The word/sentence: ['czolg'] matches to catalog called ['sily-zbrojne']
