In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk

nltk.download("stopwords", download_dir="./data_model/")
nltk.data.path.append(os.path.abspath("./data_model/"))

[nltk_data] Downloading package stopwords to ./data_model/...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
text_file_dir="./data/wikispeedia_articles_plaintext/plaintext_articles/"
file_data=[]

for filename in os.listdir(text_file_dir):
    with open(os.path.join(text_file_dir, filename), "r") as file:
        content = file.read()
    
        # Split the content into lines to remove the header
    lines = content.split('\n')

    #Removing the header (line 0)
    if lines:
        lines.pop(0)

    # Making it an array and removing all \n
    content = '\n'.join(lines)
    content = content.replace("\n", " ")


    new_file_content= {"text_content": content, "title": filename}
    
    file_data.append(new_file_content)

project_data = pd.DataFrame(file_data)

project_data

Unnamed: 0,text_content,title
0,Áedán mac Gabráin 2007 Schools Wikipedia Sel...,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt
1,Åland 2007 Schools Wikipedia Selection. Rela...,%C3%85land.txt
2,Édouard Manet 2007 Schools Wikipedia Selecti...,%C3%89douard_Manet.txt
3,Éire 2007 Schools Wikipedia Selection. Relat...,%C3%89ire.txt
4,Óengus I of the Picts 2007 Schools Wikipedia...,%C3%93engus_I_of_the_Picts.txt
...,...,...
4599,Zirconium 2007 Schools Wikipedia Selection. ...,Zirconium.txt
4600,Zoroaster 2007 Schools Wikipedia Selection. ...,Zoroaster.txt
4601,Zuid-Gelders 2007 Schools Wikipedia Selectio...,Zuid-Gelders.txt
4602,Zulu 2007 Schools Wikipedia Selection. Relat...,Zulu.txt


In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

model = AutoModelForSequenceClassification.from_pretrained("bucketresearch/politicalBiasBERT")

In [4]:
_stopwords = nltk.corpus.stopwords.words("english")

In [5]:
def clean(text):  #Removing unecessary punctuation and all lower case.
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = text.replace('„','')
    text = text.replace('“','')
    text = text.replace('"','')
    text = text.replace('\'','')
    text = text.replace('-','')
    text = text.lower()
    return text

def remove_stopwords(content):
    for word in _stopwords:
        content = content.replace(' '+word+' ',' ')
    return content

In [6]:
project_data['text_content'] = project_data['text_content'].apply(clean)
project_data['text_content'] = project_data['text_content'].apply(remove_stopwords)

In [7]:
pred_class=[]
for index in range(len(project_data)):
    row=project_data.iloc[index]
    text=row["text_content"]
    if len(text)>512:
        text=text[0:511]
    
    
    inputs = tokenizer(text, return_tensors="pt")
    
    outputs = model(**inputs)
    logits=outputs.logits
    probabilities = torch.softmax(logits, dim=-1)

    # [0] -> left 
    # [1] -> center
    # [2] -> right
    #probabilities=logits.softmax(dim=-1)[0].tolist() 
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    print("%d/%d, pred_class=%d"%(index+1,len(project_data),int(predicted_class)))
    pred_class.append(int(predicted_class))

1/4604, pred_class=0
2/4604, pred_class=0
3/4604, pred_class=0
4/4604, pred_class=0
5/4604, pred_class=0
6/4604, pred_class=0
7/4604, pred_class=0
8/4604, pred_class=0
9/4604, pred_class=0
10/4604, pred_class=0
11/4604, pred_class=0
12/4604, pred_class=0
13/4604, pred_class=0
14/4604, pred_class=0
15/4604, pred_class=0
16/4604, pred_class=0
17/4604, pred_class=0
18/4604, pred_class=0
19/4604, pred_class=0
20/4604, pred_class=0
21/4604, pred_class=0
22/4604, pred_class=0
23/4604, pred_class=0
24/4604, pred_class=0
25/4604, pred_class=0
26/4604, pred_class=0
27/4604, pred_class=0
28/4604, pred_class=0
29/4604, pred_class=0
30/4604, pred_class=0
31/4604, pred_class=0
32/4604, pred_class=0
33/4604, pred_class=0
34/4604, pred_class=0
35/4604, pred_class=0
36/4604, pred_class=0
37/4604, pred_class=0
38/4604, pred_class=0
39/4604, pred_class=0
40/4604, pred_class=0
41/4604, pred_class=0
42/4604, pred_class=0
43/4604, pred_class=0
44/4604, pred_class=0
45/4604, pred_class=0
46/4604, pred_class

In [8]:
project_data["Politic_Part"]=pred_class
 

In [9]:
vis_pred=project_data["Politic_Part"].value_counts()
vis_pred # Nombre d'articles classifiés


Politic_Part
0    4477
2      67
1      60
Name: count, dtype: int64

In [24]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
filt=project_data["title"][project_data["Politic_Part"]==2]
print(filt.head)

<bound method NDFrame.head of 64                                             Aachen.txt
112                                      Adolf_Hitler.txt
243                                    Andrew_Jackson.txt
244                                    Andrew_Johnson.txt
345        Arthur_Wellesley%2C_1st_Duke_of_Wellington.txt
455                                      Barack_Obama.txt
547                                  Benito_Mussolini.txt
549     Benjamin_Disraeli%2C_1st_Earl_of_Beaconsfield.txt
551                                 Benjamin_Harrison.txt
757                                   Calvin_Coolidge.txt
875                                 Chester_A._Arthur.txt
939                                    Climate_change.txt
1146                                    David_Cameron.txt
1230                                         Donation.txt
1244                                          Dresden.txt
1268                             Dwight_D._Eisenhower.txt
1472                                      

In [11]:
project_data["title"][project_data["Politic_Part"]==1]

83                                     Able_Archer_83.txt
141                                            Airbus.txt
256                                Anglican_Communion.txt
327                                      Ariel_Sharon.txt
376                                  ATLAS_experiment.txt
451                                          Banknote.txt
572                                          Big_Bang.txt
750                                           Calgary.txt
811     Carolingian_Gospel_Book_%28British_Library%2C_...
1018                          Constitutional_monarchy.txt
1025                           Control_car_%28rail%29.txt
1039                                           Cornea.txt
1062                                           Craton.txt
1063                   Creation-evolution_controversy.txt
1105                                    Cyclone_Percy.txt
1106                                   Cyclone_Rosita.txt
1139                                      Darth_Vader.txt
1154          

In [25]:
project_data[project_data["title"]=="Joseph_Stalin.txt"]

Unnamed: 0,text_content,title,Politic_Part
2299,joseph stalin 2007 schools wikipedia selectio...,Joseph_Stalin.txt,2
