# NLP Opinion Mining on Wiki Text (Modelling)

__Importing Required Libraries__

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import re
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt_tab to C:\Users\Shreyansh
[nltk_data]     Padarha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Loading the Segregated Dataset into the Environment
__Pre-Processed in ```pre_processing.ipynb```__

In [2]:
df = pd.read_feather('Data/wikiarticles_seg_data.feather')

# Combining month and year into a datetime column
df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

# Not considering the References category
df = df[df.category != 'References']

In [7]:
article_counts = df['article_name'].value_counts()

# Calculating the percentage split
print("Percentage split of articles")
print((article_counts / article_counts.sum()) * 100)

Percentage split of articles
article_name
Vladimir Putin    81.686076
Xi Jinping        18.313924
Name: count, dtype: float64


__Only taking One Revision/Version Per Month__

In [9]:
# Identifying the maximum file_id for each month-year-article_name combination
max_file_ids = (
    df.groupby(['month', 'year', 'article_name'])['file_id']
    .transform('max')
)

# Filtering rows where file_id is equal to the maximum file_id for its group
filtered_df = df[df['file_id'] == max_file_ids]

In [10]:
article_counts_new = filtered_df['article_name'].value_counts()
print("Percentage split of articles after filtering")
print((article_counts_new / article_counts_new.sum()) * 100)

Percentage split of articles after filtering
article_name
Vladimir Putin    67.814162
Xi Jinping        32.185838
Name: count, dtype: float64


In [11]:
filtered_df.sample(3)

Unnamed: 0,title,text,file_id,month,year,article_name,category,date
206,Policies,"On March , Putin won the Russian presidentia...",547791410,3,2013,Vladimir Putin,policy,2013-03-01
191575,Electoral history,", a monument to victims of Stalinist repressio...",1036447862,7,2021,Vladimir Putin,Assessment,2021-07-01
119442,Putin-related humour,"Putin on Chechen extremists, on September , : ...",61342827,6,2006,Vladimir Putin,Public Imaage,2006-06-01


---

# Bias Assessment

### Implementation using **Dbias** - Detecting Bias and ensuring Fairness in AI solutions

To detect bias and fairness in sentences, the Dbias classification model was employed. This model was trained on the MBIC (Media Bias Identification Corpus) Dataset by the researchers, leveraging the DistilBERT-base-uncased model as its foundation. Training was conducted for 30 epochs, with a batch size of 16, a learning rate of 5e-5, and a maximum sequence length set to 512 tokens. This setup enables the model to effectively assess bias and fairness in text, particularly within news articles, providing insights based on the nuances captured in the training dataset.

**Credit**<br>
Raza, S., Reji, D. J., & Ding, C. (2022). Dbias: Detecting biases and ensuring fairness in news articles.<br>
*International Journal of Data Science and Analytics*, 1-21. Springer. https://doi.org/10.1007/s41060-022-00359-4

**Github Repo Link:** https://github.com/dreji18/Fairness-in-AI/tree/main




In [None]:
from Dbias.bias_classification import *

In [None]:
# returns classification label for a given sentence fragment. (Sample Test)
classifier("Nevertheless, Trump and other Republicans have tarred the protests as havens for terrorists intent on destroying property.")

[{'label': 'Biased', 'score': 0.9938021898269653}]

In [164]:
def chunk_and_classify(corpus, chunk_size=250, overlap=50):
    """
    Splits the corpus into overlapping chunks, classifies each chunk as 'Biased' or 'Non-biased',
    assigns a score based on the classification, and averages the scores.
    
    Parameters:
    - corpus (str): The text corpus to be chunked and classified.
    - chunk_size (int): Number of base tokens per chunk.
    - overlap (int): Number of overlapping tokens between consecutive chunks.
    
    Returns:
    - tuple: (float, str) The average score of all chunks and the final classification.
    """
    tokens = corpus.split()  # Tokenize corpus (use actual tokenizer if needed)
    chunk_scores = []
    
    try:
        i = 0
        while i < len(tokens):
            # Create a chunk with overlap
            chunk = tokens[i:i + chunk_size + overlap]
            chunk_text = " ".join(chunk)
            
            # Classify chunk
            result = classifier(chunk_text)[0]
            score = result['score']
            
            # Adjust score based on label
            if result['label'] == 'Non-biased':
                chunk_scores.append(score)  # Keep score as positive
            elif result['label'] == 'Biased':
                chunk_scores.append(-score)  # Make score negative

            # Move index forward by chunk_size to get the next chunk
            i += chunk_size
            
    except:
        i = 0
        while i < len(tokens):
            # Create a chunk with overlap
            chunk = tokens[i:i + chunk_size-100 + overlap-25]
            chunk_text = " ".join(chunk)
            
            # Classify chunk
            result = classifier(chunk_text)[0]
            score = result['score']
            
            # Adjust score based on label
            if result['label'] == 'Non-biased':
                chunk_scores.append(score)  # Keep score as positive
            elif result['label'] == 'Biased':
                chunk_scores.append(-score)  # Make score negative

            # Move index forward by chunk_size to get the next chunk
            i += chunk_size

    # Calculate the average score of all chunks
    average_score = np.mean(chunk_scores)
    final_classification = "Non-biased" if average_score >= 0 else "Biased"
    
    return average_score, final_classification



In [165]:
filtered_df[['bias_score', 'bias_class']] = filtered_df['text'].apply(
    lambda corpus: pd.Series(chunk_and_classify(corpus, chunk_size=250, overlap=50))
)

In [14]:
filtered_df.sample(3)

Unnamed: 0,title,text,file_id,month,year,article_name,category,date,bias_score,bias_class
199256,Foreign policy,"Putin's domestic policies, particularly early ...",981043171,9,2020,Vladimir Putin,Public Imaage,2020-09-01,-0.637244,Biased
295467,Leadership,in August ]] on September ]] Xi was appointed...,991542613,11,2020,Xi Jinping,Leadership Tenures,2020-11-01,-0.516696,Biased
73404,Third Presidency (2012–present),]] Putin was barred from a third term by the C...,520098174,10,2012,Vladimir Putin,Leadership Tenures,2012-10-01,-0.743318,Biased


---

## NRC Lexical Corpus - Emotion Analysis

The NRC Emotion Lexicon is a collection of English words linked to eight primary emotions—anger, fear, anticipation, trust, surprise, sadness, joy, and disgust—as well as two sentiments, positive and negative. These annotations were crowdsourced through manual contributions.

Despite being nearly **15 years** old, its relevance remains highly regarded and respected among researchers. Numerous applied NLP studies over the past five years continue to incorporate it in their work.

**Credit**<br>
Mohammad, S., & Turney, P. (2010). Emotions evoked by common words and phrases: Using Mechanical Turk to create an emotion lexicon. In Proceedings of the NAACL HLT 2010 Workshop on Computational Approaches to Analysis and Generation of Emotion in Text (pp. 26–34). Association for Computational Linguistics. https://aclanthology.org/W10-0204

In [None]:
class TextEmotionAnalyzer:
    def __init__(self):
        """
        Initializes the TextEmotionAnalyzer class by loading the NRC Emotion Lexicon.
        The lexicon is pivoted to create an easy lookup structure for emotion associations.
        """
        # Load the NRC Emotion Lexicon from a specified path into a DataFrame.
        self.df_emotions = pd.read_csv(
            "Data\\NRC Word-Emotion Association Lexicon\\NRC-Sentiment-Emotion-Lexicons\\NRC-Emotion-Lexicon-v0.92\\NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
            names=["word", "emotion", "association"],  # Assign column names for better clarity.
            sep='\t'  # Specify that the file is tab-separated.
        )
        
        # Pivot the DataFrame to have words as index and emotions as columns, with associations as values.
        self.df_emotion_word = self.df_emotions.pivot(index='word', columns='emotion', values='association').fillna(0)
        
        # Get a list of all emotions from the pivoted DataFrame's columns.
        self.emotions = self.df_emotion_word.columns.tolist()
        
        # Initialize the Snowball Stemmer for English to help with word stemming.
        self.stemmer = SnowballStemmer("english")

    def get_emotion_scores(self, text):
        """
        Analyzes the input text and computes the emotion scores based on the NRC Emotion Lexicon.
        
        Parameters:
            text (str): The input text to analyze for emotions.
        
        Returns:
            dict: A dictionary with emotions as keys and their associated scores as values.
        """
        # Preprocess the input text by removing non-alphanumeric characters (keeping spaces).
        processed_text = re.sub(r"[^a-zA-Z0-9 ]+", '', text)
        
        # Tokenize the processed text into individual words and convert to lowercase.
        tokens = word_tokenize(processed_text.lower())
        
        # Initialize a dictionary to store emotion scores, starting at 0 for each emotion.
        emotion_scores = dict.fromkeys(self.emotions, 0)
        
        # Calculate emotion scores based on the tokens in the input text.
        for word in tokens:
            # Stem the current word to its root form for better matching with the lexicon.
            stemmed_word = self.stemmer.stem(word)
            
            # Check if the stemmed word exists in the emotion word DataFrame index.
            if stemmed_word in self.df_emotion_word.index:
                # Retrieve the emotion score for the stemmed word.
                emotion_score = self.df_emotion_word.loc[stemmed_word]
                
                # Update the total emotion scores for each emotion.
                for emotion in self.emotions:
                    emotion_scores[emotion] += emotion_score[emotion]

        # Normalize the emotion scores by the total number of words if the word count is greater than 0.
        word_count = len(tokens)
        if word_count > 0:
            emotion_scores = {emotion: score / word_count for emotion, score in emotion_scores.items()}
        else:
            # If no words are found, set all emotion scores to 0.
            emotion_scores = {emotion: 0 for emotion in emotion_scores}

        return emotion_scores


In [193]:
# Initialize the TextEmotionAnalyzer
analyzer = TextEmotionAnalyzer()

# Apply emotion analysis to each row's text and add results as new columns
emotion_columns = [f"{emotion}_emotion" for emotion in analyzer.emotions]

# Analyze each text and store the result in a new DataFrame
emotion_scores_df = filtered_df['text'].apply(analyzer.get_emotion_scores).apply(pd.Series)
emotion_scores_df.columns = emotion_columns

# Merge the new emotion scores with the original DataFrame
filtered_df = pd.concat([filtered_df, emotion_scores_df], axis=1)

In [22]:
filtered_df.sample(3)

Unnamed: 0,title,text,file_id,month,year,article_name,category,date,bias_score,bias_class,anger_emotion,anticipation_emotion,disgust_emotion,fear_emotion,joy_emotion,negative_emotion,positive_emotion,sadness_emotion,surprise_emotion,trust_emotion
238857,Family and personal life,President Alexander Lukashenka.]] While Presid...,27017765,10,2005,Vladimir Putin,Personal Details,2005-10-01,-0.833512,Biased,0.005076,0.020305,0.0,0.007614,0.007614,0.017766,0.043147,0.002538,0.007614,0.045685
198532,Public image,in Bishkek.]] Leonid Bershidsky analyzed Putin...,985853861,10,2020,Vladimir Putin,Public Imaage,2020-10-01,-0.710769,Biased,0.015138,0.020036,0.008905,0.020036,0.010686,0.034728,0.043188,0.014693,0.00935,0.033393
186957,Honours,"at their wedding, July ]] On July , Putin ma...",912352343,8,2019,Vladimir Putin,Recognition,2019-08-01,0.159827,Non-biased,0.005897,0.024431,0.000842,0.00674,0.022746,0.014322,0.057287,0.008425,0.012637,0.046335


---

## BERT Based **Political Leaning Identification**

**Reference**
1. Conference Proceedings <br>
Baly, R., Da San Martino, G., Glass, J., & Nakov, P. (2020). We can detect your bias: Predicting the political ideology of news articles. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) (pp. 4982–4991). Association for Computational Linguistics.

2. Article <br>
Bucket Research. (2023). Political bias classification using finetuned BERT model.

3. HuggingFace Repo Link <br>
https://huggingface.co/bucketresearch/politicalBiasBERT

__Month Wise Topic Modelling__ (Career, Policies, etc.)<br>

In [2]:
# Initialize the BERT model and tokenizer for sequence classification
tokenizer = AutoTokenizer.from_pretrained("bucketresearch/politicalBiasBERT")
model = AutoModelForSequenceClassification.from_pretrained("bucketresearch/politicalBiasBERT")

In [28]:
# Ensure test_political_lean returns exactly three values
def test_political_lean(text):
    max_length = 512

    # Tokenize text in chunks and store the results
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest", max_length=max_length)
    chunked_logits = []

    # Process each chunk and aggregate logits
    for i in range(0, len(inputs['input_ids'][0]), max_length):
        chunk_input = {key: val[:, i:i + max_length] for key, val in inputs.items()}
        outputs = model(**chunk_input, labels=torch.tensor([0]))  # Adjust `labels` if needed
        _, logits = outputs[:2]
        chunked_logits.append(logits)

    # Aggregate results (e.g., averaging logits across chunks if it's classification)
    final_logits = torch.mean(torch.stack(chunked_logits), dim=0)
    loss, logits = outputs[:2]
    left, centre, right = logits.softmax(dim=-1)[0].tolist()  # Left, Center, Right
    
    # Return as a tuple
    return left, centre, right

In [29]:
# Applying function and store results in separate columns
filtered_df[['left_lean', 'center_lean', 'right_lean']] = filtered_df['text'].apply(
    lambda corpus: pd.Series(test_political_lean(corpus))
)

In [32]:
filtered_df.sample(3)

Unnamed: 0,title,text,file_id,month,year,article_name,category,date,bias_score,bias_class,...,fear_emotion,joy_emotion,negative_emotion,positive_emotion,sadness_emotion,surprise_emotion,trust_emotion,left_lean,center_lean,right_lean
240988,Quotations,One of Putin's favorite sports is the martial ...,17911161,6,2005,Vladimir Putin,Communications,2005-06-01,0.530233,Non-biased,...,0.013889,0.027778,0.013889,0.0625,0.013889,0.020833,0.041667,0.517628,0.261512,0.22086
85055,Early years and KGB career,30T23::45Z Krawndawg Mistranslation...? wiki...,209344256,4,2008,Vladimir Putin,Carreer Progression,2008-04-01,-0.70501,Biased,...,0.010309,0.012371,0.020619,0.053608,0.008247,0.012371,0.043299,0.020095,0.969789,0.010116
135860,Early years and KGB career,31T23::16Z Paul Pieniezny Deleting text by Ma...,168402573,10,2007,Vladimir Putin,Carreer Progression,2007-10-01,-0.730894,Biased,...,0.0,0.022222,0.0,0.055556,0.0,0.022222,0.033333,0.134062,0.853728,0.012211


### Exporting to Feather For Further Analysis/Visualisation

In [33]:
filtered_df.to_feather('Data/wikiarticles_opinion_mining_results.feather')

---