In [None]:
import numpy as np

PRE-PROCESSING OF THE DATAFRAME

In [None]:
# Restrict the frame to the grouping key ('PostID') plus the four feature scores to be averaged
selected_columns = df[['PostID','Toxicity', 'Rationality', 'Mutual Respect', 'Emotion']]
# Per-post mean of every feature; reset_index() turns 'PostID' from the group index back into a column.
# new_df and average_scores are two names for the same DataFrame object (no copy is made).
new_df = average_scores = selected_columns.groupby('PostID').mean().reset_index()

In [None]:
# Shannon entropy of a feature column, used below to derive the feature weights
def entropy(column):
    """Return the Shannon entropy, in bits, of the empirical distribution of ``column``.

    Values are grouped by exact equality (``np.unique``), so a column whose values are
    all distinct yields ``log2(len(column))`` and a constant column yields 0.
    """
    value_counts = np.unique(column, return_counts=True)[1]
    # Relative frequency of each distinct value
    relative_freq = value_counts / value_counts.sum()
    return -np.sum(relative_freq * np.log2(relative_freq))

# Calculate the entropy for each feature.
# 'Classification_Score' is excluded alongside 'PostID': it is added to new_df by a later
# cell, and without this exclusion re-running this cell would treat the previously
# computed score as one more input feature.
non_feature_columns = {'PostID', 'Classification_Score'}
feature_entropies = {
    col: entropy(new_df[col])
    for col in new_df.columns
    if col not in non_feature_columns
}

# Invert the entropies to get weights (features with lower entropy get higher weight)
total_entropy = sum(feature_entropies.values())
if total_entropy > 0:
    weights = {
        col: (total_entropy - feature_entropy) / total_entropy
        for col, feature_entropy in feature_entropies.items()
    }
else:
    # Every feature has zero entropy (e.g. a single post, or all columns constant):
    # dividing by total_entropy would produce NaN weights, so treat all features equally.
    weights = {col: 1.0 for col in feature_entropies}

# Normalize the weights so they sum up to 1
weight_sum = sum(weights.values())
if weight_sum > 0:
    normalized_weights = {col: weight / weight_sum for col, weight in weights.items()}
else:
    # Only reachable with a single feature, whose inverted weight is exactly 0;
    # fall back to equal weights instead of dividing by zero.
    normalized_weights = {col: 1.0 / len(weights) for col in weights}

# Weighted combination of one row's feature values
def calculate_score(row, weights):
    """Return the weighted sum of the features of ``row``.

    ``weights`` maps a feature name to its weight. Features named in ``weights`` that
    are not present in ``row.index`` are skipped. With no matching feature the result is 0.
    """
    total = 0
    for feature in weights:
        if feature not in row.index:
            continue
        total = total + row[feature] * weights[feature]
    return total

# Score every post: calculate_score is called once per row with the normalized weights
scores = new_df.apply(calculate_score, axis=1, args=(normalized_weights,))


In [None]:
# Attach the weighted scores as a new column. This mutates new_df in place, and
# average_scores refers to the same object, so it gains the column as well.
new_df['Classification_Score'] = scores

In [None]:
# Preview the first 10 rows of the scored frame
new_df.head(10)

In [None]:
# Save the per-post scores to a CSV file (relative path); index=False leaves out the row index
new_df.to_csv('conservative_scores.csv', index=False)

We aim to create an innovative method for feature weight allocation based on the information entropy of each feature within the dataset. Our entropy calculation provides a quantitative measure of randomness or unpredictability in the data. The entropy of each extracted feature is a measure of its inherent information content and variability.

A higher entropy value corresponds to a feature with greater dispersion or variability in its values, suggesting a more significant potential for contributing to the predictive power of a model that predicts whether a post is combative or deliberative.

In our weighting scheme, however, we inverted the entropy values to prioritize features with lower entropy—and hence presumed stability or consistency—and allocated higher weights to them. The underlying assumption is that features with lower variability (yet non-zero information content) may offer more reliable signals for prediction.

After inversion, the weights were normalized across all features so that they sum to one, maintaining a probabilistic interpretation. These normalized weights were then used in a weighted sum of the feature values to compute a composite score. This score integrates the distinct contributions of each feature of a post, adjusted for its entropy-derived importance, providing a nuanced approach to feature integration for predicting where the post sits on the deliberative-combative scale.


In [None]:
# to make it easier, I put all the code into a function so you can run it for each subreddit directory
def process_subreddit_directory(folder_path):
    """Score every post of one subreddit dump on the deliberative-combative scale.

    Reads every ``*.json`` file in ``folder_path`` (one post per file), flattens the post
    title and its comment tree into rows, scores each row for mutual respect, toxicity,
    rationality and emotion, averages those scores per post, and combines the averages
    into a single entropy-weighted score.

    Parameters
    ----------
    folder_path : str
        Directory containing the post files. Each file is read for a 'title' key and an
        optional 'comments' list; each comment is read for 'author', 'body' and an
        optional nested 'replies' list.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns 'PostID', 'Toxicity', 'Rationality',
        'Mutual Respect', 'Emotion' (per-post means) and 'Type' (the weighted score).
    """

    # Function to parse comments and replies with CommentIDs
    def parse_comments(comments, post_id, parent_id, depth, result, parent_counter=0):
        """Append one row per comment to ``result``, walking the reply tree depth-first.

        Top-level comments get the IDs ``<post_id>_1``, ``<post_id>_2``, ...; numbering
        starts at 1 because ``<post_id>_0`` is the CommentID of the post itself (and the
        ParentID of every top-level comment). Replies get ``<parent CommentID>_1``,
        ``<parent CommentID>_2``, ...
        """
        for comment_number, comment in enumerate(comments, start=1):
            if depth == 1:  # Directly under post
                comment_id = f"{post_id}_{parent_counter + comment_number}"
            else:  # Nested comment/reply
                comment_id = f"{parent_id}_{comment_number}"

            result.append({
                'PostID': post_id,
                'CommentID': comment_id,
                'ParentID': parent_id,
                'Author': comment['author'],
                'Text': comment['body'],
                'Depth': depth
            })
            if 'replies' in comment and comment['replies']:
                # Replies are numbered relative to their own parent, so the counter restarts at 0
                parse_comments(comment['replies'], post_id, comment_id, depth + 1, result, 0)

    # Function to process files in each subreddit directory
    def process_reddit_posts(folder_path):
        """Load every ``*.json`` post in ``folder_path`` into one flat DataFrame.

        The result has one row for each post title (Depth 0) and one row per comment.
        """
        data = []

        # glob returns files in arbitrary order; sorting keeps the post_N numbering
        # reproducible between runs and machines.
        json_paths = sorted(glob.glob(os.path.join(folder_path, '*.json')))

        for post_counter, file_path in enumerate(json_paths, start=1):
            # Read as UTF-8 explicitly instead of relying on the platform default encoding
            with open(file_path, 'r', encoding='utf-8') as file:
                post_data = json.load(file)

            post_id = f"post_{post_counter}"  # e.g., post_1

            # Initially, add the post itself with a basic CommentID and no ParentID
            data.append({
                'PostID': post_id,
                'CommentID': f"{post_id}_0",
                'ParentID': None,
                'Author': 'NONE',  # Placeholder
                'Text': post_data['title'],
                'Depth': 0
            })

            # Process comments, starting with depth=1
            if 'comments' in post_data:
                parse_comments(post_data['comments'], post_id, f"{post_id}_0", 1, data)

        return pd.DataFrame(data)

    # Function to modify mutual respect score
    def modify_mutual_respect(row):
        """Turn the politeness prediction stored in ``row['Mutual Respect']`` into one number.

        The cell holds a dict with a 'label' and a 'score'. POLITE keeps the score,
        SOMEWHAT_POLITE scales it by 0.7, NEUTRAL scales it by 0.35, and any other label
        (presumably the impolite class; confirm against the model card) maps to 1 - score.
        """
        label = row['Mutual Respect']['label']
        score = row['Mutual Respect']['score']

        if label == 'NEUTRAL':
            return score * 0.35
        elif label == 'POLITE':
            return score
        elif label == 'SOMEWHAT_POLITE':
            return score * 0.7
        else:
            return 1 - score

    # Function to calculate entropy
    def entropy(column):
        """Return the Shannon entropy, in bits, of the distinct values in ``column``."""
        _, counts = np.unique(column, return_counts=True)
        probabilities = counts / counts.sum()
        ent = -np.sum(probabilities * np.log2(probabilities))
        return ent

    # Function to calculate the scores using the normalized weights
    def calculate_score(row, weights):
        """Return the weighted sum of the features in ``weights`` that are present in ``row``."""
        weighted_sum = sum(row[col] * weights[col] for col in weights if col in row.index)
        return weighted_sum

    # Load all the models and tokenizer
    respect_tokenizer = AutoTokenizer.from_pretrained("NOVA-vision-language/polite_bert")
    respect_model = AutoModelForSequenceClassification.from_pretrained("NOVA-vision-language/polite_bert")
    respect_classifier = pipeline('text-classification', model=respect_model, tokenizer=respect_tokenizer)
    detoxify_model = Detoxify('original')
    sentiment_classifier = TextClassifier.load('en-sentiment')

    # Adding the emotion score of the text
    def predict_emotion(text):
        """Return a signed sentiment score for ``text``: the confidence, negated if NEGATIVE.

        Uses the sentiment model loaded once above; loading it inside this function
        would repeat the load for every single text.
        """
        sentence = Sentence(text)
        sentiment_classifier.predict(sentence)
        top_label = sentence.labels[0]
        # if the value is NEGATIVE then we take - value if the value is POSITIVE then we just keep that value
        if top_label.value == 'NEGATIVE':
            return top_label.score * -1
        else:
            return top_label.score

    # Process subreddit posts
    df = process_reddit_posts(folder_path)

    # Calculate mutual respect score
    # NOTE(review): texts longer than the model's maximum input length may make the
    # pipeline raise; consider enabling truncation - confirm for the installed version.
    df['Mutual Respect'] = respect_classifier(df['Text'].tolist())
    df['Mutual Respect'] = df.apply(modify_mutual_respect, axis=1)

    # Calculate toxicity score (square root of the Detoxify 'toxicity' value)
    text_to_toxicity = df['Text'].astype(str).tolist()
    detoxify_scores = detoxify_model.predict(text_to_toxicity)['toxicity']
    df['Toxicity'] = np.sqrt(detoxify_scores)

    # Calculate rationality score: the confidence of being 'Non-biased'
    # (separate names so the politeness tokenizer/pipeline above are not silently rebound)
    bias_tokenizer = AutoTokenizer.from_pretrained("d4data/bias-detection-model")
    bias_model = TFAutoModelForSequenceClassification.from_pretrained("d4data/bias-detection-model")
    bias_classifier = pipeline('text-classification', model=bias_model, tokenizer=bias_tokenizer)
    df['Rationality'] = bias_classifier(df['Text'].tolist())
    df['Rationality'] = df['Rationality'].apply(lambda x: x['score'] if x['label'] == 'Non-biased' else 1 - x['score'])

    emotion = df['Text'].apply(predict_emotion)
    scaled_emotion = (emotion+1)/2 # scale the emotion scores to be between 0 and 1
    df['Emotion'] = scaled_emotion

    # Select only the columns you want to include in the mean calculation along with 'PostID' for grouping
    selected_columns = df[['PostID', 'Toxicity', 'Rationality', 'Mutual Respect', 'Emotion']]

    # Group by 'PostID', average the numeric columns, and make 'PostID' a column again
    average_scores = selected_columns.groupby('PostID').mean().reset_index()

    # Calculate the entropy for each feature
    feature_entropies = {col: entropy(average_scores[col]) for col in average_scores.columns if col != 'PostID'}

    # Invert the entropies to get weights (features with lower entropy get higher weight)
    total_entropy = sum(feature_entropies.values())
    if total_entropy > 0:
        weights = {
            col: (total_entropy - feature_entropy) / total_entropy
            for col, feature_entropy in feature_entropies.items()
        }
    else:
        # Every feature has zero entropy (e.g. the directory holds a single post):
        # dividing by total_entropy would give NaN weights, so weight all features equally.
        weights = {col: 1.0 for col in feature_entropies}

    # Normalize the weights so they sum up to 1
    weight_sum = sum(weights.values())
    if weight_sum > 0:
        normalized_weights = {col: weight / weight_sum for col, weight in weights.items()}
    else:
        # Only reachable with a single feature whose inverted weight is exactly 0
        normalized_weights = {col: 1.0 / len(weights) for col in weights}

    # Calculate the scores using the normalized weights
    scores = average_scores.apply(lambda row: calculate_score(row, normalized_weights), axis=1)

    # Add the scores to the dataframe
    average_scores['Type'] = scores

    return average_scores


In [None]:
# Directory holding the downloaded posts of each subreddit
subreddit1_path = './Scripts/Political/Conservative_data'
subreddit2_path = './Scripts/Political/AskALiberal_data'
subreddit3_path = './Scripts/Political/NeutralPolitics_data'

# Score every post of each subreddit (one row per post)
subreddit1_scores = process_subreddit_directory(subreddit1_path)
subreddit2_scores = process_subreddit_directory(subreddit2_path)
subreddit3_scores = process_subreddit_directory(subreddit3_path)

# Create a dataframe with all the scores appended.
# Post IDs restart at post_1 in every subreddit, so prefix each ID with the subreddit it
# came from to differentiate between the posts from different subreddits.
subreddit1_scores['PostID'] = 'Conservative_' + subreddit1_scores['PostID']
subreddit2_scores['PostID'] = 'Liberal_' + subreddit2_scores['PostID']
subreddit3_scores['PostID'] = 'Neutral_' + subreddit3_scores['PostID']

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# subreddit's own 0-based row labels would be repeated in the result.
all_scores = pd.concat([subreddit1_scores, subreddit2_scores, subreddit3_scores], ignore_index=True)