## Notebook: Rule based analysis

This notebook explores some rule-based and syntactic techniques. We'll use these methods to uncover patterns in complexity, theory usage, and framing in the academic literature.

In [1]:
import os
import pandas as pd

In [2]:
df = pd.read_csv("../data/merged_scopus.csv")

valid_types = ['Article', 'Book', 'Book chapter', 'Review']
df = df[df['Document Type'].isin(valid_types)]

df['text'] = df['Title'].fillna('') + '. ' + df['Abstract'].fillna('')

### Examine sentence complexity and readability

In [3]:
import textstat

df['readability'] = df['text'].apply(textstat.flesch_reading_ease)
df['avg_sent_len'] = df['text'].apply(lambda x: sum(len(sent.split()) for sent in x.split('.') if sent.strip()) / max(1, x.count('.')))
df[['readability', 'avg_sent_len']].describe()

Unnamed: 0,readability,avg_sent_len
count,5039.0,5039.0
mean,16.000938,20.575191
std,12.514406,4.504787
min,-44.856591,5.958904
25%,7.892368,17.625
50%,16.793442,20.285714
75%,24.787231,23.2
max,62.816136,54.125


### Complexity by journal

In [4]:
# Get the top 15 most common journals in the dataset
top_sources = df['Source title'].value_counts().head(15).index

df_top = df[df['Source title'].isin(top_sources)]

readability_summary = df_top.groupby('Source title')[['readability', 'avg_sent_len']].mean()
readability_summary = readability_summary.sort_values('readability', ascending=False)
readability_summary

Unnamed: 0_level_0,readability,avg_sent_len
Source title,Unnamed: 1_level_1,Unnamed: 2_level_1
Journal of Medical Internet Research,22.941082,19.655134
International Journal of Environmental Research and Public Health,21.51042,19.707787
Expert Systems with Applications,18.532374,21.789061
Frontiers in Public Health,18.180444,19.78437
International Journal of Advanced Computer Science and Applications,16.879073,17.983764
PLoS ONE,16.285762,21.101791
Applied Sciences (Switzerland),14.208316,20.366472
Sustainability (Switzerland),13.902759,21.04616
British Journal of Educational Technology,13.505625,21.911533
Technological Forecasting and Social Change,13.103491,21.677279


### Detecting theories using dictionary matching

In [5]:
import re

# Load the theory dictionary
# Each row in the CSV should look like: Theory, Term
theories_df = pd.read_csv('../data/theories.csv')

# Create a mapping of term → theory
theory_terms = {}
for _, row in theories_df.iterrows():
    theory = row['Theory']
    terms = row['Term'].split('; ')  # Assuming terms are separated by "; "
    for term in terms:
        theory_terms[term.lower()] = theory

# Compile regex patterns with word boundaries for accurate matches
compiled_patterns = {
    re.compile(r'\b{}\b'.format(re.escape(term)), re.IGNORECASE): theory
    for term, theory in theory_terms.items()
}

# Define a function to find which theories are mentioned
def find_theories(text):
    if not isinstance(text, str):
        return None
    matched = set()
    for pattern, theory in compiled_patterns.items():
        if pattern.search(text):
            matched.add(theory)
    return '; '.join(sorted(matched)) if matched else None

# Apply it to the dataset
df['Theory'] = df['text'].apply(find_theories)

# Preview
df[['text', 'Theory']].dropna().head()

Unnamed: 0,text,Theory
148,The impact of LLM chatbots on learning outcome...,complexity theory
314,Research on the Impact of the Synergy Between ...,policy feedback theory
338,Central bank mandates and monetary policy stan...,discourse theory
363,Proposed design of an augmented deep learning ...,complexity theory
374,Image captioning with residual swin transforme...,network theory


In [6]:
theory_count = df['Theory'].str.split(';').explode().str.strip().value_counts().reset_index()

# Renaming columns for better readability
theory_count.column = ['Theory', 'Count']

# Display the resulting dataframe
print(theory_count)

                             Theory  count
0                  discourse theory     27
1                    network theory     27
2                 complexity theory     16
3            policy feedback theory      9
4              institutional theory      7
5   diffusion of innovations theory      6
6        narrative policy framework      5
7                       game theory      5
8        multiple streams framework      4
9     punctuated equilibrium theory      4
10                   systems theory      3
11    social construction framework      3
12                   framing theory      1
13                  critical theory      1
14             public choice theory      1
15     advocacy coalition framework      1
16            social capital theory      1
17                  prospect theory      1
18          ecology of games theory      1


  theory_count.column = ['Theory', 'Count']


### Detecting normative language (modal verbs)

In [7]:
#Can be implemented as a rule-based approach, but here we demonstrate its use with a pre-trained model

import spacy
nlp = spacy.load("en_core_web_sm")

def count_modals(text):
    doc = nlp(text)
    return sum(1 for token in doc if token.tag_ == 'MD')

df['modal_count'] = df['text'].apply(count_modals)
df['modal_count'].describe()

  from .autonotebook import tqdm as notebook_tqdm


count    5039.000000
mean        1.265330
std         1.562528
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max        19.000000
Name: modal_count, dtype: float64

In [8]:
# Normativity by journal (top 10 by average modal count)
df_top = df[df['Source title'].isin(top_sources)]
df_top.groupby('Source title')['modal_count'].mean().sort_values(ascending=False).head(10)

Source title
British Journal of Educational Technology                              3.918919
Journal of Medical Internet Research                                   2.048077
Expert Systems with Applications                                       1.521739
Technological Forecasting and Social Change                            1.326531
Applied Sciences (Switzerland)                                         1.303030
International Journal of Environmental Research and Public Health      1.282609
Heliyon                                                                1.240000
PLoS ONE                                                               1.225000
Frontiers in Public Health                                             1.142857
International Journal of Advanced Computer Science and Applications    1.125000
Name: modal_count, dtype: float64