In [None]:
## This code reads in research pdfs and uses dictionary-based phrase matching to code, for each article,
## whether each methodological category (self-report, behavior, neurophysiological) is mentioned
## It assumes:
    ## there is one research article per pdf
    ## the first four characters of the pdf file name encode the publication year
    ## pdfs are grouped by journal and stored in a journal-specific directory

# Dependencies
# PyPDF2
# Pandas
# Matplotlib

# Import libraries
import os
import pandas as pd
from PyPDF2 import PdfReader
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
# Locations of the pdfs: one folder per journal, all beneath the same project code directory

code_root = '/home/rwhuskey/ucd/studies/oxford_handbook_chapter_cognition/code'

mp_pdf_directory = f'{code_root}/media_psychology_all'    # Media Psychology
jmp_pdf_directory = f'{code_root}/jmp_all'                # Journal of Media Psychology
ppm_pdf_directory = f'{code_root}/ppm_all'                # Psychology of Popular Media

In [None]:
# Create one empty list per journal; each will collect the single-row dataframes
# (one per pdf) that the loading cells build
mp_dfs, jmp_dfs, ppm_dfs = [], [], []

In [None]:
# Read every pdf in the Media Psychology directory into final_mp_df
# (one row per pdf: 'Text' = extracted text of all pages, 'Year' = first four characters of the file name)

# Start from an empty list so that re-running this cell does not append the same articles twice
mp_dfs = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of final_mp_df reproducible
for filename in sorted(os.listdir(mp_pdf_directory)):
    if not filename.endswith('.pdf'):
        continue
    mp_pdf_path = os.path.join(mp_pdf_directory, filename)

    # Open the pdf file and extract the text of every page, concatenated without a separator
    with open(mp_pdf_path, 'rb') as file:
        mp_pdf_reader = PdfReader(file)
        mp_text = ''.join(page.extract_text() for page in mp_pdf_reader.pages)

    # Create a single-row dataframe with the extracted text and year, and collect it
    mp_df = pd.DataFrame({'Text': [mp_text], 'Year': [filename[:4]]})
    mp_dfs.append(mp_df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate"
if not mp_dfs:
    raise ValueError(f'No pdf files found in {mp_pdf_directory}')

# Concatenate all dataframes in the list
final_mp_df = pd.concat(mp_dfs, ignore_index=True)

In [None]:
# Inspect the contents of final_mp_df
print(final_mp_df.head())  # Print the first few rows of the DataFrame
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
final_mp_df.info()

In [None]:
# Read every pdf in the Journal of Media Psychology directory into final_jmp_df
# (one row per pdf: 'Text' = extracted text of all pages, 'Year' = first four characters of the file name)

# Start from an empty list so that re-running this cell does not append the same articles twice
jmp_dfs = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of final_jmp_df reproducible
for filename in sorted(os.listdir(jmp_pdf_directory)):
    if not filename.endswith('.pdf'):
        continue
    jmp_pdf_path = os.path.join(jmp_pdf_directory, filename)

    # Open the pdf file and extract the text of every page, concatenated without a separator
    with open(jmp_pdf_path, 'rb') as file:
        jmp_pdf_reader = PdfReader(file)
        jmp_text = ''.join(page.extract_text() for page in jmp_pdf_reader.pages)

    # Create a single-row dataframe with the extracted text and year, and collect it
    jmp_df = pd.DataFrame({'Text': [jmp_text], 'Year': [filename[:4]]})
    jmp_dfs.append(jmp_df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate"
if not jmp_dfs:
    raise ValueError(f'No pdf files found in {jmp_pdf_directory}')

# Concatenate all dataframes in the list
final_jmp_df = pd.concat(jmp_dfs, ignore_index=True)

In [None]:
# Inspect the contents of final_jmp_df
print(final_jmp_df.head())  # Print the first few rows of the DataFrame
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
final_jmp_df.info()

In [None]:
# Read every pdf in the Psychology of Popular Media directory into final_ppm_df
# (one row per pdf: 'Text' = extracted text of all pages, 'Year' = first four characters of the file name)

# Start from an empty list so that re-running this cell does not append the same articles twice
ppm_dfs = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of final_ppm_df reproducible
for filename in sorted(os.listdir(ppm_pdf_directory)):
    if not filename.endswith('.pdf'):
        continue
    ppm_pdf_path = os.path.join(ppm_pdf_directory, filename)

    # Open the pdf file and extract the text of every page, concatenated without a separator
    with open(ppm_pdf_path, 'rb') as file:
        ppm_pdf_reader = PdfReader(file)
        ppm_text = ''.join(page.extract_text() for page in ppm_pdf_reader.pages)

    # Create a single-row dataframe with the extracted text and year, and collect it
    ppm_df = pd.DataFrame({'Text': [ppm_text], 'Year': [filename[:4]]})
    ppm_dfs.append(ppm_df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate"
if not ppm_dfs:
    raise ValueError(f'No pdf files found in {ppm_pdf_directory}')

# Concatenate all dataframes in the list
final_ppm_df = pd.concat(ppm_dfs, ignore_index=True)

In [None]:
# Inspect the contents of final_ppm_df
print(final_ppm_df.head())  # Print the first few rows of the DataFrame
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
final_ppm_df.info()

In [None]:
# Create one empty measures table per journal. Every table has the same columns:
# a column for each measurement category plus the publication year.
measures_schema = ["self_report", "behavior", "neurophysio", "year"]
mp_measures_df, jmp_measures_df, ppm_measures_df = (
    pd.DataFrame(columns=measures_schema) for _ in range(3)
)

In [None]:
# Define the phrases to search for, one list per measurement category.
# Every entry must be followed by a comma: Python silently concatenates adjacent string
# literals, which would fuse two phrases into one that never matches.
# Acronyms are padded with spaces (e.g. " RT ") so they are only found as stand-alone tokens.
self_search_phrases = [
    "self-report", "scale", "think aloud", "rate", "rated", "Likert", "thought listing",
    "thought-listing",
]
behavior_search_phrases = [
    "signal detection", "d-prime", "STRT", "secondary task reaction time", " RT ",
    "Reaction Time", "mental rotation", "kills-per-round", "kills per round",
    "task performance", "task-performance", "Weak-Link Coordination Exercise",
    "Weak Link Coordination Exercise", "Implicit Association Test", "Implicit Attitude",
    "Implicit Attitudes", "Helping Behavior", "Helping-Behaviors", "decision time",
    "response inhibition", "recognition test", "recognition", "continuous response measure",
    "continuous response measures", "tangram", "BeanFest", "Aggressive Behavior",
    "anagram task", "posted a comment", "wrote a comment", "write a comment",
]
physio_search_phrases = [
    " ECG ", " EKG ", "Electrocardiogram", " EMG ", "Electromyography", "orbicularis oculi",
    # "supercilii" is the correct spelling; the original misspelled variant is kept as well
    # so that texts containing the same misspelling are still matched
    "corrugator supercilii", "corrugator supercilli",
    "Skin Conductance", "Heart Rate", "Heart Rate Variability",
    " EEG ", "Electroencephalogram", "Event Related Potential", "electrodermal activity",
    " fMRI ", "functional magnetic resonance imaging", "Eye Tracking", "Eye-Tracking",
    "Dwell Time", "Dwell-Time", "Eye Gaze", "Eye-Gaze",
]
# NOTE(review): survey_search_phrases is not referenced by the classification cells of this
# notebook - confirm whether a survey/online-experiment category is still planned
survey_search_phrases = ["survey experiment", "online experiment", "qualtrics", "mturk", "prolific academic"]

In [None]:
# Code every Media Psychology article: for each measurement category, store 1 when at least
# one of the category's phrases occurs in the article text (case-sensitive substring match)
# and 0 otherwise, then record the publication year
mp_phrase_sets = (
    ('self_report', self_search_phrases),
    ('behavior', behavior_search_phrases),
    ('neurophysio', physio_search_phrases),
)

for index, row in final_mp_df.iterrows():
    mp1_text, mp1_year = row['Text'], row['Year']

    # One 0/1 flag per category, written in the order self_report, behavior, neurophysio
    for column, phrases in mp_phrase_sets:
        mp_measures_df.loc[index, column] = 1 if any(phrase in mp1_text for phrase in phrases) else 0

    # Set the year
    mp_measures_df.loc[index, 'year'] = mp1_year

In [None]:
# Look at the results: the per-article 0/1 category flags and publication year for Media Psychology
print(mp_measures_df)

In [None]:
# Code every Journal of Media Psychology article: for each measurement category, store 1 when
# at least one of the category's phrases occurs in the article text (case-sensitive substring
# match) and 0 otherwise, then record the publication year
jmp_phrase_sets = (
    ('self_report', self_search_phrases),
    ('behavior', behavior_search_phrases),
    ('neurophysio', physio_search_phrases),
)

for index, row in final_jmp_df.iterrows():
    jmp1_text, jmp1_year = row['Text'], row['Year']

    # One 0/1 flag per category, written in the order self_report, behavior, neurophysio
    for column, phrases in jmp_phrase_sets:
        jmp_measures_df.loc[index, column] = 1 if any(phrase in jmp1_text for phrase in phrases) else 0

    # Set the year
    jmp_measures_df.loc[index, 'year'] = jmp1_year

In [None]:
# Look at the results: the per-article 0/1 category flags and publication year for Journal of Media Psychology
print(jmp_measures_df)

In [None]:
# Code every Psychology of Popular Media article: for each measurement category, store 1 when
# at least one of the category's phrases occurs in the article text (case-sensitive substring
# match) and 0 otherwise, then record the publication year
ppm_phrase_sets = (
    ('self_report', self_search_phrases),
    ('behavior', behavior_search_phrases),
    ('neurophysio', physio_search_phrases),
)

for index, row in final_ppm_df.iterrows():
    ppm1_text, ppm1_year = row['Text'], row['Year']

    # One 0/1 flag per category, written in the order self_report, behavior, neurophysio
    for column, phrases in ppm_phrase_sets:
        ppm_measures_df.loc[index, column] = 1 if any(phrase in ppm1_text for phrase in phrases) else 0

    # Set the year
    ppm_measures_df.loc[index, 'year'] = ppm1_year

In [None]:
# Look at the results: the per-article 0/1 category flags and publication year for Psychology of Popular Media
print(ppm_measures_df)

In [None]:
# Bar chart: number of Media Psychology articles flagged for each measurement category

# Display names for the three categories, in the same order as the summed columns
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']

# Column sums of the 0/1 flags = number of flagged articles per category
counts_mp = mp_measures_df[['self_report', 'behavior', 'neurophysio']].sum()

# Draw on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(category_labels, counts_mp.values)
ax.set_ylabel('Frequency Count')
ax.set_title('Frequency Count of Measures For all 285 Research Articles \nPublished in Media Psychology from 2013 - 2022')

# Write the chart to disk as a JPEG, then display it
fig.savefig('mp_plot.jpg', format='jpeg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Tabulate the Media Psychology category counts (one row per category) and print the table
table_data_mp = dict(Category=category_labels, Count=counts_mp.values)
results_table_mp = pd.DataFrame(data=table_data_mp)

print(results_table_mp)

In [None]:
# Plot Journal of Media Psychology Measures in Total

# Calculate the count of each category (column sums of the 0/1 flags)
counts_jmp = jmp_measures_df[['self_report', 'behavior', 'neurophysio']].sum()

# Update the category labels
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']

# Take the article count from the data instead of hard-coding it, so the title always
# matches the number of articles that were actually coded
n_jmp_articles = len(jmp_measures_df)

# Create the bar chart
plt.bar(category_labels, counts_jmp.values)

# Add labels and title
plt.ylabel('Frequency Count')
plt.title(f'Frequency Count of Measures For all {n_jmp_articles} Research Articles \nPublished in Journal of Media Psychology from 2013 - 2022')

# Save the plot as a JPEG image
plt.savefig('jmp_plot.jpg', format='jpeg', dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
# Tabulate the Journal of Media Psychology category counts (one row per category) and print the table
table_data_jmp = dict(Category=category_labels, Count=counts_jmp.values)
results_table_jmp = pd.DataFrame(data=table_data_jmp)

print(results_table_jmp)

In [None]:
# Bar chart: number of Psychology of Popular Media articles flagged for each measurement category

# Display names for the three categories, in the same order as the summed columns
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']

# Column sums of the 0/1 flags = number of flagged articles per category
counts_ppm = ppm_measures_df[['self_report', 'behavior', 'neurophysio']].sum()

# Draw on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(category_labels, counts_ppm.values)
ax.set_ylabel('Frequency Count')
ax.set_title('Frequency Count of Measures For all 383 Research Articles \nPublished in Psychology of Popular Media from 2013 - 2022')

# Write the chart to disk as a JPEG, then display it
fig.savefig('ppm_plot.jpg', format='jpeg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Tabulate the Psychology of Popular Media category counts (one row per category) and print the table
table_data_ppm = dict(Category=category_labels, Count=counts_ppm.values)
results_table_ppm = pd.DataFrame(data=table_data_ppm)

print(results_table_ppm)

In [None]:
# Line chart: per-year number of Media Psychology articles flagged for each measurement category

# Sum the 0/1 flags within each publication year
grouped_df_mp = mp_measures_df.groupby('year').sum()

# One line per category, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
for column, legend_label in (('self_report', 'Self Report'),
                             ('behavior', 'Behavior'),
                             ('neurophysio', 'Neurophysio')):
    ax.plot(grouped_df_mp.index, grouped_df_mp[column], label=legend_label)

# Axis labels and title
ax.set_xlabel('Publication Year')
ax.set_ylabel('Frequency Count')
ax.set_title('Frequency Count of Measures For all 285 Research Articles \nPublished in Media Psychology For Each Year From 2013 - 2022')

# Category display names (not used by this chart; the assignment is kept as in the original cell)
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']

ax.legend()

# Write the chart to disk as a JPEG, then display it
fig.savefig('mp_plot_year.jpg', format='jpeg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Group the dataframe by year and calculate the sum for each column for Journal of Media Psychology
grouped_df_jmp = jmp_measures_df.groupby('year').sum()

# Take the article count from the data instead of hard-coding it, so the title always
# matches the number of articles that were actually coded
n_jmp_articles = len(jmp_measures_df)

# Plot the line chart (one line per measurement category)
plt.plot(grouped_df_jmp.index, grouped_df_jmp['self_report'], label='Self Report')
plt.plot(grouped_df_jmp.index, grouped_df_jmp['behavior'], label='Behavior')
plt.plot(grouped_df_jmp.index, grouped_df_jmp['neurophysio'], label='Neurophysio')

# Set the axis labels and title
plt.xlabel('Publication Year')
plt.ylabel('Frequency Count')
plt.title(f'Frequency Count of Measures For all {n_jmp_articles} Research Articles \nPublished in Journal of Media Psychology For Each Year From 2013 - 2022')

# Update the category labels
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']

# Add a legend
plt.legend()

# Save the plot as a JPEG image
plt.savefig('jmp_plot_year.jpg', format='jpeg', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

In [None]:
# Line chart: per-year number of Psychology of Popular Media articles flagged for each measurement category

# Sum the 0/1 flags within each publication year
grouped_df_ppm = ppm_measures_df.groupby('year').sum()

# One line per category, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
for column, legend_label in (('self_report', 'Self Report'),
                             ('behavior', 'Behavior'),
                             ('neurophysio', 'Neurophysio')):
    ax.plot(grouped_df_ppm.index, grouped_df_ppm[column], label=legend_label)

# Axis labels and title
ax.set_xlabel('Publication Year')
ax.set_ylabel('Frequency Count')
ax.set_title('Frequency Count of Measures For all 383 Research Articles \nPublished in Psychology of Popular Media For Each Year From 2013 - 2022')

# Category display names (not used by this chart; the assignment is kept as in the original cell)
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']

ax.legend()

# Write the chart to disk as a JPEG, then display it
fig.savefig('ppm_plot_year.jpg', format='jpeg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl

# Combined 3 x 3 figure, one column per journal:
#   row 1 - articles flagged per measurement category (bar chart)
#   row 2 - flagged articles per category and publication year (line chart)
#   row 3 - total number of articles per publication year (line chart)

journal_labels = ['Media Psychology', 'Journal of Media Psychology', 'Psychology of Popular Media']
category_labels = ['Self-Report', 'Behavior', 'Neurophysiological']
measure_columns = ['self_report', 'behavior', 'neurophysio']
yearly_legend_labels = ['Self Report', 'Behavior', 'Biological']

# Row 1 data: column sums of the 0/1 flags
counts_mp = mp_measures_df[measure_columns].sum()
counts_jmp = jmp_measures_df[measure_columns].sum()
counts_ppm = ppm_measures_df[measure_columns].sum()

# Row 2 data: flags summed within each publication year
grouped_df_mp = mp_measures_df.groupby('year').sum()
grouped_df_jmp = jmp_measures_df.groupby('year').sum()
grouped_df_ppm = ppm_measures_df.groupby('year').sum()

# Row 3 data: number of articles per publication year
total_counts = [mp_measures_df['year'].value_counts().sort_index(),
                jmp_measures_df['year'].value_counts().sort_index(),
                ppm_measures_df['year'].value_counts().sort_index()]

# Apply the larger font only while this figure is built, saved and shown. Updating
# mpl.rcParams globally would leak the 18pt font into every other plotting cell that is
# (re-)run afterwards in the same kernel.
with mpl.rc_context({'font.size': 18}):
    # Create a figure and subplots
    fig, axes = plt.subplots(3, 3, figsize=(18, 20))  # Increase figure size

    # Row 1: measures in total, one bar chart per journal
    for col, (journal, totals) in enumerate(zip(journal_labels, (counts_mp, counts_jmp, counts_ppm))):
        ax = axes[0, col]
        ax.bar(category_labels, totals.values)
        ax.set_ylim([0, 400])  # Same vertical axis in all three panels so they are comparable
        ax.set_ylabel('Frequency Count')
        ax.set_title(f'Measures in Total\n{journal} ')
        ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels

    # Row 2: measures by year, one line per category
    for col, (journal, yearly) in enumerate(zip(journal_labels, (grouped_df_mp, grouped_df_jmp, grouped_df_ppm))):
        ax = axes[1, col]
        for column, legend_label in zip(measure_columns, yearly_legend_labels):
            ax.plot(yearly.index, yearly[column], label=legend_label)
        ax.set_ylim([0, 80])  # Same vertical axis in all three panels so they are comparable
        ax.set_xlabel('Publication Year')
        ax.set_ylabel('Frequency Count')
        ax.set_title(f'Measures by Year\n{journal} ')
        ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels
        ax.legend()

    # Row 3: total number of articles published per journal by year
    for i, counts in enumerate(total_counts):
        axes[2, i].plot(counts.index, counts.values)
        axes[2, i].set_ylim([0, 80])  # Set vertical axis limits
        axes[2, i].set_xlabel('Publication Year')
        axes[2, i].set_ylabel('Article Count')
        axes[2, i].set_title(f'Total Articles\n{journal_labels[i]}')
        axes[2, i].tick_params(axis='x', rotation=45)  # Rotate x-axis labels

    # Adjust spacing between subplots
    plt.tight_layout(rect=[0, 0, 1, 0.95])  # Add more space at the top for plot titles

    # Save the plot as a JPEG image
    plt.savefig('combined_plot.jpg', format='jpeg', dpi=300, bbox_inches='tight')

    # Show the combined plot
    plt.show()


In [None]:
# Sanity check. Are all the pdfs being read in correctly?
# Print the first 1000 words of the text extracted from each Media Psychology pdf

import re

for i, mp_df in enumerate(mp_dfs):
    # Each dataframe holds a single article; split its text into word tokens and keep the first 1000
    article_text = mp_df['Text'].iloc[0]
    leading_words = re.findall(r'\b\w+\b', article_text)[:1000]

    print(f"DataFrame {i+1}:")
    print(' '.join(leading_words))
    print('\n')

In [None]:
# Sanity check. Are all the pdfs being read in correctly?
# Print the first 1000 words of the text extracted from each Journal of Media Psychology pdf

for i, jmp_df in enumerate(jmp_dfs):
    # Each dataframe holds a single article; split its text into word tokens and keep the first 1000
    article_text = jmp_df['Text'].iloc[0]
    leading_words = re.findall(r'\b\w+\b', article_text)[:1000]

    print(f"DataFrame {i+1}:")
    print(' '.join(leading_words))
    print('\n')

In [None]:
# Sanity check. Are all the pdfs being read in correctly?
# Print the first 1000 words of the text extracted from each Psychology of Popular Media pdf

for i, ppm_df in enumerate(ppm_dfs):
    # Each dataframe holds a single article; split its text into word tokens and keep the first 1000
    article_text = ppm_df['Text'].iloc[0]
    leading_words = re.findall(r'\b\w+\b', article_text)[:1000]

    print(f"DataFrame {i+1}:")
    print(' '.join(leading_words))
    print('\n')

In [None]:
## Do some analyses for reliability between automated and manual content analysis

In [None]:
# Sum counts by "Self-Report", "Behavior", "Neurophysiological" within each year
# Select the columns with a list (double brackets): selecting several groupby columns with a
# bare tuple of keys was deprecated and raises an error in pandas >= 2.0
sum_table = jmp_measures_df.groupby('year')[['self_report', 'behavior', 'neurophysio']].sum().reset_index()

# Display the table
print(sum_table)

In [None]:
# Empty coding table for the reliability check: the three category flags and the year,
# plus the article text
reliability_columns = ["self_report", "behavior", "neurophysio", "year", "text"]
jmp_measures_df_reliability = pd.DataFrame(columns=reliability_columns)

In [None]:
# Code every Journal of Media Psychology article for the reliability table: for each
# measurement category, store 1 when at least one of the category's phrases occurs in the
# article text (case-sensitive substring match) and 0 otherwise, then record the publication
# year and the article text itself
jmp_reliability_phrase_sets = (
    ('self_report', self_search_phrases),
    ('behavior', behavior_search_phrases),
    ('neurophysio', physio_search_phrases),
)

for index, row in final_jmp_df.iterrows():
    jmp1_text, jmp1_year = row['Text'], row['Year']

    # One 0/1 flag per category, written in the order self_report, behavior, neurophysio
    for column, phrases in jmp_reliability_phrase_sets:
        jmp_measures_df_reliability.loc[index, column] = 1 if any(phrase in jmp1_text for phrase in phrases) else 0

    # Set the year
    jmp_measures_df_reliability.loc[index, 'year'] = jmp1_year

    # Set Text
    jmp_measures_df_reliability.loc[index, 'text'] = jmp1_text

In [None]:
# Keep only the rows whose year is the string '2022' (years are stored as text), then print them
published_in_2022 = jmp_measures_df_reliability['year'] == '2022'
filtered_results = jmp_measures_df_reliability[published_in_2022]

print(filtered_results)

In [None]:
# Keep only the rows whose year is the string '2022' (years are stored as text)
published_in_2022 = jmp_measures_df_reliability['year'] == '2022'
filtered_results = jmp_measures_df_reliability[published_in_2022]

# Write the 2022 rows to a CSV file without the row index
filtered_results.to_csv('filtered_results.csv', index=False)