In [8]:
import os
import pdfplumber
import pandas as pd
from difflib import SequenceMatcher

# CSV of scraped paper records; its 'Title' column is what PDFs are matched against below.
# The path is relative to the notebook's working directory.
file_path = '../../1_scraping/Scrapes_ALL.csv'
Scrape_df = pd.read_csv(file_path)
#Scrape_df.head()

# Directory where the PDFs are stored (relative to the notebook's working directory)
pdf_dir = '../ScienceDirectPDF'

# Function to extract text from a PDF file using pdfplumber
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        The text of all pages joined in page order with no separator. Pages
        for which ``extract_text()`` yields nothing (``None`` or ``""``)
        contribute nothing. If opening or parsing fails for any reason, the
        error is printed and ``""`` is returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Call extract_text() only once per page and reuse the result;
            # a falsy result means the page has no extractable text.
            page_texts = (page.extract_text() for page in pdf.pages)
            return ''.join(text for text in page_texts if text)
    except Exception as e:
        # Best-effort: one unreadable PDF must not abort the whole batch.
        print(f"Failed to extract text from {pdf_file}: {e}")
        return ""

# Function to get the title of the PDF (assumes the title is at the top of the first page)
def extract_title_from_pdf(pdf_file):
    """Guess a PDF's title from the top of its first page.

    The first non-blank line of the first page is taken as the title. If that
    line starts with the marker ``"## Title:"``, the marker is removed.

    Args:
        pdf_file: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        The stripped title line, or ``""`` when the PDF has no pages, the
        first page has no extractable text, or the file cannot be opened or
        parsed (in that last case the error is also printed).
    """
    title_marker = "## Title:"
    try:
        with pdfplumber.open(pdf_file) as pdf:
            if not pdf.pages:
                return ""
            # extract_text() can yield None for a page without text;
            # normalise to "" so the line scan below simply finds nothing.
            first_page_text = pdf.pages[0].extract_text() or ""
    except Exception as e:
        # Best-effort: one unreadable PDF must not abort the whole batch.
        print(f"Failed to extract title from {pdf_file}: {e}")
        return ""

    for line in first_page_text.splitlines():
        candidate = line.strip()
        if candidate.startswith(title_marker):
            # Strip only the leading marker, not later occurrences in the title.
            candidate = candidate[len(title_marker):].strip()
        if candidate:
            return candidate
    return ""


# Helper function to find the closest matching title in Scrape_df
def find_closest_title(pdf_title, title_list, threshold=0.8):
    """Return the entry of ``title_list`` most similar to ``pdf_title``.

    Similarity is ``difflib.SequenceMatcher(None, pdf_title, title).ratio()``,
    which is case-sensitive.

    Args:
        pdf_title: Title text extracted from a PDF.
        title_list: Iterable of candidate titles. Entries that are not strings
            (e.g. NaN for a missing value) are skipped.
        threshold: The best ratio must be strictly greater than this for a
            match to be reported. Defaults to 0.8.

    Returns:
        The best-scoring candidate (the earliest one on ties), or ``None`` if
        no candidate scores above ``threshold`` or ``pdf_title`` is not a
        string.
    """
    if not isinstance(pdf_title, str):
        return None

    closest_title = None
    highest_ratio = 0
    for title in title_list:
        if not isinstance(title, str):
            # SequenceMatcher needs sequences; NaN/None would raise TypeError.
            continue
        matcher = SequenceMatcher(None, pdf_title, title)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); if either cannot beat the current best, skip the
        # expensive full comparison.
        if (matcher.real_quick_ratio() <= highest_ratio
                or matcher.quick_ratio() <= highest_ratio):
            continue
        ratio = matcher.ratio()
        if ratio > highest_ratio:
            highest_ratio = ratio
            closest_title = title
    return closest_title if highest_ratio > threshold else None  # Threshold for similarity

# Create a list to store the full text for each paper
# (kept for compatibility; nothing in this cell appends to it)
full_texts = []

# Titles to match against. Missing titles (NaN) are dropped because they
# cannot be compared with a PDF title.
title_column = Scrape_df['Title'].dropna().tolist()

# Make sure the target column exists even when no PDF matches, so later cells
# that read Scrape_df['Full_Text'] do not fail with a KeyError.
if 'Full_Text' not in Scrape_df.columns:
    Scrape_df['Full_Text'] = None

# Iterate over the PDF files in the pdfs directory. Sorting makes the order
# (and therefore which PDF wins when two match the same title) reproducible;
# os.listdir() returns entries in arbitrary order.
for pdf_file in sorted(os.listdir(pdf_dir)):
    # Accept ".pdf" in any letter case (e.g. ".PDF").
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    pdf_title = extract_title_from_pdf(pdf_path)

    if not pdf_title:
        print(f"Could not extract title from PDF: {pdf_file}")
        continue

    # Find the closest matching title in the dataframe
    matched_title = find_closest_title(pdf_title, title_column)

    if not matched_title:
        print(f"No matching title found in the dataframe for PDF title: {pdf_title}")
        continue

    # Extract text from the PDF
    full_text = extract_text_from_pdf(pdf_path)
    print(f'Full text extracted for "{matched_title}": {bool(full_text)}')  # Verify text extraction

    # Assign full text to the appropriate row(s) in the dataframe. An empty
    # result is not written: "" would count as a value in .count() and could
    # overwrite text already stored for the same title.
    if full_text:
        Scrape_df.loc[Scrape_df['Title'] == matched_title, 'Full_Text'] = full_text

# Display the dataframe to verify
print("Full text successfully added to the dataframe where matches were found.")
Scrape_df.head()


No matching title found in the dataframe for PDF title: BP4D-Spontaneous: a high-resolution spontaneous 3D
No matching title found in the dataframe for PDF title: Decline or improvement?
Full text extracted for "Deep convolution network based emotion analysis towards mental health care": True
No matching title found in the dataframe for PDF title: Deep spatial-temporal feature fusion for facial expression
No matching title found in the dataframe for PDF title: Development of a Real-Time Emotion Recognition System
No matching title found in the dataframe for PDF title: JavaScript is disabled on your browser. Please enable JavaScript to use all
No matching title found in the dataframe for PDF title: Emotional facial expressions evoke faster orienting
Full text extracted for "Extended deep neural network for facial emotion recognition": True
No matching title found in the dataframe for PDF title: Facial expression recognition with Convolutional Neural
Full text extracted for "Facial expre

Unnamed: 0.1,Unnamed: 0,Title,Authors,Year,Cited By,Detected_Dataset,Detected_Topic,Abstract,DOI,Journal,...,Mentions_P-value,Mentions_T-test,Mentions_Anova,Mentions_Correlation,Mentions_Regression,Mentions_Baseline_comparison,Mentions_Mae,Mentions_Rmse,Mentions_Bias,Full_Text
0,0,10 Automated Face Analysis for Affective Compu...,"['JF Cohn', 'F De la Torre']",2015,170,Affective Faces Database,classifier,Differences in manual coding between databases...,No DOI,The Oxford handbook of affective …,...,False,False,False,False,False,False,False,False,False,
1,1,3D facial expression recognition based on auto...,"['H Tang', 'TS Huang']",2008,205,Binghamton University 3D Facial Expression,"classification, classifier, facial expression ...",facial expression recognition from 3D facial s...,No DOI,… on computer vision and pattern recognition …,...,False,False,False,False,False,False,False,False,False,
2,2,3D facial expression recognition based on prim...,"['J Wang', 'L Yin', 'X Wei', 'Y Sun']",2006,440,Binghamton University 3D Facial Expression,facial expression recognition,expressions using 3D facial expression range d...,No DOI,… Vision and Pattern Recognition …,...,False,False,False,False,False,False,False,False,False,
3,3,3D facial expression recognition based on prop...,"['H Tang', 'TS Huang']",2008,153,Binghamton University 3D Facial Expression,"classification, classifier, facial expression ...",Binghamton University have recently constructe...,No DOI,… on Automatic Face & Gesture Recognition,...,False,False,False,False,False,False,False,False,False,
4,4,3D facial expression recognition using SIFT de...,"['S Berretti', 'B Ben Amor', 'M Daoudi', 'A De...",2011,184,Binghamton University 3D Facial Expression,"classification, classifier, facial expression ...",at the Binghamton University (BU-3DFE database...,No DOI,The Visual Computer,...,False,False,False,False,False,False,False,False,False,


In [9]:
# Number of rows whose Full_Text is non-null (note: an empty string still counts as non-null).
Scrape_df['Full_Text'].count()

17

In [10]:
# Save without the row index: writing it adds a new "Unnamed: 0" column every
# time the file is read back and saved again (the input already carries
# "Unnamed: 0.1" and "Unnamed: 0" from earlier round trips).
Scrape_df.to_csv('FullTextscrapes_scienceDirect.csv', index=False)