In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [3]:
# Load URLs from Excel
input_file = '/content/Input.xlsx'
df = pd.read_excel(input_file)
output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

def extract_article(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title and text (customize based on HTML structure)
        title = soup.find('title').text.strip()
        article_body = soup.find('article') or soup.find('div', class_='article-content')

        if article_body:
            article_text = ' '.join(p.get_text() for p in article_body.find_all('p'))
        else:
            article_text = ''

        return title, article_text
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None, None

# Iterate over URLs and save articles
for index, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    title, article_text = extract_article(url)

    if article_text:
        file_path = os.path.join(output_dir, f"{url_id}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title}\n\n{article_text}")

### Text Analysis

In [4]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl.metadata (3.3 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m566.1 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.4


### code to load custom lists

In [8]:
def load_word_list(file_path, encoding='utf-8'):
    with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
        return [line.strip() for line in file]

# Try loading with different encodings if necessary
try:
    positive_words = load_word_list('/content/positive-words.txt')
    negative_words = load_word_list('/content/negative-words.txt')
except Exception as e:
    print(f"Error: {e}")

def load_stopwords(file_paths):
    stopwords = set()
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']  # List of encodings for referance

    for file_path in file_paths:
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    stopwords.update(line.strip().lower() for line in file)
                break  # If successful, exit the encoding loop
            except UnicodeDecodeError:
                continue  # Try the next encoding
            except FileNotFoundError:
                print(f"File not found: {file_path}")
                break
            except IOError as e:
                print(f"IO error occurred while reading {file_path}: {e}")
                break
    return stopwords

# Load stopwords
stopwords_files = [
    '/content/StopWords_Auditor.txt',
    '/content/StopWords_Currencies.txt',
    '/content/StopWords_DatesandNumbers.txt',
    '/content/StopWords_GenericLong.txt',
    '/content/StopWords_Generic.txt',
    '/content/StopWords_Geographic.txt',
    '/content/StopWords_Names.txt'
]

stopwords = load_stopwords(stopwords_files)


### Performing text analysis

In [9]:
pip install nltk textblob




In [11]:
import openpyxl

def read_output_structure(file_path):
    wb = openpyxl.load_workbook(file_path)
    ws = wb.active
    structure = [cell.value for cell in ws['A'] if cell.value]  # Assumes the variables are listed in column A
    return structure

output_structure_file = '/content/Output Data Structure.xlsx'
output_structure = read_output_structure(output_structure_file)

# Print the output structure to verify
print(f"Output structure: {output_structure}")


Output structure: ['URL_ID', 'bctech2011', 'bctech2012', 'bctech2013', 'bctech2014', 'bctech2015', 'bctech2016', 'bctech2017', 'bctech2018', 'bctech2019', 'bctech2020', 'bctech2021', 'bctech2022', 'bctech2023', 'bctech2024', 'bctech2025', 'bctech2026', 'bctech2027', 'bctech2028', 'bctech2029', 'bctech2030', 'bctech2031', 'bctech2032', 'bctech2033', 'bctech2034', 'bctech2035', 'bctech2036', 'bctech2037', 'bctech2038', 'bctech2039', 'bctech2040', 'bctech2041', 'bctech2042', 'bctech2043', 'bctech2044', 'bctech2045', 'bctech2046', 'bctech2047', 'bctech2048', 'bctech2049', 'bctech2050', 'bctech2051', 'bctech2052', 'bctech2053', 'bctech2054', 'bctech2055', 'bctech2056', 'bctech2057', 'bctech2058', 'bctech2059', 'bctech2060', 'bctech2061', 'bctech2062', 'bctech2063', 'bctech2064', 'bctech2065', 'bctech2066', 'bctech2067', 'bctech2068', 'bctech2069', 'bctech2070', 'bctech2071', 'bctech2072', 'bctech2073', 'bctech2074', 'bctech2075', 'bctech2076', 'bctech2077', 'bctech2078', 'bctech2079', 'bcte

In [12]:
import pandas as pd
import os
from textblob import TextBlob
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, sent_tokenize
import re

# Download NLTK data (only need to do this once)
nltk.download('punkt')
nltk.download('cmudict')

# Load the CMU Pronouncing Dictionary
d = cmudict.dict()

# Function to count syllables
def syllable_count(word):
    word = word.lower()
    if word in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word]])
    else:
        return len(re.findall(r'[aeiouy]', word))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [14]:
# Function to perform text analysis
def analyze_text(text):
    # Tokenization
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    num_sentences = len(sentences)
    num_words = len(words)

    # Filtering
    filtered_words = [word for word in words if word.lower() not in stopwords]

    # Calculations
    syllables = sum(syllable_count(word) for word in filtered_words)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    complex_word_count = sum(1 for word in filtered_words if syllable_count(word) > 2)
    word_count = num_words
    personal_pronouns = sum(word.lower() in {'i', 'me', 'my', 'mine', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves'} for word in filtered_words)
    avg_word_length = sum(len(word) for word in filtered_words) / word_count if word_count > 0 else 0
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0

    # Sentiment Analysis using custom lists
    positive_score = sum(1 for word in filtered_words if word.lower() in positive_words)
    negative_score = sum(1 for word in filtered_words if word.lower() in negative_words)

    # TextBlob Analysis
    blob = TextBlob(text)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # FOG Index Calculation
    fog_index = (avg_sentence_length + percentage_complex_words) * 0.4

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables / word_count if word_count > 0 else 0,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

# Load the extracted articles
output_dir = 'extracted_articles'
files = [f for f in os.listdir(output_dir) if f.endswith('.txt')]

# Prepare DataFrame to store results
results = []

for file_name in files:
    file_path = os.path.join(output_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        article_id = os.path.splitext(file_name)[0]
        analysis = analyze_text(content)
        results.append({'URL_ID': article_id, **analysis})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Load the output structure file to ensure correct ordering
output_structure_file = '/content/Output Data Structure.xlsx'
structure_df = pd.read_excel(output_structure_file)

# Print columns to debug
print("Columns in Output Data Structure:", structure_df.columns)
print("Columns in results_df:", results_df.columns)

# Reorder the columns based on the output structure file
ordered_columns = ['URL_ID'] + [col for col in structure_df.columns if col in results_df.columns]
results_df = results_df[ordered_columns]

# Save the results to an Excel file
results_df.to_excel('text_analysis_results.xlsx', index=False)
print("Text analysis completed and results saved to 'text_analysis_results.xlsx'.")

Columns in Output Data Structure: Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')
Columns in results_df: Index(['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')
Text analysis completed and results saved to 'text_analysis_results.xlsx'.


In [15]:
import os
print(os.listdir('.'))


['.config', 'text_analysis_results.xlsx', 'extracted_articles', 'StopWords_Geographic.txt', 'StopWords_Currencies.txt', 'StopWords_Names.txt', 'Input.xlsx', 'negative-words.txt', 'Output Data Structure.xlsx', 'positive-words.txt', 'StopWords_Auditor.txt', 'StopWords_Generic.txt', 'StopWords_GenericLong.txt', 'StopWords_DatesandNumbers.txt', 'sample_data']


In [16]:
results_df.to_csv('text_analysis_results.csv', index=False)


In [17]:
from google.colab import files
files.download('text_analysis_results.xlsx')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>