In [2]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
import re


In [2]:
text_file_dir = "../data/wikispeedia_articles_plaintext/plaintext_articles/"
file_data = []

for filename in os.listdir(text_file_dir):
    if filename.endswith(".txt"):  # Ensure you're only reading .txt files
        with open(os.path.join(text_file_dir, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        
            # Split the content into lines to remove the header
            lines = content.split('\n')

            # Removing the header (line 0) and join back the content
            if lines:
                title = lines.pop(2)  # Assuming the first line is the title
            
            content = ' '.join(lines).replace("\n", " ")
            
            # Create a dictionary with title and content
            file_data.append({"title": title, "text_content": content})

# Create a DataFrame
project_data = pd.DataFrame(file_data)

# Display the DataFrame structure
print(project_data.head())

article_titles = project_data['title'].tolist()
article_titles


                         title  \
0               Second Crusade   
1               Navassa Island   
2             Evan Rachel Wood   
3  Tropical Storm Henri (2003)   
4      Final Fantasy Adventure   

                                        text_content  
0     #copyright   2007 Schools Wikipedia Selecti...  
1     #copyright   2007 Schools Wikipedia Selecti...  
2     #copyright   2007 Schools Wikipedia Selecti...  
3     #copyright   2007 Schools Wikipedia Selecti...  
4     #copyright   2007 Schools Wikipedia Selecti...  


['Second Crusade',
 'Navassa Island',
 'Evan Rachel Wood',
 'Tropical Storm Henri (2003)',
 'Final Fantasy Adventure',
 'Tornado',
 'John W. Campbell',
 'Whooping Crane',
 'Shigin',
 'Hafnium',
 'Hawaiian Goose',
 'Ankylosaurus',
 'Andriyivskyy Descent',
 'Supermarine Spitfire',
 'Carl Jung',
 'Fine art',
 'Arithmetic',
 'Taj Mahal',
 'Social capital',
 'Nepal',
 'Book of Common Prayer',
 'African Buffalo',
 'Luminiferous aether',
 'Chicago',
 'Giza pyramid complex',
 'Lyme disease',
 'Humour',
 'Imperative programming',
 'Brain',
 'Coconut',
 'Tern',
 'Wars of the Roses',
 'Davis Cup',
 'Great Tit',
 'Wave–particle duality',
 'Jean Charles de Menezes',
 'Local community',
 'Music of Spain',
 'Terik',
 'Bernard Hinault',
 "Hickman's potentilla",
 'Eris (dwarf planet)',
 'Intelligence',
 'Margaret Sanger',
 'Bretton Woods system',
 'Glastonbury Festival',
 'Oxygen',
 'Gerald Ford',
 'Antarctic Treaty System',
 'John Locke',
 'Martin Luther King, Jr.',
 'Six-party talks',
 'Coot',
 'Nint

In [3]:

len(article_titles)

4604

In [4]:

def construct_url_RationalWiki(title):
    if not isinstance(title, str):
        return None  # Return None or handle the error as appropriate

    normalized_title = title.lower().replace(' ', '_')  # Convert title to lowercase and replace spaces with underscores
    return f"https://rationalwiki.org/wiki/{normalized_title}"

def construct_url_Infogalactic(title):
    if not isinstance(title, str):
        return None  # Return None or handle the error as appropriate
    normalized_title = title.lower().replace(' ', '_')  # Convert title to lowercase and replace spaces with underscores
    return f"https://infogalactic.com/info/{normalized_title}"

def construct_url_Conservapedia(title):
    normalized_title = title.lower().replace(' ', '_')  # Convert title to lowercase and replace spaces with underscores
    return f"https://www.conservapedia.com/{normalized_title}"


In [7]:
#WARNING - This cell may take a long time to run (90min) - risk of blocked IP address because of too many requests
results = []

for title in article_titles:
    url = construct_url_RationalWiki(title)
    response = requests.get(url)

    # Store the title and the response status code
    results.append({'Article Title': title, 'Response Status': response.status_code})

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Optionally, save the DataFrame to a CSV file
df.to_csv('article_status_results_RationalWiki.csv', index=False)

print(df)

                    Article Title  Response Status
0                  Second Crusade              404
1                  Navassa Island              404
2                Evan Rachel Wood              404
3     Tropical Storm Henri (2003)              404
4         Final Fantasy Adventure              404
...                           ...              ...
4599                      Réunion              404
4600                       Flower              404
4601                     Banknote              404
4602               Weyto language              404
4603                    Marseille              404

[4604 rows x 2 columns]


In [5]:
article_status_results_RationalWiki = pd.read_csv('./rationalWiki_article_status_results.csv')
article_status_results_RationalWiki['Response Status'].value_counts()


Response Status
404    3923
200     613
503      68
Name: count, dtype: int64

In [6]:

def get_article_index(title):
    return article_titles.index(title)
print(get_article_index('Nepal'))

19


In [7]:
#Test the html extract of rational wiki just for one article     

print(article_titles[19])
url = construct_url_RationalWiki(article_titles[19])
response = requests.get(url)
#print('Response status code: {0}\n'.format(response.status_code))
#print('Response headers: {0}\n'.format(response.headers))
#print('Response body: {0}'.format(response.text))
soup = BeautifulSoup(response.content, 'html.parser')

# Remove the specific navigation box
nav_box = soup.find('div', {'role': 'navigation', 'aria-labelledby': 'Nations_of_the_world-navigationbox'})
if nav_box:
    nav_box.decompose()

paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
text_content = ''
for para in paragraphs:
    for edit_link in para.find_all('span', class_="mw-editsection"): 
        edit_link.decompose()

    # Replace tags with spaces directly in the HTML
    html_with_spaces = re.sub(r'<[^>]+>', ' ', str(para))

    # Now create a new soup from the modified HTML
    new_soup = BeautifulSoup(html_with_spaces, 'html.parser')

    # Extract text
    text = new_soup.get_text(strip=True, separator=' ')
    if text:
        text_content += text + '\n'
    
print(text_content)

Nepal
Nepal
The  Federal Democratic Republic of Nepal  is a country north of  India  best known for its Himalayas mountains. Nepal has at least partial control of eight of the world's ten tallest peaks, including Mt. Everest, Kanchenjunga, and Lhotse.  [2]   Most of Nepal's population is  Hindu , but there are significant minorities of  Buddhists  and  Muslims . Nepal's religious traditions go back a long way, and there are a variety of notable temples scattered across the country that attract tourism.  [3]   The country's capital and largest city is Kathmandu.
But Nepal isn't all backpackers and monasteries. Nepal recently transitioned into a  republic  in 2008 after the abolition of the  monarchy , and it still struggles with political disputes between Maoists and  conservatives . Nepal's economy suffers from widespread poverty and other significant problems like poor infrastructure, a major obstacle to industrialization. Most of the people are employed in the agricultural sector. Un

In [12]:
#Create the dataframe for rationalwiki with the layout usable with the NLP file.
df = pd.read_csv('./rationalWiki_article_status_results.csv')
df_rationalwiki_titles = df[df['Response Status'] == 200]
df_rationalwiki_titles = df_rationalwiki_titles.drop(columns=['Response Status'])
df_rationalwiki_titles = df_rationalwiki_titles.rename(columns={'Article Title': 'title'})
display(df_rationalwiki_titles)

Unnamed: 0,title
19,Nepal
22,Luminiferous aether
23,Chicago
25,Lyme disease
26,Humour
...,...
4574,Bahrain
4578,Flood
4586,Neptune
4588,Hinduism


In [13]:
#Functions to extract the plain text of the website https://rationalwiki.org/ for every page that are present
# in both RationalWiki and Wikispedia dataset

def extract_text_rationalwiki(url):
    response = requests.get(url)
    if response.status_code != 200:
        return "Error: Unable to fetch article"

    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove the specific navigation box
    nav_box = soup.find('div', {'role': 'navigation', 'aria-labelledby': 'Nations_of_the_world-navigationbox'})
    if nav_box:
        nav_box.decompose()

    paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    text_content = ''
    for para in paragraphs:
        for edit_link in para.find_all('span', class_="mw-editsection"): 
            edit_link.decompose()

        # Replace tags with spaces directly in the HTML
        html_with_spaces = re.sub(r'<[^>]+>', ' ', str(para))

        # Now create a new soup from the modified HTML
        new_soup = BeautifulSoup(html_with_spaces, 'html.parser')

        # Extract text
        text = new_soup.get_text(strip=True, separator=' ')
        if text:
            text_content += text + '\n'
    
    return text_content

    
# Main function to update DataFrame with text content
def update_df_with_text_content(df, csv_filename):
    texts = []
    for title in df['title']:
        url = construct_url_RationalWiki(title)
        if url is not None:
            text = extract_text_rationalwiki(url)
        else:
            text = "Invalid URL or title"  # Handle the case where URL is None
        texts.append(text)
        #time.sleep(1)  # Respectful crawl delay
    df['text_content'] = texts

    # Save the updated DataFrame to a CSV file
    df.to_csv(csv_filename, index=False)
    return df


In [58]:
#WARNING - This cell takes a long time to run
# Create the dataframe for rationalwiki with the plain text of articles.
df_rationalwiki_articles = update_df_with_text_content(df_rationalwiki_titles,'600_rationalwiki_common_articles.csv')

#if you want to use the DF delete the comment on the next line :
#df_rationalwiki_articles = pd.read_csv('./600_rationalwiki_common_articles.csv')

print(df_rationalwiki_articles) 

  new_soup = BeautifulSoup(html_with_spaces, 'html.parser')


                   title                                       text_content
0                  Nepal  Nepal\nThe  Federal Democratic Republic of Nep...
1    Luminiferous aether  Luminiferous aether\nEther , or æther, was  th...
2                Chicago  Fun:Chicago\nChicago , of the oblast of  Illin...
3           Lyme disease  Lyme disease\nLyme disease   is an infectious ...
4                 Humour  Fun:Humeur\nHumeur  can refer to:\nHumeur, esp...
..                   ...                                                ...
608              Bahrain  Bahrain\nThe  Kingdom of Bahrain  is a tiny is...
609                Flood  Global flood\nThe global flood  is a (fairly s...
610              Neptune  Neptune\nNeptune  could use some help. Please ...
611             Hinduism  Hinduism\nHinduism  (not to be confused with  ...
612          Electricity  Fun:Electricity\nElectricity  is  a mystery fr...

[608 rows x 2 columns]


In [45]:
#Get articles titles on Infogalactic that are in our Wikispedia Dataset 
 ##WARNING - This cell may take a long time to run (90min) - risk of blocked IP address because of too many requests
results = [] 

for title in article_titles:
    url = construct_url_Infogalactic(title)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers, timeout = 10)
    # Store the title and the response status code
    results.append({'Article Title': title, 'Response Status': response.status_code})

# Create a DataFrame from the results
df_infogalactic_titles = pd.DataFrame(results)

# Optionally, save the DataFrame to a CSV file
df_infogalactic_titles.to_csv('infogalactic_article_status_results.csv', index=False)

print(df_infogalactic_titles)

#How many corresponding titles beetween InfoGalactic and wikispedia
df_infogalactic_titles['Response Status'].value_counts()


"\nfor title in article_titles:\n    url = construct_url_Infogalactic(title)\n    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}\n    response = requests.get(url, headers=headers, timeout = 10)\n    # Store the title and the response status code\n    results.append({'Article Title': title, 'Response Status': response.status_code})\n\n# Create a DataFrame from the results\ndf_infogalactic_titles = pd.DataFrame(results)\n\n# Optionally, save the DataFrame to a CSV file\ndf_infogalactic_titles.to_csv('infogalactic_article_status_results.csv', index=False)\n\nprint(df_infogalactic_titles)\n\n#How many corresponding titles beetween InfoGalactic and wikispedia\ndf_infogalactic_titles['Response Status'].value_counts()\n"

In [10]:

#Create the dataframe for InfoGalactic with the layout for NLP file
df_infogalactic_titles = pd.read_csv('./infogalactic_article_status_results.csv')
df_infogalactic_titles = df_infogalactic_titles[df_infogalactic_titles['Response Status'] == 200]
df_infogalactic_titles = df_infogalactic_titles.drop(columns=['Response Status'])
df_infogalactic_titles = df_infogalactic_titles.rename(columns={'Article Title': 'title'})
display(df_infogalactic_titles)

Unnamed: 0,title
0,Second Crusade
5,Tornado
7,Whooping Crane
8,Shigin
9,Hafnium
...,...
4599,Réunion
4600,Flower
4601,Banknote
4602,Weyto language


In [11]:

#Test of html extract for infogalactic 
print(article_titles[19])
url = construct_url_Infogalactic(article_titles[19])
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get(url, headers=headers, timeout=10)

#print('Response status code: {0}\n'.format(response.status_code))
#print('Response headers: {0}\n'.format(response.headers))
#print('Response body: {0}'.format(response.text))

# Decode response content and apply regex
html_content = response.content.decode('utf-8')
cleaned_html = re.sub(r'<templatestyles.*?>.*?</templatestyles>', '', html_content, flags=re.DOTALL | re.IGNORECASE)


soup = BeautifulSoup(cleaned_html, 'html.parser')

# Find and remove all <templatestyles> tags
for template_style in soup.find_all('templatestyles'):
    template_style.extract()


# Process paragraphs and headings
paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

text_output = ''
for para in paragraphs:
    for edit_link in para.find_all('span', class_="mw-editsection"): 
        edit_link.decompose()

    # Replace tags with spaces directly in the HTML
    html_with_spaces = re.sub(r'<[^>]+>', ' ', str(para))

    # Now create a new soup from the modified HTML
    new_soup = BeautifulSoup(html_with_spaces, 'html.parser')

    # Extract text
    text = new_soup.get_text(strip=True, separator=' ')
    if text:
        text_output += text + '\n'

print(text_output)





Nepal
Nepal
<templatestyles src="Module:Hatnote/styles.css"></templatestyles>
Nepal  (        i      /  n  ə  ˈ  p  ɔː  l  /   ;  [8]    Nepali :  नेपाल    [neˈpal]    (     listen ) ), officially the  Federal Democratic Republic of Nepal ,  [9]   is a  landlocked   country  located in  South Asia . With an area of 147,181 square kilometres (56,827 sq mi) and a population of approximately 27 million,  [2]   Nepal is the world's  93rd largest  country by area  [10]   and the  41st most populous  country. It is located in the  Himalayas  and bordered to the north by  China  and to the south, east, and west by  India . Nepal is separated from  Bangladesh  by the narrow Indian  Siliguri Corridor  and from  Bhutan  by the Indian state of  Sikkim .  Kathmandu  is the nation's  capital city  and largest metropolis.
The mountainous north of Nepal has eight of the world's ten  tallest mountains , including the highest point on Earth,  Mount Everest  ( Nepali :  सगरमाथा   Sagarmāthā ). More than

  new_soup = BeautifulSoup(html_with_spaces, 'html.parser')


In [14]:
# Remove duplicates from both DataFrames (df_infogalactic_titles, df_rationalwiki_articles)
df_infogalactic_titles = df_infogalactic_titles.drop_duplicates(subset='title')
df_rationalwiki_titles = df_rationalwiki_titles.drop_duplicates(subset='title')

# Perform the merge in order to have the articles titles that appears on rationalwiki, infogalactic and wikispeedia
common_titles_df = pd.merge(df_infogalactic_titles, df_rationalwiki_articles, on='title', how='inner')

#common_titles_df = common_titles_df.drop(columns=['text_content'])
# Display the DataFrames and the length of the merged DataFrame
display(common_titles_df)
print(len(common_titles_df.title.unique()))



Unnamed: 0,title
0,Nepal
1,Luminiferous aether
2,Chicago
3,Lyme disease
4,Humour
...,...
608,Bahrain
609,Flood
610,Neptune
611,Hinduism


608


In [36]:

#Functions to extract the plain text of the website infogalatic for every page that are present
# in both Infolgalactic and Wikispedia dataset

def extract_text_infogalactic(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        return "Error: Unable to fetch article"
    
    html_content = response.content.decode('utf-8')
    cleaned_html = re.sub(r'<templatestyles.*?>.*?</templatestyles>', '', html_content, flags=re.DOTALL | re.IGNORECASE)

    soup = BeautifulSoup(cleaned_html, 'html.parser')

    # Find and remove all <templatestyles> tags
    for template_style in soup.find_all('templatestyles'):
        template_style.extract()
    

    paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    text_content = ''
    for para in paragraphs:
        for edit_link in para.find_all('span', class_="mw-editsection"): 
            edit_link.decompose()

        # Replace tags with spaces directly in the HTML
        html_with_spaces = re.sub(r'<[^>]+>', ' ', str(para))

        # Now create a new soup from the modified HTML
        new_soup = BeautifulSoup(html_with_spaces, 'html.parser')

        # Extract text
        text = new_soup.get_text(strip=True, separator=' ')
        if text:
            text_content += text + '\n'
    
    return text_content

    
# Main function to update DataFrame with text content
def update_df_with_text_content_Infogalactic(df, csv_filename):
    texts = []
    for title in df['title']:
        url = construct_url_Infogalactic(title)
        if url is not None:
            text = extract_text_infogalactic(url)
        else:
            text = "Invalid URL or title"  # Handle the case where URL is None
        texts.append(text)
        #time.sleep(1)  # Respectful crawl delay
    df['text_content'] = texts

    # Save the updated DataFrame to a CSV file
    df.to_csv(csv_filename, index=False)
    return df




In [37]:
##WARNING this cell might take a long time to run
#Scrap the articles on infogalactic that are both present in wikispeedia and rationalwiki
df_infogalactic_articles = update_df_with_text_content_Infogalactic(common_titles_df, '600_infogalactic_common_articles.csv')

#if you want to use the DF delete the comment on the next line :
#df_infogalactic_articles = pd.read_csv('scraped_data/infogalactic_articles.csv')

display(df_infogalactic_articles)

  new_soup = BeautifulSoup(html_with_spaces, 'html.parser')


Unnamed: 0,title,text_content
0,Nepal,"Nepal\n<templatestyles src=""Module:Hatnote/sty..."
1,Luminiferous aether,"Luminiferous aether\nIn the late 19th century,..."
2,Chicago,"Chicago\n<templatestyles src=""Module:Hatnote/s..."
3,Lyme disease,"Lyme disease\nLyme disease , also known as Ly..."
4,Humour,"Humour\n<templatestyles src=""Module:Hatnote/st..."
...,...,...
608,Bahrain,"Bahrain\n<templatestyles src=""Module:Hatnote/s..."
609,Flood,"Flood\n<templatestyles src=""Module:Hatnote/sty..."
610,Neptune,"Neptune\n<templatestyles src=""Module:Hatnote/s..."
611,Hinduism,Hinduism\nLua error in package.lua at line 80:...


In [38]:
# Function to get a random RationalWiki article title
def get_random_rationalwiki_title():
    response = requests.get('https://rationalwiki.org/wiki/Special:Random')
    soup = BeautifulSoup(response.content, 'html.parser')
    
    if soup.title and soup.title.string:
        return soup.title.string.split(' - RationalWiki')[0]
    else:
        return None

# Function to check if a title exists on InfoGalactic
def check_infogalactic(title):
    url = construct_url_Infogalactic(title)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Check if response is successful and title is in the response text
        is_successful = response.status_code == 200 and title in response.text
        return is_successful, is_successful
    except requests.exceptions.RequestException as e:
        return False, str(e)


In [2]:
#WARNING - This cell takes a long time to run - It finds 3000 articles that are in both RationalWiki and InfoGalactic but not in wikispeedia 
#this will allow us to train our model.

# Initialize an empty DataFrame
training_articles2 = pd.read_csv('training_articles_titles.csv')

# Loop until collection of 3000 unique articles
while len(training_articles2) < 3000:
    random_title = get_random_rationalwiki_title()

    # Check conditions
    success, message = check_infogalactic(random_title)
    if random_title not in article_titles and random_title not in training_articles2['Title'].values and success:
        # Add to DataFrame if conditions are met
        training_articles2.loc[len(training_articles2)] = [random_title]

    # Save to CSV file after every 100 additions
    if len(training_articles2) % 100 == 0:
        training_articles2.to_csv('training_articles_titles.csv', index=False)

print("Collection complete. Final list saved to 'training_articles_titles.csv'.")


'\n# Initialize an empty DataFrame\ntraining_articles2 = pd.read_csv(\'training_articles2.csv\')\n\n# Loop until collection of 3000 unique articles\nwhile len(training_articles2) < 3000:\n    random_title = get_random_rationalwiki_title()\n\n    # Check conditions\n    success, message = check_infogalactic(random_title)\n    if random_title not in article_titles and random_title not in training_articles2[\'Title\'].values and success:\n        # Add to DataFrame if conditions are met\n        training_articles2.loc[len(training_articles2)] = [random_title]\n\n    # Save to CSV file after every 100 additions\n    if len(training_articles2) % 100 == 0:\n        training_articles2.to_csv(\'training_articles2.csv\', index=False)\n\nprint("Collection complete. Final list saved to \'training_articles2.csv\'.")'

In [40]:
training_articles = pd.read_csv('./training_articles_titles.csv')
training_articles


Unnamed: 0,Title
0,Electroconvulsive therapy
1,Juche
2,Special pleading
3,Chrislam
4,Black box
...,...
2995,Four-term fallacy
2996,Nosode
2997,Very special episode
2998,The Man in the High Castle


In [41]:
#WARNING - Cell take a long time to run - It will extract the plain text of the 3000 articles that are in both RationalWiki and InfoGalactic but not in wikispeedia
training_articles = training_articles.rename(columns={'Title': 'title'})
df_infogalactic_training_articles = update_df_with_text_content_Infogalactic(training_articles, '3000_infogalactic_training_articles.csv')
df_rationalwiki_training_articles = update_df_with_text_content(training_articles, '3000_rationalwiki_training_articles.csv')

  new_soup = BeautifulSoup(html_with_spaces, 'html.parser')
  new_soup = BeautifulSoup(html_with_spaces, 'html.parser')


In [43]:
df_rationalwiki_training_articles = pd.read_csv('./3000_rationalwiki_training_articles.csv')
df_infogalactic_training_articles = pd.read_csv('./3000_infogalactic_training_articles.csv')
error_count1 = df_rationalwiki_training_articles['text_content'].value_counts().get('Error: Unable to fetch article', 0)
error_count2 = df_infogalactic_training_articles['text_content'].value_counts().get('Error: Unable to fetch article', 0)

print(f"Error count for RationalWiki: {error_count1}") 
print(f"Error count for InfoGalactic: {error_count2}")


Error count for RationalWiki: 782
Error count for InfoGalactic: 0


In [44]:
df_rationalwiki_training_articles = df_rationalwiki_training_articles[df_rationalwiki_training_articles['text_content'] != 'Error: Unable to fetch article']
error_count1 = df_rationalwiki_training_articles['text_content'].value_counts().get('Error: Unable to fetch article', 0)
error_count2 = df_infogalactic_training_articles['text_content'].value_counts().get('Error: Unable to fetch article', 0)

df_rationalwiki_training_articles.to_csv('./3000_rationalwiki_training_articles.csv')

print(f"Error count for RationalWiki: {error_count1}")
print(f"Error count for InfoGalactic: {error_count2}")

Error count for RationalWiki: 0
Error count for InfoGalactic: 0


In [30]:
import csv
import sys

# Increase the maximum field size limit
csv.field_size_limit(sys.maxsize)

# Text patterns to remove
patterns_to_remove = [
    '<templatestyles src="Module:Hatnote/styles.css"></templatestyles>',
    '<templatestyles src="Template:Blockquote/styles.css"/>',
    '<templatestyles src=""Template:Blockquote/styles.css""/>',
    ' <templatestyles src=""Template:Blockquote/styles.css"" /> ', 
    '<templatestyles src="Div col/styles.css"/>',
    '<templatestyles src="Template:TOC limit/styles.css"/>',
    '<templatestyles src=""Template:TOC limit/styles.css"" />',
    '<templatestyles src=""Template:TOC limit/styles.css"" />', 
    '<templatestyles src=""Template:TOC limit/styles.css"" />', 
    '<templatestyles src=""Template:Blockquote/styles.css"" />', 
    '<templatestyles src=""Template:TOC limit/styles.css"" />',
    '<templatestyles src=""Noitalic/styles.css""/>',
    '"<templatestyles src=""""Template:Quote_box/styles.css"""" />"', 
    '"<templatestyles src=""""Template:Quote_box/styles.css"""" />"',
    '<templatestyles src=""""Sfrac/styles.css"""" />',
    '<templatestyles src=""""Sfrac/styles.css"""" />',
    'templatestyles src=""""Sfrac/styles.css""""', 
    '<templatestyles src=""""Script/styles_hebrew.css"""" />'

   # Unescaped double quotes
]


input_file_path = './11_clean_600_infogalactic_common_articles.csv'  # Replace with your CSV file path
output_file_path = './12_clean_600_infogalactic_common_articles.csv'  # Replace with your desired output file path

with open(input_file_path, mode='r', encoding='utf-8') as infile, \
     open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for row in reader:
        modified_row = row
        for pattern in patterns_to_remove:
            # Replace each pattern with a line break in each row
            modified_row = [cell.replace(pattern, '\n') for cell in modified_row]
        writer.writerow(modified_row)

print("CSV file has been processed and saved.")


CSV file has been processed and saved.


In [32]:
import csv
import sys
import re

csv.field_size_limit(sys.maxsize)

# Regular expression pattern for matching the block of text
pattern_to_remove = re.compile(r'See also.*?Tools', re.DOTALL)

input_file_path = './12_clean_600_infogalactic_common_articles.csv'
output_file_path = './13_clean_600_infogalactic_common_articles.csv'

with open(input_file_path, mode='r', encoding='utf-8') as infile, \
     open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for row in reader:
        modified_row = [re.sub(pattern_to_remove, '', cell) for cell in row]
        writer.writerow(modified_row)

print("CSV file has been processed and saved.")




CSV file has been processed and saved.
