In [2]:
import requests
from bs4 import BeautifulSoup
import csv

base_url = "https://cplusplus.com/articles/?page="
base_domain = "https://cplusplus.com"

items = []
page_no = 1

# Step 1: Scrape main articles page to find subsection links (like Algorithms)
subsection_links = []

while True:
    response = requests.get(base_url + str(page_no))

    if response.status_code == 404:
        print(f"Page {page_no} does not exist. Ending scraping.")
        break
    elif response.status_code != 200:
        raise Exception(f"Failed to load page no {page_no}: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all subsection links (e.g., "Algorithms")
    subsections = soup.find_all('div', class_='sect')  # Adjust class if necessary
    for subsection in subsections:
        links = subsection.find_all('a')
        for link in links:
            subsection_links.append(base_domain + link['href'])

    # Process articles on this page
    s_items = soup.find_all('td', class_='elem')

    if not s_items:
        break

    for s_item in s_items:
        anchor = s_item.find('a')
        
        if anchor:
            title = anchor.find('span', class_='title').text.strip()
            link = anchor['href'].strip()
            if link.startswith('/'):
                link = base_domain + link
            items.append({'TITLE': title, 'URL': link, 'SECTION': 'articles'})

    print(f"Page {page_no} processed with {len(s_items)} items.")
    page_no += 1

# Step 2: Scrape each subsection link
for link in subsection_links:
    page_no = 1
    while True:
        response = requests.get(link + f"?page={page_no}")
        
        if response.status_code == 404:
            print(f"Page {page_no} in subsection {link} does not exist. Moving to next subsection.")
            break
        elif response.status_code != 200:
            raise Exception(f"Failed to load page no {page_no} in subsection {link}: {response.status_code}")
        
        soup = BeautifulSoup(response.content, 'html.parser')
        s_items = soup.find_all('td', class_='elem')

        if not s_items:
            break

        for s_item in s_items:
            anchor = s_item.find('a')
            
            if anchor:
                title = anchor.find('span', class_='title').text.strip()
                article_link = anchor['href'].strip()
                if article_link.startswith('/'):
                    article_link = base_domain + article_link
                items.append({'TITLE': title, 'URL': article_link, 'SECTION': 'subsection'})

        print(f"Page {page_no} in subsection {link} processed with {len(s_items)} items.")
        page_no += 1

# Write all collected items to CSV
csv_name = 'cplusplus_articles.csv'
with open(csv_name, mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['TITLE', 'URL', 'SECTION']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for ite in items:
        writer.writerow({'TITLE': ite['TITLE'], 'URL': ite['URL'], 'SECTION': ite['SECTION']})

print(f"Data has been written to {csv_name}. Total articles retrieved: {len(items)}")


Page 1 processed with 20 items.
Page 2 processed with 20 items.
Page 3 processed with 20 items.
Page 4 processed with 20 items.
Page 5 processed with 20 items.
Page 6 processed with 20 items.
Page 7 processed with 20 items.
Page 8 processed with 20 items.
Page 9 processed with 20 items.
Page 10 processed with 20 items.
Page 11 processed with 10 items.
Page 12 does not exist. Ending scraping.
Page 1 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 2 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 3 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 4 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 5 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 6 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 7 in subsection https://cplusplus.com/articles/ processed with 20 items.
Page 8 in subsection https://cplusplus.com/articles/ proces

In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk

# Download the punkt tokenizer for word tokenization
nltk.download('punkt')

# Step 1: Load the CSV file into a DataFrame
input_csv = 'cplusplus_articles.csv'  # Replace with your CSV file path
df = pd.read_csv(input_csv)

# Step 2: Tokenize the text in a specific column
# Assuming you want to tokenize the 'TITLE' column
df['TOKENIZED_TITLE'] = df['TITLE'].apply(word_tokenize)

# Step 3: Save the tokenized data to a new CSV file
output_csv = 'tokenized_cplusplus_articles.csv'
df.to_csv(output_csv, index=False)

print(f"Tokenization complete. Tokenized data saved to {output_csv}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenization complete. Tokenized data saved to tokenized_cplusplus_articles.csv.


In [16]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
import string
import contractions

# Download the Punkt tokenizer for word tokenization
nltk.download('punkt')

# Function to normalize text
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Join tokens back to string (if you want to keep it tokenized, you can skip this step)
    normalized_text = ' '.join(tokens)
    
    return normalized_text

# Step 1: Load the CSV file into a DataFrame
input_csv = 'cplusplus_articles.csv'  # Replace with your CSV file path
df = pd.read_csv(input_csv)

# Step 2: Apply normalization to the 'TITLE' column
df['NORMALIZED_TITLE'] = df['TITLE'].apply(normalize_text)

# Step 3: Save the normalized data to a new CSV file
output_csv = 'normalized_cplusplus_articles.csv'
df.to_csv(output_csv, index=False)

print(f"Text normalization complete. Normalized data saved to {output_csv}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Text normalization complete. Normalized data saved to normalized_cplusplus_articles.csv.


In [18]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
import contractions

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to normalize text
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Join tokens back to string (if you want to keep it tokenized, you can skip this step)
    normalized_text = ' '.join(tokens)
    
    return normalized_text

# Function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))  # Use a set for faster lookup
    tokens = word_tokenize(text)  # Tokenize the text again if needed
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Filter out stop words
    filtered_text = ' '.join(filtered_tokens)  # Join filtered tokens back into a string
    return filtered_text

# Step 1: Load the CSV file into a DataFrame
input_csv = 'normalized_cplusplus_articles.csv'  # Your normalized CSV file
df = pd.read_csv(input_csv)

# Step 2: Remove stop words from the 'NORMALIZED_TITLE' column
df['TITLE_NO_STOPWORDS'] = df['NORMALIZED_TITLE'].apply(remove_stop_words)

# Step 3: Save the data with stop words removed to a new CSV file
output_csv = 'no_stopwords_cplusplus_articles.csv'
df.to_csv(output_csv, index=False)

print(f"Stop words removal complete. Data saved to {output_csv}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Stop words removal complete. Data saved to no_stopwords_cplusplus_articles.csv.


In [9]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import string
import contractions

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to normalize text
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Join tokens back to string (if you want to keep it tokenized, you can skip this step)
    normalized_text = ' '.join(tokens)
    
    return normalized_text

# Function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))  # Use a set for faster lookup
    tokens = word_tokenize(text)  # Tokenize the text again if needed
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Filter out stop words
    filtered_text = ' '.join(filtered_tokens)  # Join filtered tokens back into a string
    return filtered_text

# Function for stemming using Porter Stemmer algorithm
def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

# Function for lemmatization using WordNet
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

# Step 1: Load the CSV file into a DataFrame
input_csv = 'cplusplus_articles.csv'  # Use the file after normalization and stop word removal
df = pd.read_csv(input_csv)

# Step 2: Apply stop word removal to the 'NORMALIZED_TITLE' column
df['TITLE_NO_STOPWORDS'] = df['TITLE'].apply(remove_stop_words)

# Step 3: Apply stemming and lemmatization to the 'TITLE_NO_STOPWORDS' column
df['STEMMED_TITLE'] = df['TITLE_NO_STOPWORDS'].apply(stem_text)
df['LEMMATIZED_TITLE'] = df['TITLE_NO_STOPWORDS'].apply(lemmatize_text)

# Step 4: Save the processed data to a new CSV file
output_csv = 'new_processed_cplusplus_articles.csv'
df.to_csv(output_csv, index=False)

print(f"Text processing complete. Data saved to {output_csv}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Text processing complete. Data saved to new_processed_cplusplus_articles.csv.


In [7]:

from nltk.tokenize import word_tokenize
a="i am running"
t=word_tokenize(a)
print(t)

['i', 'am', 'running']


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import string
import contractions

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to normalize text
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Join tokens back to string (if you want to keep it tokenized, you can skip this step)
    normalized_text = ' '.join(tokens)
    
    return normalized_text

# Function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))  # Use a set for faster lookup
    tokens = word_tokenize(text)  # Tokenize the text again if needed
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Filter out stop words
    filtered_text = ' '.join(filtered_tokens)  # Join filtered tokens back into a string
    return filtered_text

# Function for stemming using Porter Stemmer algorithm
def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

# Function for lemmatization using WordNet
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

# Step 1: Load the CSV file into a DataFrame
input_csv = 'cplusplus_articles.csv'  # Your initial CSV file
df = pd.read_csv(input_csv)

# Step 2: Apply normalization to the 'TITLE' column
df['NORMALIZED_TITLE'] = df['TITLE'].apply(normalize_text)

# Step 3: Remove stop words from the 'NORMALIZED_TITLE' column
df['TITLE_NO_STOPWORDS'] = df['NORMALIZED_TITLE'].apply(remove_stop_words)

# Step 4: Apply stemming and lemmatization to the 'TITLE_NO_STOPWORDS' column
df['STEMMED_TITLE'] = df['TITLE_NO_STOPWORDS'].apply(stem_text)
df['LEMMATIZED_TITLE'] = df['TITLE_NO_STOPWORDS'].apply(lemmatize_text)

# Step 5: Save the processed data to a new CSV file
output_csv = 'update_processed_cplusplus_articles.csv'
df.to_csv(output_csv, index=False)

print(f"Text processing complete. Data saved to {output_csv}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Text processing complete. Data saved to update_processed_cplusplus_articles.csv.


In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import string
import contractions

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to normalize text
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Define a translation table that removes punctuation except '+'
    translator = str.maketrans('', '', string.punctuation.replace('+', ''))
    # Remove punctuation except '+'
    text = text.translate(translator)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Join tokens back to string (if you want to keep it tokenized, you can skip this step)
    normalized_text = ' '.join(tokens)
    
    return normalized_text

# Function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))  # Use a set for faster lookup
    tokens = word_tokenize(text)  # Tokenize the text again if needed
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Filter out stop words
    filtered_text = ' '.join(filtered_tokens)  # Join filtered tokens back into a string
    return filtered_text

# Function for stemming using Porter Stemmer algorithm
def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

# Function for lemmatization using WordNet
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

# Step 1: Load the CSV file into a DataFrame
input_csv = 'cplusplus_articles.csv'  # Your initial CSV file
df = pd.read_csv(input_csv)

# Step 2: Apply normalization to the 'TITLE' column
df['NORMALIZED_TITLE'] = df['TITLE'].apply(normalize_text)

# Step 3: Remove stop words from the 'NORMALIZED_TITLE' column
df['TITLE_NO_STOPWORDS'] = df['NORMALIZED_TITLE'].apply(remove_stop_words)

# Step 4: Apply stemming and lemmatization to the 'TITLE_NO_STOPWORDS' column
df['STEMMED_TITLE'] = df['TITLE_NO_STOPWORDS'].apply(stem_text)
df['LEMMATIZED_TITLE'] = df['TITLE_NO_STOPWORDS'].apply(lemmatize_text)

# Step 5: Save the processed data to a new CSV file
output_csv = 'pro_processed_cplusplus_articles.csv'
df.to_csv(output_csv, index=False)

print(f"Text processing complete. Data saved to {output_csv}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Text processing complete. Data saved to pro_processed_cplusplus_articles.csv.


In [15]:
import nltk
from nltk.corpus import wordnet

# Download required resources
nltk.download('wordnet')
nltk.download('omw-1.4')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

def get_antonyms(word):
    antonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return antonyms

def get_hypernyms(word):
    hypernyms = set()
    for syn in wordnet.synsets(word):
        for hypernym in syn.hypernyms():
            hypernyms.add(hypernym.name().split('.')[0])  # Get the name of the hypernym
    return hypernyms

# Example words
words = ['c++', 'interpreter', 'token','program']

# Create ontology
ontology = {}
for word in words:
    ontology[word] = {
        'synonyms': list(get_synonyms(word)),
        'antonyms': list(get_antonyms(word)),
        'hypernyms': list(get_hypernyms(word))
    }

print("Ontology Created:")
for word, relationships in ontology.items():
    print(f"\nWord: {word}")
    print(f"  Synonyms: {relationships['synonyms']}")
    print(f"  Antonyms: {relationships['antonyms']}")
    print(f"  Hypernyms: {relationships['hypernyms']}")


Ontology Created:

Word: c++
  Synonyms: []
  Antonyms: []
  Hypernyms: []

Word: interpreter
  Synonyms: ['interpretive_program', 'spokesperson', 'translator', 'representative', 'interpreter', 'voice']
  Antonyms: []
  Hypernyms: ['program', 'advocate', 'person', 'mediator']

Word: token
  Synonyms: ['tokenish', 'relic', 'souvenir', 'token', 'keepsake', 'nominal', 'item']
  Antonyms: []
  Hypernyms: ['sign', 'disk', 'object', 'symbol']

Word: program
  Synonyms: ['curriculum', 'political_program', 'plan', 'platform', 'political_platform', 'computer_programme', 'syllabus', 'course_of_study', 'computer_program', 'program', 'broadcast', 'programme']
  Antonyms: []
  Hypernyms: ['system', 'software', 'announcement', 'information', 'schedule', 'document', 'show', 'create_by_mental_act', 'performance', 'idea']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dines\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
len(df['TITLE'])


9351

In [10]:
import pandas as pd
from nltk.tokenize import word_tokenize

input_csv='pro_processed_cplusplus_articles.csv'
df=pd.read_csv(input_csv)
tokens=[]

for index,row in df.iterrows():
    words=row['TITLE'].lower().split()
    for word in words:
        if word not in tokens:
            tokens.append(word)
    

print(tokens)


    

['how', 'to', 'avoid', 'bugs', 'using', 'modern', 'c++', 'learning', 'computer', 'programming', 'terminology', 'class', 'for', 'generate', 'fibonacci', 'series', 'casting', 'safe', 'clearing', 'of', 'private', 'data', 'i', 'learned', 'a', 'vital', 'borland', 'coding', 'technique', "couldn't", 'learn', 'alone', 'sierpinski', 'triangle', 'fractal', '-', 'the', 'easiest', 'way', 'produce', 'randomness', 'koch', 'one', 'algorithms', 'with', 'graphics', '10', 'tips', 'be', 'productive', 'in', 'clion,', 'cross-platform', 'c/c++', 'ide', 'fibonacii', 'at', 'its', 'best', 'null', 'pointer', 'dereferencing', 'causes', 'undefined', 'behavior', 'android', 'kernel:', 'lacking', 'modularity', 'design', 'pattern', 'state,', 'simple', 'problem', 'semaphore', 'opengl', 'animation', 'glfw,', 'step', 'by', 'split', 'string', 'declarations,', 'prototypes,', 'definitions,', 'and', 'implementations', 'finding', "skype's", 'default', 'account', 'name', 'virtual', 'method', 'table', 'accident', 'prevention',

In [12]:
len(tokens)

609

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Load the CSV file
input_csv = 'pro_processed_cplusplus_articles.csv'
df = pd.read_csv(input_csv)

# Assume the text data is in a column named 'corpus'
corpus = df['TITLE'].tolist()




In [21]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [51]:
import pandas as pd
from collections import Counter

# Load the processed data
input_csv = 'pro_processed_cplusplus_articles.csv'
df = pd.read_csv(input_csv)

# Extract tokens from the 'TITLE_NO_STOPWORDS' column
tokens = []
for index, row in df.iterrows():
    words = row['TITLE_NO_STOPWORDS'].lower().split()
    for word in words:
        if word not in tokens:
            tokens.append(word)

# Calculate token frequencies
token_frequencies = Counter()

# Iterate over each title to count token frequencies
for index, row in df.iterrows():
    title_words = row['TITLE_NO_STOPWORDS'].lower().split()
    frequencies = Counter(title_words)
    token_frequencies.update(frequencies)

# Convert token frequencies to a DataFrame
token_freq_df = pd.DataFrame(token_frequencies.items(), columns=['Token', 'Frequency'])

# Save Token Frequencies to CSV
token_freq_csv = 'token_frequencies.csv'
token_freq_df.to_csv(token_freq_csv, index=False)

print(f"Token frequencies created and saved to {token_freq_csv}.")


Token frequencies created and saved to token_frequencies.csv.


In [49]:
import pandas as pd

# Load the processed data
input_csv = 'pro_processed_cplusplus_articles.csv'
df = pd.read_csv(input_csv)

# Extract unique tokens from the 'TITLE_NO_STOPWORDS' column
tokens = []
for index, row in df.iterrows():
    words = row['TITLE_NO_STOPWORDS'].lower().split()
    for word in words:
        if word not in tokens:
            tokens.append(word)

# Create Term-Document Matrix
term_document_matrix = []
for index, row in df.iterrows():
    title_words = row['TITLE_NO_STOPWORDS'].lower().split()
    vector = [1 if token in title_words else 0 for token in tokens]
    term_document_matrix.append(vector)

# Convert Term-Document Matrix to DataFrame
tdm_df = pd.DataFrame(term_document_matrix, columns=tokens, index=[i + 1 for i in range(len(df))])

# Save Term-Document Matrix to CSV
tdm_csv = 'term_document_matrix.csv'
tdm_df.to_csv(tdm_csv)

print(f"Term-Document Matrix created and saved to {tdm_csv}.")

# Create Inverted Index
inverted_index = {token: [] for token in tokens}
for index, row in df.iterrows():
    title_id = index + 1  # Adjust for 1-based indexing
    title_words = row['TITLE_NO_STOPWORDS'].lower().split()
    for token in tokens:
        if token in title_words:
            if title_id not in inverted_index[token]:
                inverted_index[token].append(title_id)

# Save Inverted Index to CSV
inverted_index_csv = 'inverted_index.csv'
with open(inverted_index_csv, 'w') as f:
    for token, doc_ids in inverted_index.items():
        f.write(f"{token},{','.join(map(str, doc_ids))}\n")

print(f"Inverted Index created and saved to {inverted_index_csv}.")


Term-Document Matrix created and saved to term_document_matrix.csv.
Inverted Index created and saved to inverted_index.csv.


In [61]:
import pandas as pd
from collections import defaultdict

# Load the processed data
input_csv = 'pro_processed_cplusplus_articles.csv'
df = pd.read_csv(input_csv)

# Initialize the term frequency and posting list dictionary
term_frequency = defaultdict(int)
posting_list = defaultdict(list)

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Convert the sentence index from DataFrame to a 1-based index
    doc_id = index + 1
    
    # Get the tokens for the current title (assuming 'TITLE_NO_STOPWORDS' column)
    title_words = row['TITLE_NO_STOPWORDS'].lower().split()
    
    # Create a set of unique words to avoid counting the same word multiple times in a single document
    unique_words = set(title_words)
    
    # Update term frequency and posting list
    for word in unique_words:
        term_frequency[word] += 1
        posting_list[word].append(doc_id)

# Prepare the DataFrame to display term frequency and posting list
data = {
    'words': [],
    'term_frequency': [],
    'posting_list': []
}

for word in sorted(term_frequency):
    data['words'].append(word)
    data['term_frequency'].append(term_frequency[word])
    data['posting_list'].append(' -->'.join(map(str, posting_list[word])) + ' -->')

# Create the DataFrame
result_df = pd.DataFrame(data)

# Save to CSV if needed
result_csv = 'term_frequency_posting_list.csv'
result_df.to_csv(result_csv, index=False)
print(f"Term Frequency and Posting List saved to {result_csv}.")




Term Frequency and Posting List saved to term_frequency_posting_list.csv.


In [65]:
input_search = input("Enter the search string (tokens separated by space): ")
tokens_to_search = input_search.lower().split()

# Search tokens without a function
search_results = {}

# Directly search for each token in the posting list
for token in tokens_to_search:
    # Fetch the posting list for the token
    if token in posting_list:
        search_results[token] = posting_list[token]
    else:
        search_results[token] = []

# Display the search results
for token, docs in search_results.items():
    if docs:
        print(f"Token '{token}' is found in documents: {docs}")
    else:
        print(f"Token '{token}' is not found in any document.")

Enter the search string (tokens separated by space):  binary search


Token 'binary' is found in documents: [47, 90, 133, 257, 300, 343, 467, 510, 553, 655, 742, 941, 1088, 1131, 1174, 1298, 1341, 1384, 1486, 1573, 1772, 1919, 1962, 2005, 2129, 2172, 2215, 2317, 2404, 2603, 2750, 2793, 2836, 2960, 3003, 3046, 3148, 3235, 3434, 3581, 3624, 3667, 3791, 3834, 3877, 3979, 4066, 4265, 4412, 4455, 4498, 4622, 4665, 4708, 4810, 4897, 5096, 5243, 5286, 5329, 5453, 5496, 5539, 5641, 5728, 5927, 6074, 6117, 6160, 6284, 6327, 6370, 6472, 6559, 6758, 6905, 6948, 6991, 7115, 7158, 7201, 7303, 7390, 7589, 7736, 7779, 7822, 7946, 7989, 8032, 8134, 8221, 8420, 8567, 8610, 8653, 8777, 8820, 8863, 8965, 9052, 9251]
Token 'search' is found in documents: [49, 133, 259, 343, 469, 553, 655, 855, 1090, 1174, 1300, 1384, 1486, 1686, 1921, 2005, 2131, 2215, 2317, 2517, 2752, 2836, 2962, 3046, 3148, 3348, 3583, 3667, 3793, 3877, 3979, 4179, 4414, 4498, 4624, 4708, 4810, 5010, 5245, 5329, 5455, 5539, 5641, 5841, 6076, 6160, 6286, 6370, 6472, 6672, 6907, 6991, 7117, 7201, 7303, 750

In [69]:
input_sentence = input("Enter the search sentence: ")
tokens_to_search = word_tokenize(input_sentence.lower())  # Tokenize the sentence into words

# Initialize an empty dictionary to store search results
search_results = {}

# Directly search for each token in the posting list
for token in tokens_to_search:
    # Fetch the posting list for the token
    if token in posting_list:
        search_results[token] = posting_list[token]
    else:
        search_results[token] = []

# Display the search results
for token, docs in search_results.items():
    if docs:
        print(f"Token '{token}' is found in documents: {docs}")
    else:
        print(f"Token '{token}' is not found in any document.")

Enter the search sentence:  dinesh is trying to learn c++ and java and mastered in pointers


Token 'dinesh' is not found in any document.
Token 'is' is not found in any document.
Token 'trying' is not found in any document.
Token 'to' is not found in any document.
Token 'learn' is found in documents: [6, 83, 120, 121, 216, 293, 330, 331, 426, 503, 540, 541, 708, 757, 758, 836, 925, 991, 1015, 1047, 1124, 1161, 1162, 1257, 1334, 1371, 1372, 1539, 1588, 1589, 1667, 1756, 1822, 1846, 1878, 1955, 1992, 1993, 2088, 2165, 2202, 2203, 2370, 2419, 2420, 2498, 2587, 2653, 2677, 2709, 2786, 2823, 2824, 2919, 2996, 3033, 3034, 3201, 3250, 3251, 3329, 3418, 3484, 3508, 3540, 3617, 3654, 3655, 3750, 3827, 3864, 3865, 4032, 4081, 4082, 4160, 4249, 4315, 4339, 4371, 4448, 4485, 4486, 4581, 4658, 4695, 4696, 4863, 4912, 4913, 4991, 5080, 5146, 5170, 5202, 5279, 5316, 5317, 5412, 5489, 5526, 5527, 5694, 5743, 5744, 5822, 5911, 5977, 6001, 6033, 6110, 6147, 6148, 6243, 6320, 6357, 6358, 6525, 6574, 6575, 6653, 6742, 6808, 6832, 6864, 6941, 6978, 6979, 7074, 7151, 7188, 7189, 7356, 7405, 7406, 7