# **Q1 Data Preprocessing**

### **a. Lowercase the Text**

In [None]:
import os

def lowercase_file(file_path, output_path):
    """Write a lowercased copy of the text in ``file_path`` to ``output_path``.

    Both files are opened in text mode without an explicit encoding, so the
    platform default applies. ``output_path`` is created or overwritten.
    """
    with open(file_path, 'r') as source:
        original_text = source.read()

    lowered_text = original_text.lower()

    with open(output_path, 'w') as target:
        target.write(lowered_text)

def print_file_contents(file_path):
    """Print the whole text of ``file_path`` under a header line naming the file.

    The output ends with a blank line so consecutive files are separated.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    print(f"Contents of file {file_path}:\n{contents}\n")

# Folder paths: source text files and the destination for their lowercased copies
input_folder = '/content/drive/MyDrive/Colab Notebooks/text_files'
output_folder = '/content/preprocessed_text_files/lowercase'

# Ensure the output folder exists, create it if necessary
os.makedirs(output_folder, exist_ok=True)

# Get a list of all entries in the input folder
files = os.listdir(input_folder)

# Counter tracking how many files have been printed so far
count=0

# Process every entry; each output file keeps the name of its input file
for file_name in files:
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Lowercase the text and save the file
    lowercase_file(input_file_path, output_file_path)

    # Only the first 5 files are printed; every file is still processed above
    if(count<5):
      # Print contents of the original and lowercase files
      print_file_contents(input_file_path)
      print_file_contents(output_file_path)
      count+=1

Contents of file /content/drive/MyDrive/Colab Notebooks/text_files/file890.txt:
My 3rd Joyo Pedal, I'm falling in love with that company, solid great sounding pedals for a fraction of other brands.
Be advised, the effect of this pedal is very subtle... I use it with my mustang V, which already models other amps, this pedal just makes it sound way more realistic and adds some dynamics to your playing.
 Totally worth it for anybody that wants to improve their tone... You have to have a good ear though.
I've read some reviews of people hooking this pedal before their amp input... since this pedal has it's own pre amp, this way of hooking it up will produce some noise.
 Connect it to your FX send return.
I also own the American sound .... both great... depending on my mood I play them both equally... You just can't go wrong with Joyo

Contents of file /content/preprocessed_text_files/lowercase/file890.txt:
my 3rd joyo pedal, i'm falling in love with that company, solid great sounding pedal

### **b. Perform Tokenization**

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize

# Download the punkt tokenizer if not already downloaded
nltk.download('punkt')

def tokenize_file(file_path, output_path):
    """Tokenize the text of ``file_path`` and save the tokens to ``output_path``.

    The text is split with NLTK's ``word_tokenize`` and the resulting tokens
    are written as a single line, separated by single spaces.
    ``output_path`` is created or overwritten.
    """
    with open(file_path, 'r') as source:
        raw_text = source.read()

    token_line = " ".join(word_tokenize(raw_text))

    with open(output_path, 'w') as target:
        target.write(token_line)

# Folder paths: this step reads the lowercased files and writes tokenized copies
input_folder = '/content/preprocessed_text_files/lowercase'
output_folder = '/content/preprocessed_text_files/tokenized'

# Ensure the output folder exists, create it if necessary
os.makedirs(output_folder, exist_ok=True)

# Get a list of all entries in the input folder
files = os.listdir(input_folder)

# Counter tracking how many files have been printed so far
count=0

# Process all files in the input folder; output keeps the input file name
for file_name in files:
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Tokenize the text and save the file
    tokenize_file(input_file_path, output_file_path)

    # Print the input and tokenized contents for the first 5 files only
    # (print_file_contents is defined in the lowercase cell above)
    if(count<5):
      print_file_contents(input_file_path)
      print_file_contents(output_file_path)
      count+=1

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Contents of file /content/preprocessed_text_files/lowercase/file571.txt:
i have a garage sale piece of junk mij hollowbody that i've used amazon to build for cheap.  it's been a blast and this fit the bill perfectly.

i cannot tell you how surprised i am at how good this sounds with this guitar.  it has inspired me to play open tuning slide because the tone through a simple boss distortion pedal is so cool.

i would absolutely purchase this again for another cheapo build!

if you're looking at this, buy it.  don't think twice.

Contents of file /content/preprocessed_text_files/tokenized/file571.txt:
i have a garage sale piece of junk mij hollowbody that i 've used amazon to build for cheap . it 's been a blast and this fit the bill perfectly . i can not tell you how surprised i am at how good this sounds with this guitar . it has inspired me to play open tuning slide because the tone through a simple boss distortion pedal is so cool . i would absolutely purchase this again for another 

### **c. Remove Stopwords**

In [None]:
import os
import nltk
from nltk.corpus import stopwords

# Download the stopwords if not already downloaded
nltk.download('stopwords')

def remove_stopwords(tokens):
    """Return the tokens that are not English stopwords.

    Matching is case-insensitive: each token is lowercased before it is
    compared against NLTK's English stopword list. The surviving tokens keep
    their original casing and order. The input list is not modified and
    nothing is written to disk.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list containing the non-stopword tokens.
    """
    # Fetch the stopword list once and keep it in a set. The previous version
    # called stopwords.words('english') again for every token and searched
    # the returned list linearly each time.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in english_stopwords]

# Folder paths: this step reads the tokenized files and writes copies without stopwords
input_folder = '/content/preprocessed_text_files/tokenized'
output_folder = '/content/preprocessed_text_files/without_stopwords'

# Ensure the output folder exists, create it if necessary
os.makedirs(output_folder, exist_ok=True)

# Get a list of all entries in the input folder
files = os.listdir(input_folder)

# Counter tracking how many files have been printed so far
count=0

# Process all files in the input folder; output keeps the input file name
for file_name in files:
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Read the tokenized text from the file and turn it back into a token list.
    # NOTE(review): the input is already tokenized and space-joined, so this
    # runs the NLTK tokenizer a second time - confirm that is intended.
    with open(input_file_path, 'r') as f:
        tokens = nltk.word_tokenize(f.read())

    # Remove stopwords from the list of tokens
    tokens = remove_stopwords(tokens)

    # Save the remaining tokens, separated by single spaces
    with open(output_file_path, 'w') as f:
        f.write(" ".join(tokens))

    # Print the input and filtered contents for the first 5 files only
    if(count<5):
     print_file_contents(input_file_path)
     print_file_contents(output_file_path)
     count+=1

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Contents of file /content/preprocessed_text_files/tokenized/file571.txt:
i have a garage sale piece of junk mij hollowbody that i 've used amazon to build for cheap . it 's been a blast and this fit the bill perfectly . i can not tell you how surprised i am at how good this sounds with this guitar . it has inspired me to play open tuning slide because the tone through a simple boss distortion pedal is so cool . i would absolutely purchase this again for another cheapo build ! if you 're looking at this , buy it . do n't think twice .

Contents of file /content/preprocessed_text_files/without_stopwords/file571.txt:
garage sale piece junk mij hollowbody 've used amazon build cheap . 's blast fit bill perfectly . tell surprised good sounds guitar . inspired play open tuning slide tone simple boss distortion pedal cool . would absolutely purchase another cheapo build ! 're looking , buy . n't think twice .

Contents of file /content/preprocessed_text_files/tokenized/file144.txt:
cool littl

### **d. Remove Punctuation**

In [None]:
import os
import string
import nltk
from nltk.corpus import stopwords

# Download the stopwords if not already downloaded
nltk.download('stopwords')

def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed, not replaced, so the whitespace around a removed
    character is left in place.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(deletion_table)
    return cleaned

# Folder paths: this step reads the stopword-free files and writes copies without punctuation
input_folder = '/content/preprocessed_text_files/without_stopwords'
output_folder = '/content/preprocessed_text_files/without_punctuations'

# Ensure the output folder exists, create it if necessary
os.makedirs(output_folder, exist_ok=True)

# Get a list of all entries in the input folder
files = os.listdir(input_folder)

# Counter tracking how many files have been printed so far
count=0

# Process all files in the input folder; output keeps the input file name
for file_name in files:
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Read the text from the file
    with open(input_file_path, 'r') as f:
        text = f.read()

    # Remove punctuation from the text and save the file
    cleaned_text = remove_punctuation(text)
    with open(output_file_path, 'w') as f:
        f.write(cleaned_text)

    # Print the input and cleaned contents for the first 5 files only
    if(count<5):
     print_file_contents(input_file_path)
     print_file_contents(output_file_path)
     count+=1

Contents of file /content/preprocessed_text_files/without_stopwords/file571.txt:
garage sale piece junk mij hollowbody 've used amazon build cheap . 's blast fit bill perfectly . tell surprised good sounds guitar . inspired play open tuning slide tone simple boss distortion pedal cool . would absolutely purchase another cheapo build ! 're looking , buy . n't think twice .

Contents of file /content/preprocessed_text_files/without_punctuations/file571.txt:
garage sale piece junk mij hollowbody ve used amazon build cheap  s blast fit bill perfectly  tell surprised good sounds guitar  inspired play open tuning slide tone simple boss distortion pedal cool  would absolutely purchase another cheapo build  re looking  buy  nt think twice 

Contents of file /content/preprocessed_text_files/without_stopwords/file144.txt:
cool little portable sound amplifier , got make noise without waking neighborhood ! light , hope 's impact resistant stuff back pack goodies . 'm using harmonica , slide guitar

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **e. Remove blank space tokens**

In [None]:
import os
import string
import nltk

def remove_blank_space(tokens):
    """Strip surrounding whitespace from each token and drop the empty ones.

    Returns a new list; tokens that are empty or whitespace-only are omitted.
    """
    stripped_tokens = (token.strip() for token in tokens)
    return [token for token in stripped_tokens if token]

# Folder paths: this step reads the punctuation-free files and writes the final cleaned files
input_folder = '/content/preprocessed_text_files/without_punctuations'
output_folder = '/content/preprocessed_text_files/final_cleaned'

# Ensure the output folder exists, create it if necessary
os.makedirs(output_folder, exist_ok=True)

# Get a list of all entries in the input folder
files = os.listdir(input_folder)

# Counter tracking how many files have been printed so far
count=0

# Process all files in the input folder; output keeps the input file name
for file_name in files:
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Read the text from the file
    with open(input_file_path, 'r') as f:
        text = f.read()

    # Tokenize the cleaned text
    tokens = nltk.word_tokenize(text)

    # Remove blank space tokens from the list of tokens
    cleaned_tokens = remove_blank_space(tokens)

    # Save the final cleaned file, tokens separated by single spaces
    with open(output_file_path, 'w') as f:
        f.write(" ".join(cleaned_tokens))

    # Print the input and final contents for the first 5 files only
    if(count<5):
     print_file_contents(input_file_path)
     print_file_contents(output_file_path)
     count+=1

Contents of file /content/preprocessed_text_files/without_punctuations/file571.txt:
garage sale piece junk mij hollowbody ve used amazon build cheap  s blast fit bill perfectly  tell surprised good sounds guitar  inspired play open tuning slide tone simple boss distortion pedal cool  would absolutely purchase another cheapo build  re looking  buy  nt think twice 

Contents of file /content/preprocessed_text_files/final_cleaned/file571.txt:
garage sale piece junk mij hollowbody ve used amazon build cheap s blast fit bill perfectly tell surprised good sounds guitar inspired play open tuning slide tone simple boss distortion pedal cool would absolutely purchase another cheapo build re looking buy nt think twice

Contents of file /content/preprocessed_text_files/without_punctuations/file144.txt:
cool little portable sound amplifier  got make noise without waking neighborhood  light  hope s impact resistant stuff back pack goodies  m using harmonica  slide guitar  powered speaker media play

# **Q2. Unigram Inverted Index and Boolean Queries**

## **a. Unigram Index Construction**

In [None]:
import pickle

def get_file_paths(directory):
    """Return the paths of all files under ``directory``, including subfolders.

    The tree is traversed with ``os.walk``; each path is the walked folder
    joined with the file name. No file is opened or read.
    """
    return [
        os.path.join(parent, name)
        for parent, _, names in os.walk(directory)
        for name in names
    ]

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    words = text.split()
    return words

def extract_doc_id(file_path):
    """Return the integer document ID embedded in a ``file<ID>.<ext>`` name.

    Only the base name of ``file_path`` is inspected; for example
    ``/some/dir/file890.txt`` yields ``890``.
    """
    base_name = os.path.basename(file_path)
    # Part of the name before the first dot, e.g. "file890"
    stem = base_name.split('.')[0]
    # Text that follows the "file" prefix, e.g. "890"
    id_text = stem.split('file')[1]
    return int(id_text)

def create_inverted_index(file_paths):
    """Build a unigram inverted index from the given files.

    Each file is read in full and split with ``tokenize``; its document ID
    comes from ``extract_doc_id``. The result maps every token to the set of
    document IDs whose file contains it.
    """
    postings = {}
    for path in file_paths:
        doc_id = extract_doc_id(path)
        with open(path, 'r') as document:
            terms = tokenize(document.read())
        for term in terms:
            # Create the posting set on first sight of the term, then record the doc
            postings.setdefault(term, set()).add(doc_id)
    return postings

# Path to the directory containing the preprocessed files
# (the output folder of the blank-space-removal step)
preprocessed_folder = '/content/preprocessed_text_files/final_cleaned'

# Get file paths
file_paths = get_file_paths(preprocessed_folder)

# Create the inverted index: token -> set of document IDs
unigram_inverted_index = create_inverted_index(file_paths)

# Save the inverted index using pickle (relative path, so it is written to the
# current working directory)
with open('unigram_inverted_index.pickle', 'wb') as f:
    pickle.dump(unigram_inverted_index, f)

## **Query Support**

In [None]:
def perform_and(term1_docs, term2_docs):
    """Return the documents present in both posting sets (AND)."""
    shared_docs = term1_docs.intersection(term2_docs)
    return shared_docs

def perform_or(term1_docs, term2_docs):
    """Return the documents present in either posting set (OR)."""
    combined_docs = term1_docs.union(term2_docs)
    return combined_docs

def perform_and_not(term1_docs, term2_docs):
    """Return the documents in ``term1_docs`` that are not in ``term2_docs`` (AND NOT)."""
    remaining_docs = term1_docs.difference(term2_docs)
    return remaining_docs

def perform_or_not(term1_docs, term2_docs, all_docs):
    """Return ``term1_docs`` OR (NOT ``term2_docs``).

    The complement of ``term2_docs`` is taken relative to ``all_docs`` and
    then merged with ``term1_docs``.
    """
    not_term2_docs = all_docs.difference(term2_docs)
    return not_term2_docs.union(term1_docs)

def perform_query(inverted_index, query, all_docs=None):
    """Evaluate a boolean query left to right against an inverted index.

    The running result starts as the posting set of the first term. Each
    following term is then combined with it using the operator at the same
    position in ``query.operators``. Pairing stops at the shorter of the two
    lists, and an operator other than 'AND', 'OR', 'AND NOT' or 'OR NOT'
    leaves the running result unchanged.

    Args:
        inverted_index: mapping of term -> set of document IDs.
        query: object exposing ``terms`` (list of terms) and ``operators``
            (list of operator strings).
        all_docs: set of every document ID, used as the universe for
            'OR NOT'. Defaults to the IDs 1..999.

    Returns:
        The set of matching document IDs. A term missing from the index
        contributes an empty set, and a query with no terms returns an
        empty set.
    """
    # A query made only of stopwords/punctuation preprocesses to no terms.
    if not query.terms:
        return set()

    if all_docs is None:
        all_docs = set(range(1, 1000))

    # Treat an unknown first term like any later unknown term: no documents.
    result = inverted_index.get(query.terms[0], set())
    for term, operator in zip(query.terms[1:], query.operators):
        set2 = inverted_index.get(term, set())
        if operator == 'AND':
            result = perform_and(result, set2)
        elif operator == 'OR':
            result = perform_or(result, set2)
        elif operator == 'AND NOT':
            result = perform_and_not(result, set2)
        elif operator == 'OR NOT':
            result = perform_or_not(result, set2, all_docs)
    return result

def print_query_results(query_num, result_docs):
    """Print the query number, the hit count and the matching file names.

    Document IDs are listed in ascending order, each rendered as
    ``file<ID>.txt`` and separated by ", ".
    """
    hit_count = len(result_docs)
    file_names = ', '.join('file' + str(doc) + '.txt' for doc in sorted(result_docs))
    print(f"Query {query_num}:")
    print(f"Number of documents retrieved for query {query_num}: {hit_count}")
    print(f"Names of the documents retrieved for query {query_num}: {file_names}")

# Load the inverted index from the pickle file written by the index-construction cell.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load an index file that this notebook produced itself.
with open('unigram_inverted_index.pickle', 'rb') as f:
    loaded_inverted_index = pickle.load(f)

class Query:
    """A boolean query: preprocessed terms plus the operators joining them.

    Attributes set by ``__init__``:
        terms: tokens returned by ``preprocess(text)``.
        operators: upper-cased operator strings parsed from the
            comma-separated ``operations`` input.
    """

    def __init__(self, text, operations):
        """Build a query from raw text and a comma-separated operator string."""
        self.terms = preprocess(text)
        self.operators = self._parse_operators(operations)

    @staticmethod
    def _parse_operators(operations):
        """Split ``operations`` on commas into normalized operator strings.

        Each piece is upper-cased, and leading, trailing and repeated inner
        whitespace is collapsed, so "and   not" becomes "AND NOT". Empty
        pieces are dropped.
        """
        normalized = (" ".join(part.split()) for part in operations.upper().split(','))
        return [operator for operator in normalized if operator]

def preprocess(text):
    """Turn raw query text into a list of index terms.

    Steps, in order: lowercase the text, delete punctuation, split on
    whitespace, remove stopwords, then strip and drop blank tokens.
    """
    without_punctuation = remove_punctuation(text.lower())
    words = without_punctuation.split()
    content_words = remove_stopwords(words)
    return remove_blank_space(content_words)

# List of Query objects built from user input
queries = []

# Take user input: first the number of queries N, then two lines per query -
# the query text followed by its comma-separated operators
N = int(input())
for i in range(N):
    queries.append(Query(input(),input()))

# Process each query against the loaded index and print its results
# (queries are numbered from 1)
for i, query in enumerate(queries, start=1):
    print_query_results(i, perform_query(loaded_inverted_index, query))

technique:  {918, 23}
