In [23]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

def translate_m2m100(english_text, src_lang="en", tgt_lang="tr"):
    tokenizer.src_lang = src_lang
    inputs = tokenizer(english_text, return_tensors="pt", padding=True)
    generated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)




In [24]:
english_text = "okay"
turkish_translation = translate_m2m100(english_text)
print(f"English: {english_text}")
print(f"Turkish: {turkish_translation}")

English: okay
Turkish: Tamam


In [25]:
def translate_file(input_file, output_file, src_lang="en", tgt_lang="tr"):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()  # Read file line by line to preserve formatting

    translated_lines = []
    for line in lines:
        if line.strip():  # Translate only non-empty lines
            translated_line = translate_m2m100(line.strip(), src_lang, tgt_lang)
            translated_lines.append(translated_line)
        else:
            translated_lines.append("")  # Preserve blank lines

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(translated_lines))  # Join lines back with newline characters

    print(f"Translation saved to {output_file}")

translate_file("/content/sample_data/input.txt", "translated_output.txt", "en", "tr")

Translation saved to translated_output.txt


In [26]:
!pip install PyPDF2




In [27]:
!pip install pdfplumber





In [28]:
import pdfplumber

def extract_text_from_pdf(pdf_path, output_txt):
    """Extracts text from a PDF and saves it to an intermediate text file while preserving formatting."""
    text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_text = page.extract_text()
            if extracted_text:
                # Preserve page breaks for clarity
                text += f"\n--- Page {page_num} ---\n{extracted_text}\n"
            else:
                text += f"\n--- Page {page_num} (No text extracted) ---\n"

    if not text.strip():
        print("No extractable text found in the PDF.")
        return None

    # Save extracted text to a file
    with open(output_txt, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Extracted text saved to {output_txt}")
    return output_txt

# Example usage
pdf_path = "/content/sample_data/Customer_sentiment_analysis_synopsis.pdf"
output_txt = "extracted_text.txt"
extract_text_from_pdf(pdf_path, output_txt)


Extracted text saved to extracted_text.txt


'extracted_text.txt'

In [29]:
with open("extracted_text.txt", 'r', encoding="utf-8") as f:
    extracted_text = f.read()
    print(extracted_text)


--- Page 1 ---
Guru Gobind Singh Indraprastha University
CUSTOMER SENTIMENT AND FEEDBACK ANALYSIS
MAJOR PROJECT SYNOPSIS
Submitted by: Submitted to:
Name: MS. Kanika
Gurleen Kaur Bali
Roll Number:
03290302022
Course and section: BCA M1

--- Page 2 ---
Gurleen Kaur Bali 03290302022 Major Project Synopsis
SYNOPSIS OF THE PROJECT:
Title of the Project:
Customer Sentiment and feedback Analysis
Statement about the Problem:
Customer feedback plays a crucial role in shaping business strategies. With the rise of e-commerce and
online platforms, customers frequently leave reviews about products and services. However, manually
analysing thousands of reviews is time-consuming and inefficient. Businesses need an automated
system to analyse customer sentiments accurately and provide actionable insights. This project aims to
develop a sentiment analysis system that classifies customer reviews as positive, negative, or neutral
and predicts a corresponding star rating out of 5.
Why is the Particular 

In [30]:
translate_file("extracted_text.txt", "translated_output_pdf.txt", "en", "tr")


Translation saved to translated_output_pdf.txt


Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141214 sha256=2a664e5f5a5a9ea2f9266c87972800b386ab497199bdf3b1194b3f0ec2457681
  Stored in directory: /root/.cache/pip/wheels/21/10/be/9a70640a3a60ed4a7e1a45e49bb9f58b04692d5d7

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 30.246526956558228


2025-02-04 16:55:25,760 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 30.246526956558228

Corrected text saved to corrected_translated_output_pdf.txt


In [33]:
from transformers import BertForMaskedLM, BertTokenizer
import torch
import re

# Load Turkish BERT model and tokenizer
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

def preprocess_text(file_path):
    """
    Reads and cleans the translated Turkish text from a file.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Remove unwanted characters and normalize spaces
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\wşçöğüİıŞÇÖĞÜa-zA-Z.,!? ]", "", text)
    text = text.strip()

    return text

def compute_perplexity(text):
    """
    Computes a score for the text using the BERT Masked Language Model.
    This is used as a proxy to assess the fluency of the text.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        perplexity = torch.exp(loss)

    return perplexity.item()

# Example Usage:
file_path = "translated_output_pdf.txt"
cleaned_text = preprocess_text(file_path)
perplexity_score = compute_perplexity(cleaned_text)

print(f"Perplexity Score: {perplexity_score}")


Perplexity Score: 1.4937987327575684


In [34]:
pip install textstat




In [35]:
import textstat

def readability_score(text):
    score = textstat.flesch_reading_ease(text)
    return score

def check_readability_of_translated_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    score = readability_score(text)
    print(f"Readability Score: {score}")

# Call this function with your file
check_readability_of_translated_file('translated_output_pdf.txt')


Readability Score: 40.35


In [36]:
from sentence_transformers import SentenceTransformer, util

def check_semantic_similarity(original, translated):
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Pre-trained multilingual model
    embeddings = model.encode([original, translated])
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return similarity.item()

def check_semantic_similarity_of_translated_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        translated_text = file.read()

    # Example original sentence in English
    original = "This is a sample English sentence."

    similarity_score = check_semantic_similarity(original, translated_text)
    print(f"Semantic Similarity Score: {similarity_score}")

# Call this function with your file
check_semantic_similarity_of_translated_file('translated_output_pdf.txt')


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


2025-02-04 16:55:50,660 - sentence_transformers.SentenceTransformer - INFO
Msg: Use pytorch device_name: cpu



INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2


2025-02-04 16:55:50,667 - sentence_transformers.SentenceTransformer - INFO
Msg: Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score: 0.10829651355743408
