### 1. Text / Data Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string

# Download necessary NLTK data
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

text = "Natural Language Processing with Python can be fun and exciting. Let's explore it!"

# Tokenization
tokens = word_tokenize(text)
print(f"original words: {tokens}")
print('-'*125)
# Removing Stopwords
stop_words = set(stopwords.words('english'))
# `word not in string.punctuation` is a *substring* test against one long string,
# so multi-character punctuation tokens (e.g. "...", "--", or quote tokens such
# as "''") are not recognised. A token is treated as punctuation here when every
# one of its characters is a punctuation mark.
punctuation_chars = set(string.punctuation)
filtered_tokens = [
    word for word in tokens
    if word.lower() not in stop_words
    and not all(char in punctuation_chars for char in word)
]
print(f"Filtered Tokens: {filtered_tokens}")
print('-'*125)
# Lemmatization (no POS hint is passed to lemmatize())
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print(f"Lemmatized Tokens: {lemmatized_tokens}")
print('-'*125)
# POS Tagging (runs on the filtered, lemmatized tokens, not on the raw sentence)
pos_tags = pos_tag(lemmatized_tokens)
print(f"POS Tags: {pos_tags}")

### 2. Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
#nltk.download('punkt')

# Demo sentence to be split into tokens
text = (
    "Natural Language Processing with Python can be fun and exciting. "
    "Let's explore it!"
)

# Split the sentence into tokens and show them
tokens = word_tokenize(text)
print("Original words: " + str(tokens))

### 3. Stemming

In [None]:
import nltk
from nltk.stem import PorterStemmer

# Download necessary NLTK data (only needed once)
#nltk.download('punkt')

text = "Natural Language Processing with Python can be fun and exciting."

# Tokens of the demo sentence
tokens = nltk.word_tokenize(text)

# Apply the Porter stemmer to every token
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))

# Show the tokens before and after stemming
print("Original words: " + str(tokens))
print('-' * 125)
print("Stemmed Tokens: " + str(stemmed_tokens))


### 4. Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (only needed once)
#nltk.download('punkt')
#nltk.download('wordnet')

text = "Natural Language Processing with Python can be fun and exciting."

# Tokens of the demo sentence
tokens = nltk.word_tokenize(text)

# Lemmatize every token (no POS hint is passed)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

# Show the tokens before and after lemmatization
print("Original words: " + str(tokens))
print('-' * 125)
print("Lemmatized Tokens: " + str(lemmatized_tokens))


### 5. Part of Speech Tagging

In [None]:
#importing libraries
import spacy

#load the english language module
nlp = spacy.load("en_core_web_sm")

# Sentence to tag
text = "Natural Language Processing with Python can be fun and exciting."

# Run the spaCy pipeline over the sentence
doc = nlp(text)

# Print each token together with its part-of-speech tag
print("Original Text: ", text)
print('-' * 125)
print("PoS Tagging Result: ")
for token in doc:
    print(token.text, token.pos_, sep=":")

### 6. WordCloud

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt

# Download necessary NLTK data (only needed once)
#nltk.download('punkt')
#nltk.download('stopwords')

# Load text from a CSV file
#df = pd.read_csv('data.csv')  # Ensure 'data.csv' is in the same directory
# Assuming the CSV has a column named 'text'
#csv_text = ' '.join(df['text'].astype(str))  # Join all text entries into a single string

# Additional text provided in the code
user_text = '''Natural Language Processing with Python can be fun and exciting. 
It enables computers to understand human language and perform various tasks, 
from translation to sentiment analysis. Exploring NLP opens up many opportunities.'''

# To add the CSV text as well, build
#   combined_text = csv_text + " " + user_text
# and tokenize `combined_text.lower()` instead of `user_text.lower()`.

# Lower-case the text, then tokenize it
tokens = word_tokenize(user_text.lower())

# Keep alphanumeric tokens that are not English stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda word: word.isalnum() and word not in stop_words, tokens)
)

# Frequency of every remaining word
word_freq = Counter(filtered_tokens)

# Build the cloud from the frequency table
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(word_freq)

# Render the cloud without axes
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


### 7. Emojification and Demojification

In [None]:
!pip install emoji

In [None]:
!pip install demoji

In [None]:
import pandas as pd
import emoji
import demoji

# Load demoji data for emoji processing (run this only once)
#demoji.download_codes()

# Define functions for demojification and emoji removal
def demojify(text):
    """Return ``text`` with its emojis converted to their textual representation.

    Delegates to ``emoji.demojize``.
    """
    textual = emoji.demojize(text)
    return textual
def emojify(text):
    """Return ``text`` with its emoji codes converted to actual emojis.

    Delegates to ``emoji.emojize``.
    """
    rendered = emoji.emojize(text)
    return rendered
# Sample text for demonstration
sample_texts = [
    "Hello :face_with_rolling_eyes: ! I code on my own :beaming_face_with_smiling_eyes:",
    "I love programming! 🚀 Let's build amazing things! 🌟",
]
# Run every sample through both conversions and print the three versions
for original_text in sample_texts:
    print("Original Text: " + original_text)
    # Codes -> emojis
    emojified_text = emojify(original_text)
    print("Emojified Text: " + emojified_text)
    # Emojis -> codes
    demojified_text = demojify(original_text)
    print("Demojified Text: " + demojified_text + "\n")
    print('-' * 125)

###  Emojification and Demojification with csv file

In [None]:
import pandas as pd
import emoji

# Define functions for demojification and emojification
def demojify(text):
    """Return ``text`` with its emojis converted to their textual representation.

    Delegates to ``emoji.demojize``.
    """
    textual = emoji.demojize(text)
    return textual

def emojify(text):
    """Return ``text`` with its emoji codes converted to actual emojis.

    Delegates to ``emoji.emojize``.
    """
    rendered = emoji.emojize(text)
    return rendered

# CSV Processing
try:
    # Load text from a CSV file
    df = pd.read_csv('data.csv')  # Ensure 'data.csv' is in the same directory
except FileNotFoundError:
    print("CSV file not found. Please ensure 'data.csv' is in the same directory.")
else:
    if 'text' not in df.columns:
        # Report the missing column explicitly instead of failing with a KeyError
        print("The CSV file does not contain a 'text' column.")
    else:
        # Empty cells are read as NaN (a float) and the emoji functions need str,
        # so skip missing rows and coerce everything else to str.
        for original_text in df['text'].dropna().astype(str):
            # Emojification
            emojified_text = emojify(original_text)

            # Demojification
            demojified_text = demojify(original_text)

            # Print the results
            print(f"Original Text: {original_text}")
            print(f"Emojified Text: {emojified_text}")
            print(f"Demojified Text: {demojified_text}\n")

### 8. Sentiment Analysis - using TextBlob

In [None]:
from textblob import TextBlob
import pandas as pd

# Nine short reviews used as the sentiment demo corpus
corpus = [
    "I absolutely loved the new restaurant downtown! The food was amazing, and the service was excellent.",
    "The movie was fantastic! The acting was top-notch, and the storyline was incredibly engaging.",
    "I had a wonderful experience with the customer support team. They were prompt and resolved my issue quickly.",
    "The product I received was defective and didn't work as advertised. I'm very disappointed.",
    "I had a terrible experience at the hotel. The room was dirty, and the staff was unhelpful.",
    "The concert was a huge letdown. The sound quality was poor, and the performance was lackluster.",
    "The book was well-written, but it didn't really capture my interest.",
    "I received my package on time, but the packaging was a bit damaged.",
    "The workshop was informative, but it was a bit too long for my liking.",
]

# One row per review, in a single 'text' column
df = pd.DataFrame({'text': corpus})

def get_polarity_score(text):
    """Return ``(polarity, subjectivity)`` of ``text`` as computed by TextBlob."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity
def get_analysis(score):
    """Map a polarity score to a sentiment label.

    Returns "Positive" for scores above zero, "Neutral" for exactly zero and
    "Negative" otherwise. All labels are capitalised the same way so they match
    the other classifiers in this notebook and read consistently in chart labels.
    """
    if score > 0:
        return "Positive"
    elif score == 0:
        return "Neutral"
    else:
        return "Negative"

# This cell plots, so import pyplot here rather than relying on an earlier cell having done it.
import matplotlib.pyplot as plt

# Score every review, then label it from its polarity
df[['polarity','subjectivity']] = df.text.apply(get_polarity_score).apply(pd.Series)
df['analysis'] = df['polarity'].apply(get_analysis)
display(df)

#plotting
# Calculate the percentage of each sentiment
sentiment_counts = df['analysis'].value_counts(normalize=True) * 100

# Plot the sentiment distribution
labels = sentiment_counts.index
sizes = sentiment_counts.values
colors = ['gold', 'pink', 'red']
# value_counts() orders labels by frequency and may return fewer than three of
# them, so build `explode` from the labels themselves: it always has the right
# length and it is the 'Positive' slice that is pulled out.
explode = [0.1 if label == 'Positive' else 0 for label in labels]

plt.figure(figsize=(8, 6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.title('Sentiment Analysis Distribution')
plt.show()

### sentiment analysis using SentimentIntensityAnalyzer

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Analyzer shared by every call to get_sentiment_score
sia = SentimentIntensityAnalyzer()

corpus = [
    "I absolutely loved the new restaurant downtown! The food was amazing, and the service was excellent.",
    "The movie was fantastic! The acting was top-notch, and the storyline was incredibly engaging.",
    "I had a wonderful experience with the customer support team. They were prompt and resolved my issue quickly.",
    "The product I received was defective and didn't work as advertised. I'm very disappointed.",
    "I had a terrible experience at the hotel. The room was dirty, and the staff was unhelpful.",
    "The concert was a huge letdown. The sound quality was poor, and the performance was lackluster.",
    "The book was well-written, but it didn't really capture my interest.",
    "I received my package on time, but the packaging was a bit damaged.",
    "The workshop was informative, but it was a bit too long for my liking.",
]

# One row per review
df = pd.DataFrame({'text': corpus})


def get_sentiment_score(text):
    """Return whatever ``sia.polarity_scores`` yields for ``text``."""
    scores = sia.polarity_scores(text)
    return scores


# Store the scores of each review next to its text
df["sentiment_score"] = df['text'].apply(get_sentiment_score)
display(df)

### sentiment analysis using SentimentIntensityAnalyzer with plot

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt

# Initialize Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Demo reviews
corpus = [
    "I absolutely loved the new restaurant downtown! The food was amazing, and the service was excellent.",
    "The movie was fantastic! The acting was top-notch, and the storyline was incredibly engaging.",
    "I had a wonderful experience with the customer support team. They were prompt and resolved my issue quickly.",
    "The product I received was defective and didn't work as advertised. I'm very disappointed.",
    "I had a terrible experience at the hotel. The room was dirty, and the staff was unhelpful.",
    "The concert was a huge letdown. The sound quality was poor, and the performance was lackluster.",
    "The book was well-written, but it didn't really capture my interest.",
    "I received my package on time, but the packaging was a bit damaged.",
    "The workshop was informative, but it was a bit too long for my liking.",
]

# One row per review
df = pd.DataFrame({'text': corpus})


def get_sentiment_score(text):
    """Return whatever ``sia.polarity_scores`` yields for ``text``."""
    scores = sia.polarity_scores(text)
    return scores


# Score every review
df["sentiment_score"] = df['text'].apply(get_sentiment_score)

# Pull the 'compound' entry out of each score mapping into its own column
df['compound'] = df['sentiment_score'].apply(lambda scores: scores['compound'])

# Define a function to categorize sentiment based on compound score
def analyze_sentiment(compound):
    """Label a compound score.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and everything else is 'Neutral'.
    """
    if compound <= -0.05:
        return 'Negative'
    if compound >= 0.05:
        return 'Positive'
    return 'Neutral'

# Apply the function to categorize sentiments
df['analysis'] = df['compound'].apply(analyze_sentiment)

# Share of each label, in percent
sentiment_counts = df['analysis'].value_counts(normalize=True) * 100

# Inputs for the pie chart
labels = sentiment_counts.index
sizes = sentiment_counts.values
colors = ['gold', 'lightblue', 'salmon']

# Offset 0.1 for the 'Positive' wedge, 0 for every other wedge
explode = [{'Positive': 0.1}.get(label, 0) for label in labels]

plt.figure(figsize=(8, 6))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # keep the pie circular
plt.title('Sentiment Analysis Distribution')
plt.show()


### sentiment analysis with csv

In [None]:
import nltk
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt

# Read the tweets and discard the three columns this cell does not use.
# NOTE(review): the path is an absolute, machine-specific location — adjust before running elsewhere.
df = (
    pd.read_csv(r"C:\Users\Lavanyabh\Desktop\Sem3\NLP\datasets\Tweets.csv")
    .drop(columns=['textID', 'selected_text', 'sentiment'])
)

# Polarity and subjectivity of a single value, via TextBlob
def get_polarity_subjectivity(df):
    """Return ``(polarity, subjectivity)`` for one value.

    Despite its name, ``df`` is the single value handed over by
    ``Series.apply``; it is converted with ``str()`` before analysis.
    """
    sentiment = TextBlob(str(df)).sentiment
    return sentiment.polarity, sentiment.subjectivity

#function to get sentiment analysis based on polarity
def get_analysis(score):
    """Label a polarity score: below zero 'Negative', zero 'Neutral', else 'Positive'."""
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

#Apply the function to the DataFrame
df[['polarity','subjectivity']] = df.text.apply(get_polarity_subjectivity).apply(pd.Series)

#Apply the get_analysis function to get sentiment analysis
df["analysis"]=df["polarity"].apply(get_analysis)

display(df)

#plotting
# Calculate the percentage of each sentiment
sentiment_counts = df['analysis'].value_counts(normalize=True) * 100

# Plot the sentiment distribution
labels = sentiment_counts.index
sizes = sentiment_counts.values
colors = ['gold', 'pink', 'red']
# value_counts() orders labels by frequency and may return fewer than three of
# them, so build `explode` from the labels themselves: it always has the right
# length and it is the 'Positive' slice that is pulled out.
explode = [0.1 if label == 'Positive' else 0 for label in labels]

plt.figure(figsize=(8, 6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.title('Sentiment Analysis Distribution')
plt.show()

### 9. AFINN Sentiment Analysis

In [None]:
!pip install afinn

In [None]:
from afinn import Afinn
import pandas as pd
import matplotlib.pyplot as plt

# Initialize Afinn for sentiment analysis
afinn = Afinn()

# Demo reviews
corpus = [
    "I absolutely loved the new restaurant downtown! The food was amazing, and the service was excellent.",
    "The movie was fantastic! The acting was top-notch, and the storyline was incredibly engaging.",
    "I had a wonderful experience with the customer support team. They were prompt and resolved my issue quickly.",
    "The product I received was defective and didn't work as advertised. I'm very disappointed.",
    "I had a terrible experience at the hotel. The room was dirty, and the staff was unhelpful.",
    "The concert was a huge letdown. The sound quality was poor, and the performance was lackluster.",
    "The book was well-written, but it didn't really capture my interest.",
    "I received my package on time, but the packaging was a bit damaged.",
    "The workshop was informative, but it was a bit too long for my liking.",
]

# One row per review
df = pd.DataFrame({'text': corpus})

# Score every review with Afinn
review_scores = df['text'].apply(afinn.score)
df['afinn_score'] = review_scores

# Classify sentiment based on the score
def analyze_afinn_sentiment(score):
    """Label a score by its sign: 'Negative', 'Positive', or 'Neutral' otherwise."""
    if score < 0:
        return 'Negative'
    if score > 0:
        return 'Positive'
    return 'Neutral'

df['analysis'] = df['afinn_score'].apply(analyze_afinn_sentiment)

# Show the scored frame
display(df)

# Share of each label, in percent
sentiment_counts = df['analysis'].value_counts(normalize=True) * 100

# Inputs for the pie chart
labels = sentiment_counts.index
sizes = sentiment_counts.values
colors = ['gold', 'lightblue', 'salmon']

# Offset 0.1 for the 'Positive' wedge, 0 for every other wedge
explode = [{'Positive': 0.1}.get(label, 0) for label in labels]

plt.figure(figsize=(8, 6))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # keep the pie circular
plt.title('AFINN Sentiment Analysis Distribution')
plt.show()


### Afinn Sentiment Analysis with csv file

In [None]:
import pandas as pd
from afinn import Afinn
import matplotlib.pyplot as plt

# Initialize Afinn for sentiment analysis
afinn = Afinn()

# Read the tweets.
# NOTE(review): the path is an absolute, machine-specific location — adjust before running elsewhere.
df = pd.read_csv(r"C:\Users\Lavanyabh\Desktop\Sem3\NLP\datasets\Tweets.csv")

# Replace missing text with '' and make every entry a str
tweet_text = df['text'].fillna('').astype(str)
df['text'] = tweet_text

# Score every tweet with Afinn
df['afinn_score'] = tweet_text.apply(afinn.score)
# Classify sentiment based on the score
def analyze_afinn_sentiment(score):
    """Label a score by its sign: 'Negative', 'Positive', or 'Neutral' otherwise."""
    if score < 0:
        return 'Negative'
    if score > 0:
        return 'Positive'
    return 'Neutral'
df['analysis'] = df['afinn_score'].apply(analyze_afinn_sentiment)

# Show only the columns relevant to this analysis
shown_columns = ['text', 'afinn_score', 'analysis']
display(df[shown_columns])

# Share of each label, in percent
sentiment_counts = df['analysis'].value_counts(normalize=True) * 100

# Inputs for the pie chart
labels = sentiment_counts.index
sizes = sentiment_counts.values
colors = ['gold', 'lightblue', 'salmon']

# Offset 0.1 for the 'Positive' wedge, 0 for every other wedge
explode = [{'Positive': 0.1}.get(label, 0) for label in labels]

plt.figure(figsize=(8, 6))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # keep the pie circular
plt.title('AFINN Sentiment Analysis Distribution')
plt.show()

### 10. Named Entity Recognition

In [None]:
import spacy
import pandas as pd

# Load the spaCy model for English (small model).
# Loading it in this cell keeps the cell self-contained: otherwise `nlp` is
# whatever pipeline an earlier-run cell happened to leave behind (another cell
# in this notebook binds `nlp` to en_core_web_md).
nlp = spacy.load("en_core_web_sm")

# Define a function to extract named entities
def extract_entities(text):
    """Return a list of ``(entity_text, entity_label)`` tuples found in ``text``."""
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Sample text for NER
sample_text = "Apple is looking at buying U.K. startup for $1 billion. Elon Musk, the CEO of Tesla, tweeted on Tuesday."

# Perform NER on the sample text
doc = nlp(sample_text)
print("Named Entities in sample text:")
for ent in doc.ents:
    print(ent.text, ent.label_)

# --- Named Entity Recognition from CSV file ---
# Assuming the CSV file has a column 'text' containing sentences or documents
#df = pd.read_csv(r"C:\Users\Lavanyabh\Desktop\Sem3\NLP\datasets\Tweets.csv")
# Apply NER to each row in the 'text' column
#df['entities'] = df['text'].apply(lambda x: extract_entities(str(x)))
# Display the results
#print("\nNamed Entities in CSV file:")
#print(df[['text', 'entities']])
# Save the results to a new CSV file (optional)
#df.to_csv(r"C:\Users\Lavanyabh\Desktop\Sem3\NLP\datasets\\save_entities.csv", index=False)

In [None]:
import spacy
from spacy import displacy
import pandas as pd
from transformers import pipeline

# Load spaCy NER model
nlp_spacy = spacy.load("en_core_web_sm")

# Hugging Face NER pipeline built on the CoNLL-03 fine-tuned BERT checkpoint
ner_transformers = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

# Define a function for spaCy NER
def perform_spacy_ner(text):
    """Run ``nlp_spacy`` on ``text`` and return ``(entity_text, label)`` tuples."""
    found = []
    for ent in nlp_spacy(text).ents:
        found.append((ent.text, ent.label_))
    return found

# Define a function for Transformers NER
def perform_transformers_ner(text):
    """Run ``ner_transformers`` on ``text`` and return ``(word, entity_group)`` tuples."""
    found = []
    for ent in ner_transformers(text):
        found.append((ent['word'], ent['entity_group']))
    return found

# Sample text data (can be replaced with a CSV file)
texts = [
    "Barack Obama was the 44th President of the United States.",
    "Apple Inc. is based in Cupertino, California.",
    "Mount Everest is the highest mountain in the world.",
    "The World War II ended in 1945.",
    "The iPhone 12 was released by Apple in 2020.",
    "To Kill a Mockingbird is a famous novel.",
    "NASA launched the Mars Rover in 2021.",
    "Paris is the capital of France.",
    "I paid $500 for a new laptop.",
    "The Eiffel Tower is 324 meters tall.",
    "She was born on July 4th, 1990.",
]

# One row per sentence; entity columns are added below
df = pd.DataFrame({'text': texts})

# Entities found by each of the two NER functions
sentence_column = df['text']
df['spacy_entities'] = sentence_column.apply(perform_spacy_ner)
df['transformers_entities'] = sentence_column.apply(perform_transformers_ner)

# Show sentences next to the recognised entities
print(df)

# Highlight the entities of one sentence with displacy
# (pick another index of `texts` to visualise a different sentence)
doc = nlp_spacy(texts[0])
displacy.render(doc, style="ent", jupyter=True)

### 11. Similarity Checking

### words similarity

In [None]:
# Run once if the model is not installed yet:
# !python -m spacy download en_core_web_md
import spacy
from nltk.corpus import wordnet
# Pipeline used for the similarity computation
nlp = spacy.load("en_core_web_md")


def word_similarity(word1, word2):
    """Return the spaCy similarity between ``word1`` and ``word2``.

    Each word is run through ``nlp`` and the two results are compared with
    ``.similarity``.
    """
    return nlp(word1).similarity(nlp(word2))


word1, word2 = "teacher", "instructor"
word_sim = word_similarity(word1, word2)

print("Similarity between words '{}' and '{}': {:.4f}".format(word1, word2, word_sim))

### sentence similarity

In [None]:
import spacy

# Load a pre-trained model
nlp = spacy.load('en_core_web_md')  # medium-sized English model with word vectors


def sentence_similarity(s1, s2):
    """Return the spaCy similarity between sentences ``s1`` and ``s2``.

    Each sentence is run through ``nlp`` and the two results are compared with
    ``.similarity``.
    """
    return nlp(s1).similarity(nlp(s2))


# Two differently worded sentences to compare
s1 = "The quick brown fox jumps over the lazy dog."
s2 = "A fast dark fox leaps over a sleepy dog."
sent_sim = sentence_similarity(s1, s2)

print("Similarity between sentences:\n'{}'\nand\n'{}': {:.4f}".format(s1, s2, sent_sim))


### word and sentence similarity

In [None]:
# First, ensure you have the required libraries
# You may need to run the following command if you haven't already downloaded the spaCy model:
# !python -m spacy download en_core_web_md

import spacy
from nltk.corpus import wordnet

# Load the spaCy model
nlp = spacy.load("en_core_web_md")


def word_similarity_spacy(word1, word2):
    """Return the spaCy similarity between ``word1`` and ``word2``.

    Each word is run through ``nlp`` and the two results are compared with
    ``.similarity``.
    """
    return nlp(word1).similarity(nlp(word2))


def sentence_similarity(sentence1, sentence2):
    """Return the spaCy similarity between ``sentence1`` and ``sentence2``.

    Each sentence is run through ``nlp`` and the two results are compared with
    ``.similarity``.
    """
    return nlp(sentence1).similarity(nlp(sentence2))


# Word-level example
word1, word2 = "teacher", "instructor"
word_sim_spacy = word_similarity_spacy(word1, word2)
print("Similarity between words '{}' and '{}' (spaCy): {:.4f}".format(word1, word2, word_sim_spacy))


# Sentence-level example
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast dark fox leaps over a sleepy dog."

sent_sim = sentence_similarity(sentence1, sentence2)
print("Similarity between sentences:\n'{}'\nand\n'{}': {:.4f}".format(sentence1, sentence2, sent_sim))

### 12. Spam Detection

### spam detection with csv file

In [None]:
# Check whether an incoming SMS is spam or not spam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
# 1. Load the SMS dataset.
# NOTE(review): the path is an absolute, machine-specific location — adjust before running elsewhere.
data = pd.read_csv(
    r'C:\Users\Lavanyabh\Desktop\Sem3\NLP\datasets\spam.csv',
    encoding='ISO-8859-1',
)
# Keep only the two columns used below
data = data[['v1', 'v2']]
display(data)
# Give the columns descriptive names
data.columns = ['label', 'message']
display(data)
# 2. Encode the label column as integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(label_codes)
display(data)
#3. Train-Test Split
# Split the raw messages first. Fitting TF-IDF on the full dataset before
# splitting would let test-set vocabulary and IDF weights leak into training
# and make the evaluation optimistic.
X = data['message']
y = data['label']
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#4. feature extraction
# Learn the vocabulary and IDF weights from the training messages only, then
# apply that fitted vectorizer to the held-out messages.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)
# 5. Model Training
model = MultinomialNB()
model.fit(X_train, y_train)
# 6. Model Evaluation
y_pred = model.predict(X_test)

#accuracy
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

#testing
# 7. Function to Predict Spam or Ham
def predict_message(text):
    """Classify one message with the fitted ``vectorizer`` and ``model``.

    Returns 'spam' when the predicted label is 1, otherwise 'ham'.
    """
    features = vectorizer.transform([text])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'spam'
    return 'ham'

# Example Usage
# Classify one hand-written message
sample_text = "Congratulations! You've won a free ticket to Bahamas. Call now to claim."
prediction = predict_message(sample_text)
print("The message '{}' is classified as: {}".format(sample_text, prediction))

### spam detection with text

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Sample Data Creation
# Six hand-written messages, alternating ham / spam
sample_labels = ['ham', 'spam', 'ham', 'spam', 'ham', 'spam']
sample_messages = [
    "Hello, how are you?",
    "Congratulations! You've won a free ticket to Bahamas. Call now to claim.",
    "Are we still meeting tomorrow?",
    "You have been selected for a chance to get a free iPhone!",
    "Your appointment is confirmed.",
    "Free money!!! Click this link to claim your prize.",
]
data = pd.DataFrame({'label': sample_labels, 'message': sample_messages})

# 2. Column names (already 'label' and 'message'; set explicitly)
data.columns = ['label', 'message']

# 3. Encode the label column as integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(label_codes)

# 4. Feature extraction
# 4. Train-Test Split
# Split the raw messages first. Fitting TF-IDF on the full dataset before
# splitting would let test-set vocabulary and IDF weights leak into training
# and make the evaluation optimistic.
X = data['message']
y = data['label']
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Feature extraction
# Learn the vocabulary and IDF weights from the training messages only, then
# apply that fitted vectorizer to the held-out messages.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 6. Model Training
model = MultinomialNB()
model.fit(X_train, y_train)

# 7. Model Evaluation
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

# 8. Function to Predict Spam or Ham
def predict_message(text):
    """Classify one message with the fitted ``vectorizer`` and ``model``.

    Returns 'spam' when the predicted label is 1, otherwise 'ham'.
    """
    features = vectorizer.transform([text])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'spam'
    return 'ham'

# Example Usage
sample_text = "Congratulations! You've won a free ticket to Bahamas. Call now to claim."
prediction = predict_message(sample_text)
print("The message '{}' is classified as: {}".format(sample_text, prediction))

# A few more messages to classify
additional_messages = [
    "Let's schedule a meeting next week.",
    "You've been selected for a $1000 cash prize!",
    "Can you send me the report by tomorrow?",
]

for msg in additional_messages:
    result = predict_message(msg)
    print("The message '{}' is classified as: {}".format(msg, result))


### 13. Grammar Checking

In [None]:
#!pip install language-tool-python

In [None]:
import language_tool_python
def grammar_check(text):
    """Print ``text`` followed by its LanguageTool-corrected version.

    A LanguageTool instance for 'en-US' is created for the call and closed
    again afterwards, even if correction fails, so repeated calls do not leave
    tool instances open. Returns None.
    """
    tool = language_tool_python.LanguageTool('en-US')
    try:
        print("Original Text: ")
        print(text)

        corrected_text = tool.correct(text)
        print("Corrected text: ")
        print(corrected_text)
    finally:
        # Release the resources held by this tool instance
        tool.close()
    
# Sample text for the grammar check
text = (
    "This is a example of a sentence with a error. "
    "Their is a problem with grammar."
)
grammar_check(text)

In [None]:
import language_tool_python

# Initialize the LanguageTool object
tool = language_tool_python.LanguageTool('en-US')


def check_grammar(text):
    """Return the matches ``tool.check`` reports for ``text``."""
    return tool.check(text)


# Text to check
text = (
    "This is a example of a sentence with a error. "
    "Their is a problem with grammar."
)

# Collect the reported matches
errors = check_grammar(text)

# Report every match, or say that there were none
if not errors:
    print("No grammatical errors found.")
else:
    print(f"Found {len(errors)} error(s):")
    for error in errors:
        print(f" - Error: {error.context}")
        print(f"   Suggestion: {error.replacements}")
        print(f"   Message: {error.message}")

### 14. N-Grams

In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter

# Download NLTK data files (if you haven't done so)
#nltk.download('punkt')

def generate_ngrams(text, n):
    """
    Tokenize ``text`` and return its n-grams.

    :param text: The input text (string).
    :param n: The number of tokens in each n-gram.
    :return: A list of n-grams.
    """
    return list(ngrams(nltk.word_tokenize(text), n))

def count_ngrams(ngrams_list):
    """
    Tally how often each n-gram occurs.

    :param ngrams_list: A list of n-grams.
    :return: A Counter mapping each n-gram to its number of occurrences.
    """
    tally = Counter()
    tally.update(ngrams_list)
    return tally

# Example usage
text = "I love natural language processing. Natural language processing is fascinating."
n = 2  # size of each n-gram (1 = unigrams, 2 = bigrams, ...)

# Build the n-grams, then count them
ngrams_list = generate_ngrams(text, n)
ngrams_count = count_ngrams(ngrams_list)

# Print every n-gram with its count
print(str(n) + "-grams:")
for gram, count in ngrams_count.items():
    print("{}: {}".format(gram, count))

### 15. Topic Modelling

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data files (if you haven't done so)
#nltk.download('punkt')
#nltk.download('stopwords')

# Sample data: Replace this with your own dataset or CSV file
documents = [
    "I love reading about artificial intelligence and machine learning.",
    "Natural language processing is a fascinating field of study.",
    "Deep learning techniques are widely used in computer vision.",
    "Artificial intelligence can help automate many tasks.",
    "The future of AI is very promising with new advancements.",
]


# Preprocessing
def preprocess_text(text):
    """Return the lower-cased, alphabetic, non-stopword tokens of ``text``."""
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in english_stopwords
    ]

# Prepare data
processed_docs = [preprocess_text(doc) for doc in documents]

# Create a dictionary and corpus
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Build the LDA model
num_topics = 2  # Adjust the number of topics as needed
# Fix random_state so the topics (and the document assignments printed below)
# are the same on every run of the notebook.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Display the topics
topics = lda_model.print_topics(num_words=3)  # Change num_words to display more/less words per topic
for topic in topics:
    print(topic)

# Example of assigning topics to documents
for i, doc in enumerate(corpus):
    print(f"Document {i + 1}: {documents[i]}")
    doc_topics = lda_model.get_document_topics(doc)
    print("Assigned Topics:", doc_topics)
    print()

### 16. Fuzzy Matching

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Sample data
choices = [
    "apple pie",
    "apple tart",
    "banana split",
    "cherry pie",
    "chocolate cake",
]

# Whole-string comparison of two strings
str1 = "apple"
str2 = "apple pie"
similarity_score = fuzz.ratio(str1, str2)
print("Similarity between '{}' and '{}': {}%".format(str1, str2, similarity_score))

# Partial comparison of two strings
str3 = "appl"
partial_score = fuzz.partial_ratio(str1, str3)
print("Partial similarity between '{}' and '{}': {}%".format(str1, str3, partial_score))

# Best single candidate from the list
query = "apple"
best_match = process.extractOne(query, choices)
print("The best match for '{}' is '{}' with a score of {}%".format(query, best_match[0], best_match[1]))

# Three best candidates from the list
results = process.extract(query, choices, limit=3)
print("Top 3 matches for '{}':".format(query))
for match in results:
    print("{} - {}%".format(match[0], match[1]))

In [None]:
!pip install fuzzywuzzy

In [None]:
import pandas as pd
from thefuzz import process, fuzz

def load_data_from_csv(file_path):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    file_path (str): The path to the CSV file.

    Returns:
    DataFrame: The contents of the file as returned by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame

def standardize_company_names(df, threshold=80):
    """
    Add a 'Standardized Company Name' column built by fuzzy matching.

    Names are visited in column order. Each new name is compared, with
    ``fuzz.partial_ratio``, against the names seen so far; if one scores at
    least ``threshold`` the new name takes over that name's standard form,
    otherwise it becomes its own standard form.

    Parameters:
    df (DataFrame): A DataFrame with a 'Company Name' column. The new column is
        added to this same DataFrame.
    threshold (int): The minimum score to consider a match (default is 80).

    Returns:
    DataFrame: ``df`` with the additional 'Standardized Company Name' column.
    """
    # original name -> standardized name
    canonical_by_name = {}

    for company in df['Company Name']:
        if company in canonical_by_name:
            continue

        # Best-scoring already-seen name, if any reaches the threshold
        best = process.extractOne(
            company,
            canonical_by_name.keys(),
            scorer=fuzz.partial_ratio,
            score_cutoff=threshold,
        )

        if not best:
            canonical_by_name[company] = company
            continue

        matched_name, _ = best
        canonical_by_name[company] = canonical_by_name[matched_name]

    # Look up the standardized form of every row
    df['Standardized Company Name'] = df['Company Name'].map(canonical_by_name)

    return df

# Example Usage
file_path = 'company_names.csv'  # Replace with your CSV file path
df = load_data_from_csv(file_path)

# Standardize only when the expected column is present
if 'Company Name' not in df.columns:
    print("The DataFrame does not contain a column named 'Company Name'.")
else:
    df_standardized = standardize_company_names(df)
    print("\nStandardized Data:\n", df_standardized)


### cosine similarity

In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# The two sentences to compare
sentence_1 = "I love programming in Python"
sentence_2 = "Python is a great language for programming"
sentences = [sentence_1, sentence_2]

# TF-IDF representation of both sentences (one row each)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)

# Cosine similarity between row 0 and row 1
first_row = tfidf_matrix[0:1]
second_row = tfidf_matrix[1:2]
cosine_sim = cosine_similarity(first_row, second_row)
print("Cosine Similarity between sentence 1 and sentence 2: {}".format(cosine_sim[0][0]))

Cosine Similarity between sentence 1 and sentence 2: 0.26055567105626243
