<a href="https://colab.research.google.com/github/dorirozen/AI_Tasks/blob/main/NlpTask1_Dori_Shlomi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Note:  
in each place that we are printing something , we only printed a slice of [0:5] for not expanding the output cell result..

In [None]:
#@title Pips
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4
!pip install nltk

In [68]:
#@title Imports
import nltk,time,spacy,csv,requests
import pandas as pd
from bs4 import BeautifulSoup
from collections import Counter
from statistics import mean
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk import pos_tag

In [None]:
#@title Download the necessary NLTK resources
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [None]:
#@title constant
NLP = spacy.load("en_core_web_sm")

In [103]:
#@title Global Functions - printListsAfterActivation & token_stem_limmatizeing & analyze_array_data & csvfile & spam2text & word_freq & print_text_statistics Functions
def analyze_array_data(arr):
    '''
    print_text_statistics for list of lists
    '''
    all_words = sum(len(sub_array) for sub_array in arr)
    word_frequencies = Counter(word for sub_array in arr for word in sub_array)
    average_count = sum(len(sub_array) for sub_array in arr) / len(arr) if arr else 0

    most_common_words = word_frequencies.most_common(5)
    unique_words = sum(1 for count in word_frequencies.values() if count == 1)

    print(f"Total words: {all_words}")
    # print(f"Word frequencies: {word_frequencies}")
    print(f"Average words per message: {average_count:.2f}")
    print(f"Most common words: {', '.join(word for word, _ in most_common_words)}")
    print(f"Unique words appearing only once: {unique_words}")

def printListsAfterActivation(text):
  tokenized_sentences = [word_tokenize(sentence) for sentence in text]

  lemmatized_sentences = []
  stemmed_sentences = []

  for sentence in tokenized_sentences:
    lemmatized_sentence = []
    stemmed_sentence = []


    doc = NLP(' '.join(sentence))
    lemmatized_sentence = [token.lemma_ for token in doc]


    stemmed_sentence = [stemmer.stem(token) for token in sentence]

    lemmatized_sentences.append(lemmatized_sentence)
    stemmed_sentences.append(stemmed_sentence)


  print("\nTokenized Sentences:")
  print(tokenized_sentences[0:5])

  print("\nLemmatized Sentences:")
  print(lemmatized_sentences[0:5])

  print("\nStemmed Sentences:")
  print(stemmed_sentences[0:5])
  print("\n\n")

def token_stem_limmatizeing(text):
  tokenized_sentences = [word_tokenize(sentence) for sentence in text]
  lemmatized_sentences = []
  stemmed_sentences = []

  for sentence in tokenized_sentences:
    doc = NLP(' '.join(sentence))
    lemmatized_sentence = [token.lemma_ for token in doc]

    stemmed_sentence = [stemmer.stem(token) for token in sentence]

    lemmatized_sentences.append(lemmatized_sentence)
    stemmed_sentences.append(stemmed_sentence)

  print("Original Text Statistics:")
  analyze_array_data(tokenized_sentences)

  print("\nLemmatized Text Statistics:")
  analyze_array_data(lemmatized_sentences)

  print("\nStemmed Text Statistics:")
  analyze_array_data(stemmed_sentences)
  print("\n\n")

def csvfile(namefile):
    if ".csv" not in namefile:
        namefile += ".csv"
    csvfile = open(namefile, 'r', newline='', encoding = "ISO-8859-1")
    return list(csv.DictReader(csvfile))


def spam2text(spam):
  return [x['v2'] for x in spam]

def word_freq(text):
  word_dict = {}
  for string in text:
    for word in string.split(' '):
      try:
        word_dict[word] +=1
      except KeyError:
        word_dict[word] = 1

  return dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))

def print_text_statistics(text):
  text_freq = word_freq(text)
  print(f"Word Count: {sum([len(x.split(' ')) for x in text])}\n" +
        f"Average Word Count: {mean([len(x.split(' ')) for x in text])}\n" +
        f"5 Most Frequent Words : {[freq for freq in text_freq.keys()][0:5]}\n" +
        f"Number of Rare Words : {len([freq for freq in text_freq.values() if freq==1])}\n\n")



In [87]:
#@title Starts program here
spam = csvfile('/content/spam.csv')
print(f"Number of SMS Messages:{len(spam)}\n" +
      f"Number of Spams: {len([x for x in spam if x['v1']=='spam'])}\n"+
      f"Number of hams: {len([x for x in spam if x['v1']=='ham'])}\n")
spam_text = spam2text(spam)
print_text_statistics(spam_text)


Number of SMS Messages:5572
Number of Spams: 747
Number of hams: 4825

Word Count: 86961
Average Word Count: 15.60678391959799
5 Most Frequent Words : ['to', 'you', 'I', 'a', 'the']
Number of Rare Words : 9270




In [88]:
#@title tokenize with nltk and spacy
# Tokenize the SMS text using NLTK
start_time = time.perf_counter()
tokenized_sms_nltk = [word_tokenize(sms) for sms in spam_text]
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Nltk Execution time: {execution_time:.2f} seconds\n\n")

print("NLTK Tokenization:\n\n")
sms_nltk = lambda i: print(tokenized_sms_nltk[i])
for i in range(4):
    sms_nltk(i)

# Tokenize the SMS text using Spacy
start_time = time.perf_counter()
tokenized_sms_spacy = [[token.text for token in NLP(sms)] for sms in spam_text]
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Spacy Execution time: {execution_time:.2f} seconds\n\n")

print("spaCy Tokenization:\n")
sms_spacy = lambda i: print(tokenized_sms_spacy[i])
for i in range(4):
    sms_spacy(i)


Nltk Execution time: 1.00 seconds


NLTK Tokenization:


['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...']
['Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s"]
['U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...']
Spacy Execution time: 41.33 seconds


spaCy Tokenization:

['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...']
['Free', 'entry', 'in',

In [106]:
#@title lemmatizer part
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


# Tokenize and lemmatize using NLTK
lemmatized_sms_nltk = []
for sms in spam_text:
    tokens = word_tokenize(sms)
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]
    lemmatized_sms_nltk.append(lemmatized_tokens)

print("NLTK Lemmatization:\n")
print(lemmatized_sms_nltk[0:5])
print("\n\n")

lemmatized_sms_spacy = [[token.lemma_ for token in NLP(sms)] for sms in spam_text]

print("spaCy Lemmatization:\n")
print(lemmatized_sms_spacy[0:5])
print("\n\n")
# Time Complexity Analysis:
# NLTK Lemmatization:
# - Tokenization: O(m), where m is the number of characters.
# - POS Tagging: O(n), where n is the number of tokens.
# - Lemmatization: O(n), where n is the number of tokens.
# Overall: O(m) + O(n) + O(n) = O(n), considering m ≈ n.

# spaCy Lemmatization:
# - Tokenization, POS Tagging, Lemmatization: O(n), where n is the number of tokens.
# Overall: O(n).

NLTK Lemmatization:

[['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...'], ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...'], ['Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s"], ['U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...'], ['Nah', 'I', 'do', "n't", 'think', 'he', 'go', 'to', 'usf', ',', 'he', 'live', 'around', 'here', 'though']]



spaCy Lemmatization:

[['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...'], ['ok', 'lar', '...', 'joke', 'wif',

In [107]:
#@title stemmer part
# Initialize the stemmer
stemmer = PorterStemmer()


# Tokenize and stem using NLTK
stemmed_sms_nltk = []
for sms in spam_text:
    tokens = word_tokenize(sms)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_sms_nltk.append(stemmed_tokens)

print("NLTK Stemming:\n")
print(stemmed_sms_nltk[0:5])
print("\n\n")

# Tokenize using spaCy and stem using NLTK
stemmed_sms_spacy = []
for sms in spam_text:
    doc = NLP(sms)
    stemmed_tokens = [stemmer.stem(token.text) for token in doc]
    stemmed_sms_spacy.append(stemmed_tokens)

print("spaCy (with NLTK Stemming):\n")
print(stemmed_sms_spacy[0:5])
print("\n\n")
# Time Complexity Analysis:
# NLTK Stemming:
# - Tokenization: O(m), where m is the number of characters.
# - Stemming: O(n), where n is the number of tokens.
# Overall: O(m) + O(n) = O(n), considering m ≈ n.

# spaCy Stemming (using NLTK):
# - Tokenization: O(m), where m is the number of characters.
# - Stemming: O(n), where n is the number of tokens.
# Overall: O(n).

NLTK Stemming:

[['go', 'until', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amor', 'wat', '...'], ['ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', '...'], ['free', 'entri', 'in', '2', 'a', 'wkli', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receiv', 'entri', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'s", 'appli', '08452810075over18', "'s"], ['u', 'dun', 'say', 'so', 'earli', 'hor', '...', 'u', 'c', 'alreadi', 'then', 'say', '...'], ['nah', 'i', 'do', "n't", 'think', 'he', 'goe', 'to', 'usf', ',', 'he', 'live', 'around', 'here', 'though']]



spaCy (with NLTK Stemming):

[['go', 'until', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amor', 'wat', '...'], ['ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', 

In [108]:
print("Stemming with nltk:\n\n")
analyze_array_data(stemmed_sms_nltk)
print("\n")
print("Stemming with spacy:\n\n")
analyze_array_data(stemmed_sms_spacy)


Stemming with nltk:


Total words: 104193
Average words per message: 18.70
Most common words: ., i, to, you, ,
Unique words appearing only once: 4179


Stemming with spacy:


Total words: 103533
Average words per message: 18.58
Most common words: ., i, to, you, ,
Unique words appearing only once: 4252


In [109]:
print("Lemmatized with nltk:\n\n")
analyze_array_data(lemmatized_sms_nltk)
print("\n")
print("Lemmatized with spacy:\n\n")
analyze_array_data(lemmatized_sms_spacy)


Lemmatized with nltk:


Total words: 104193
Average words per message: 18.70
Most common words: ., be, to, I, you
Unique words appearing only once: 5693


Lemmatized with spacy:


Total words: 103533
Average words per message: 18.58
Most common words: ., I, be, to, you
Unique words appearing only once: 5359


In [113]:
#@title BeautifulSoup part
websites = ['https://afgprogrammer.com/flutter/',
            'https://huggingface.co/docs/transformers/model_doc/hubert']

scrap_sentences = []

for web in websites:
    soup = BeautifulSoup(requests.get(web).text,'html.parser')
    for p in soup.find_all('p'):
        scrap_sentences.append(p.text)


print("Scraped Sentences:\n")
print(scrap_sentences[0:5])

Scraped Sentences:

['Search in 100+ Flutter Examples ', 'Day 64 of Flutter 100 days of code.', 'Day 63 of Flutter 100 days of code.', 'Day 62 of Flutter 100 days of code.', 'Day 61 of Flutter 100 days of code.']


In [114]:
printListsAfterActivation(scrap_sentences)
token_stem_limmatizeing(scrap_sentences)


Tokenized Sentences:
[['Search', 'in', '100+', 'Flutter', 'Examples'], ['Day', '64', 'of', 'Flutter', '100', 'days', 'of', 'code', '.'], ['Day', '63', 'of', 'Flutter', '100', 'days', 'of', 'code', '.'], ['Day', '62', 'of', 'Flutter', '100', 'days', 'of', 'code', '.'], ['Day', '61', 'of', 'Flutter', '100', 'days', 'of', 'code', '.']]

Lemmatized Sentences:
[['search', 'in', '100', '+', 'flutter', 'example'], ['day', '64', 'of', 'Flutter', '100', 'day', 'of', 'code', '.'], ['day', '63', 'of', 'Flutter', '100', 'day', 'of', 'code', '.'], ['day', '62', 'of', 'Flutter', '100', 'day', 'of', 'code', '.'], ['day', '61', 'of', 'Flutter', '100', 'day', 'of', 'code', '.']]

Stemmed Sentences:
[['search', 'in', '100+', 'flutter', 'exampl'], ['day', '64', 'of', 'flutter', '100', 'day', 'of', 'code', '.'], ['day', '63', 'of', 'flutter', '100', 'day', 'of', 'code', '.'], ['day', '62', 'of', 'flutter', '100', 'day', 'of', 'code', '.'], ['day', '61', 'of', 'flutter', '100', 'day', 'of', 'code', '.']]


In [111]:
chat = open("/content/whatsup.txt", 'r')
chat_text = chat.readlines()
print(chat_text[0:5])


['הוא מוגש עם המסמך הסופי בתאריך שהוגדר לכם.\n', 'בדמו או בספר הפרויקט?\n', 'יש שם 2 שלבים\n', ' למיטב זכרוני, מועד הגשת הטיוטה הוא בשביל להגיע מוכנים לתערוכה, כלומר - עד אליו.\n', 'אחרי כן יהיה זמן לתיקונים קטנים מעטים במידת הצורך.\n']


In [115]:
printListsAfterActivation(chat_text)
token_stem_limmatizeing(chat_text)


Tokenized Sentences:
[['הוא', 'מוגש', 'עם', 'המסמך', 'הסופי', 'בתאריך', 'שהוגדר', 'לכם', '.'], ['בדמו', 'או', 'בספר', 'הפרויקט', '?'], ['יש', 'שם', '2', 'שלבים'], ['למיטב', 'זכרוני', ',', 'מועד', 'הגשת', 'הטיוטה', 'הוא', 'בשביל', 'להגיע', 'מוכנים', 'לתערוכה', ',', 'כלומר', '-', 'עד', 'אליו', '.'], ['אחרי', 'כן', 'יהיה', 'זמן', 'לתיקונים', 'קטנים', 'מעטים', 'במידת', 'הצורך', '.']]

Lemmatized Sentences:
[['הוא', 'מוגש', 'עם', 'המסמך', 'הסופי', 'בתאריך', 'שהוגדר', 'לכם', '.'], ['בדמו', 'או', 'בספר', 'הפרויקט', '?'], ['יש', 'שם', '2', 'שלבים'], ['למיטב', 'זכרוני', ',', 'מועד', 'הגשת', 'הטיוטה', 'הוא', 'בשביל', 'להגיע', 'מוכנים', 'לתערוכה', ',', 'כלומר', '-', 'עד', 'אליו', '.'], ['אחרי', 'כן', 'יהיה', 'זמן', 'לתיקונים', 'קטנים', 'מעטים', 'במידת', 'הצורך', '.']]

Stemmed Sentences:
[['הוא', 'מוגש', 'עם', 'המסמך', 'הסופי', 'בתאריך', 'שהוגדר', 'לכם', '.'], ['בדמו', 'או', 'בספר', 'הפרויקט', '?'], ['יש', 'שם', '2', 'שלבים'], ['למיטב', 'זכרוני', ',', 'מועד', 'הגשת', 'הטיוטה', 'הוא', 'בשביל', 'ל

In [None]:
%%shell
jupyter nbconvert --to html /content/NlpTask1_Dori_Shlomi.ipynb