In [None]:
pip install numpy==1.24

In [None]:
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from spacy import displacy
from spacy.matcher import PhraseMatcher
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from spacy.lang.en import English

In [None]:
# Download necessary NLTK data.
# 'punkt' and 'averaged_perceptron_tagger' are the legacy resource names; recent NLTK
# releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead, so both
# generations are fetched to keep sent_tokenize / word_tokenize / pos_tag working
# regardless of the installed NLTK version.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
# Load the input DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load 'mergerdata.pkl'
# if it comes from a trusted source.
data = pd.read_pickle('mergerdata.pkl')

In [None]:
# Show the loaded frame as this cell's output for a quick visual check.
data

In [None]:
# Step 3: Sentence Tokenization (using NLTK)
# Each value of 'comments' is passed to sent_tokenize; the result is stored per row.
data['sentences'] = data['comments'].apply(sent_tokenize)

In [None]:
# Step 4: Lowercasing and Removing Punctuation
def clean_text(text):
    """Return ``text`` lowercased with every non-alphanumeric, non-whitespace character dropped.

    Characters are removed without a replacement, so tokens joined by
    punctuation are fused (e.g. "don't" becomes "dont").

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string containing only alphanumeric and whitespace characters.
    """
    def _is_kept(ch):
        # Keep letters, digits and any whitespace; everything else is discarded.
        return ch.isalnum() or ch.isspace()

    lowered = text.lower()
    return ''.join(filter(_is_kept, lowered))

In [None]:
# Run clean_text over every sentence of every row, keeping one list per row
data['cleaned_sentences'] = data['sentences'].apply(
    lambda row_sentences: list(map(clean_text, row_sentences))
)

In [None]:
# Step 5: Stemming (optional, more aggressive than Lemmatization)
def apply_stemming(sentence):
    """Stem every token of ``sentence`` with NLTK's PorterStemmer.

    Args:
        sentence: The sentence string to stem.

    Returns:
        The stemmed tokens joined back together with single spaces.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(sentence)
    stems = map(stemmer.stem, tokens)
    return ' '.join(stems)

In [None]:
# Stem each cleaned sentence, producing one list of stemmed sentences per row
data['stemmed_sentences'] = data['cleaned_sentences'].apply(
    lambda row_sentences: list(map(apply_stemming, row_sentences))
)

In [None]:
# Download the small English pipeline only when it is not already installed,
# so re-running the notebook does not invoke the installer on every execution.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is the module-level object that
# lemmatize_sentence and dependency_parsing call further down.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Step 6: Lemmatization using SpaCy
def lemmatize_sentence(sentence):
    """Replace each token of ``sentence`` with its spaCy lemma.

    Uses the module-level ``nlp`` pipeline.

    Args:
        sentence: The sentence string to lemmatize.

    Returns:
        The lemmas of all tokens joined with single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(sentence))
    return ' '.join(lemmas)

In [None]:
# Lemmatize each cleaned sentence, producing one list of lemmatized sentences per row
data['lemmatized_sentences'] = data['cleaned_sentences'].apply(
    lambda row_sentences: list(map(lemmatize_sentence, row_sentences))
)

In [None]:
# Preview the first rows, now including the sentence, cleaned, stemmed and lemmatized columns.
data.head()

In [None]:
# Print each row's list of lemmatized sentences, one row per line
for lemmatized_row in data['lemmatized_sentences']:
    print(lemmatized_row)

In [None]:
# Step 7: Part-of-Speech (POS) Tagging using NLTK (optional, more for understanding structure)
def pos_tagging(sentence):
    """Tokenize ``sentence`` with NLTK and tag the tokens with ``pos_tag``.

    Args:
        sentence: The sentence string to tag.

    Returns:
        Whatever ``pos_tag`` returns for the word-tokenized sentence.
    """
    tokens = word_tokenize(sentence)
    return pos_tag(tokens)

In [None]:
# POS-tag each cleaned sentence, producing one list of tagged sentences per row
data['pos_tagged_sentences'] = data['cleaned_sentences'].apply(
    lambda row_sentences: list(map(pos_tagging, row_sentences))
)

In [None]:
# Step 8: Dependency Parsing using SpaCy
def dependency_parsing(sentence):
    """Parse ``sentence`` with the module-level ``nlp`` pipeline.

    Args:
        sentence: The sentence string to parse.

    Returns:
        A list with one ``(token text, dependency label, head token text)``
        tuple per token, in document order.
    """
    triples = []
    for tok in nlp(sentence):
        triples.append((tok.text, tok.dep_, tok.head.text))
    return triples

In [None]:
# Dependency-parse each cleaned sentence, producing one list of parses per row
data['dependency_parsed_sentences'] = data['cleaned_sentences'].apply(
    lambda row_sentences: list(map(dependency_parsing, row_sentences))
)

In [None]:
# Preview the first rows again, now with the POS-tag and dependency-parse columns added.
data.head()

In [None]:
# Persist the processed frame as JSON.
# NOTE(review): this filename has a double underscore ('prepro__'), unlike the CSV
# export's 'prepro_mergerdata.csv' — confirm which spelling downstream readers expect.
data.to_json('prepro__mergerdata.json')

In [None]:
# Persist the same frame as CSV.
# NOTE(review): several columns hold Python lists (sentences, tags, parses); presumably
# they end up as plain text in the CSV — confirm consumers can parse them back.
data.to_csv('prepro_mergerdata.csv')

In [None]:
# List every value of the 'bidders_name' column, one per line
for bidder_name in data['bidders_name']:
    print(bidder_name)