In [None]:
# pip install numpy==1.24

In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from spacy import displacy
from spacy.matcher import PhraseMatcher
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from spacy.lang.en import English

In [74]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kbeni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kbeni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kbeni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kbeni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [95]:
data = pd.read_pickle('mergerdata.pkl')
# data = pd.read_json('merger_data.json')

In [96]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825773 entries, 0 to 825772
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   date_completion           506135 non-null  object
 1   bidders_name              825773 non-null  object
 2   date_completion_assumed   161144 non-null  object
 3   date_last_status_update   175350 non-null  object
 4   status                    825773 non-null  object
 5   bidders_industry          825773 non-null  object
 6   id                        825773 non-null  int64 
 7   comments                  825773 non-null  object
 8   targets_industry          825773 non-null  object
 9   targets_isin              825773 non-null  object
 10  date_rumor                825773 non-null  object
 11  date_postponed            214 non-null     object
 12  date_announcement         696242 non-null  object
 13  bidders_isin              825773 non-null  object
 14  date

In [97]:
# filter data from before 2010-01-01
data = data[data['date_rumor'] >= '2010-01-01']

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 519721 entries, 1 to 825772
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   date_completion           302059 non-null  object
 1   bidders_name              519721 non-null  object
 2   date_completion_assumed   108961 non-null  object
 3   date_last_status_update   103980 non-null  object
 4   status                    519721 non-null  object
 5   bidders_industry          519721 non-null  object
 6   id                        519721 non-null  int64 
 7   comments                  519721 non-null  object
 8   targets_industry          519721 non-null  object
 9   targets_isin              519721 non-null  object
 10  date_rumor                519721 non-null  object
 11  date_postponed            104 non-null     object
 12  date_announcement         434568 non-null  object
 13  bidders_isin              519721 non-null  object
 14  date_comp

In [99]:
# filter industries by keywords
industries_data = pd.read_csv('industries.csv')

keywords_auto_oil = ["automobile", "vehicle", "car", "motor", "engine", "oil", "gas", "fuel", "petroleum"]
relevant_rows = industries_data[
    industries_data['description'].str.contains('|'.join(keywords_auto_oil), case=False, na=False)
]

# Extract the unique codes for these descriptions
relevant_codes = relevant_rows['code'].unique()

# change relevant_codes to strings
relevant_codes = [str(code) for code in relevant_codes]

relevant_codes = set(relevant_codes)

In [100]:
import ast


# Function to check if any relevant code is included
def compare_if_included(industries):
    if not industries:  # Handle empty or None values gracefully
        return False
    try:
        # Parse the entry if it's a string representation of a list
        if isinstance(industries, str):
            industries = ast.literal_eval(industries)
        # Check if any industry code matches the relevant codes
        return any(str(industry) in relevant_codes for industry in industries)
    except (ValueError, SyntaxError, TypeError):
        # Return False if parsing fails or industries is malformed
        return False

# Apply the function to the 'targets_industry' column
data['has_relevant_code'] = data['targets_industry'].apply(compare_if_included)

In [101]:
# Filter data to only include rows with relevant codes
data = data[data['has_relevant_code']]

In [102]:
data.drop(columns=['has_relevant_code'], inplace=True)

In [103]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36135 entries, 17 to 825761
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   date_completion           22097 non-null  object
 1   bidders_name              36135 non-null  object
 2   date_completion_assumed   7857 non-null   object
 3   date_last_status_update   7328 non-null   object
 4   status                    36135 non-null  object
 5   bidders_industry          36135 non-null  object
 6   id                        36135 non-null  int64 
 7   comments                  36135 non-null  object
 8   targets_industry          36135 non-null  object
 9   targets_isin              36135 non-null  object
 10  date_rumor                36135 non-null  object
 11  date_postponed            3 non-null      object
 12  date_announcement         31677 non-null  object
 13  bidders_isin              36135 non-null  object
 14  date_completion_expected 

In [75]:
# Step 3: Sentence Tokenization (using NLTK)
data['sentences'] = data['comments'].apply(lambda x: sent_tokenize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentences'] = data['comments'].apply(lambda x: sent_tokenize(x))


In [76]:
# Step 4: Lowercasing and Removing Punctuation
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation (SpaCy already handles punctuation during processing)
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    
    return text

In [77]:
# Apply cleaning function to each sentence
data['cleaned_sentences'] = data['sentences'].apply(lambda sentences: [clean_text(sentence) for sentence in sentences])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cleaned_sentences'] = data['sentences'].apply(lambda sentences: [clean_text(sentence) for sentence in sentences])


In [78]:
# Step 5: Stemming (optional, more aggressive than Lemmatization)
def apply_stemming(sentence):
    ps = PorterStemmer()
    words = word_tokenize(sentence)
    return ' '.join([ps.stem(word) for word in words])

In [79]:
# Apply stemming to each sentence
data['stemmed_sentences'] = data['cleaned_sentences'].apply(lambda sentences: [apply_stemming(sentence) for sentence in sentences])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['stemmed_sentences'] = data['cleaned_sentences'].apply(lambda sentences: [apply_stemming(sentence) for sentence in sentences])


In [80]:
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [81]:
nlp = spacy.load('en_core_web_sm')

In [82]:
# Step 6: Lemmatization using SpaCy
def lemmatize_sentence(sentence):
    doc = nlp(sentence)
    return ' '.join([token.lemma_ for token in doc])

In [83]:
# Apply Lemmatization
data['lemmatized_sentences'] = data['cleaned_sentences'].apply(lambda sentences: [lemmatize_sentence(sentence) for sentence in sentences])

KeyboardInterrupt: 

In [None]:
data.head()

In [None]:
for text in data['lemmatized_sentences']:
    print(text)

In [None]:
# Step 7: Part-of-Speech (POS) Tagging using NLTK (optional, more for understanding structure)
def pos_tagging(sentence):
    return pos_tag(word_tokenize(sentence))

In [None]:
# Apply POS tagging
data['pos_tagged_sentences'] = data['cleaned_sentences'].apply(lambda sentences: [pos_tagging(sentence) for sentence in sentences])

In [None]:
# Step 8: Dependency Parsing using SpaCy
def dependency_parsing(sentence):
    doc = nlp(sentence)
    return [(token.text, token.dep_, token.head.text) for token in doc]

In [None]:
# Apply Dependency Parsing
data['dependency_parsed_sentences'] = data['cleaned_sentences'].apply(lambda sentences: [dependency_parsing(sentence) for sentence in sentences])

In [None]:
data.head()

In [None]:
data.to_json('prepro__mergerdata.json')

In [None]:
data.to_csv('prepro_mergerdata.csv')

In [None]:
for bidder in data['bidders_name']:
    print(bidder)