In [1]:
# Text Analysis Notebook

# Step 1: Import necessary libraries and modules
import os
import sys
import pandas as pd

# Put the sibling ../src directory (resolved against the current working
# directory) at the front of the import search path so that modules living
# there can be imported by the statements that follow.
src_dir = os.path.abspath(os.path.join('..', 'src'))
sys.path.insert(0, src_dir)

# Import the TextAnalysis class
from text_analysis import TextAnalysis

# Step 2: Load the data
# The CSV is expected under ../data, resolved against the current working directory.
base_path = os.path.abspath(os.path.join(os.path.dirname(''), '..', 'data'))
data_path = os.path.join(base_path, 'raw_analyst_ratings.csv/raw_analyst_ratings.csv')

# Fail fast when the file is missing. Merely printing a message would leave
# `dataframe` undefined, and the next step would then die with an unrelated
# NameError that hides the real cause.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data file not found: {data_path}")

dataframe = pd.read_csv(data_path)
print("Data Loaded Successfully")

# Step 3: Wrap the loaded frame in TextAnalysis and clean the headline text
text_analysis = TextAnalysis(dataframe)

# clean_text is handed the column name; its result is kept for inspection
cleaned_headlines = text_analysis.clean_text('headline')
print("Text data cleaned successfully")

# Preview the leading rows of the cleaned result
headline_preview = cleaned_headlines.head()
display(headline_preview)

# Step 4: Extract and display common phrases (bigrams)
# One constant drives both the query size and the printed heading, so the
# label can never disagree with the number of phrases actually requested.
TOP_N_BIGRAMS = 10

# ngram_range=(2, 2) asks for two-word phrases only, on the 'headline' column.
common_phrases = text_analysis.get_common_phrases(
    'headline', ngram_range=(2, 2), n=TOP_N_BIGRAMS
)

# Each item is unpacked as a (phrase, count) pair and printed on its own line.
print(f"Top {TOP_N_BIGRAMS} Common Bigrams in Headlines:")
for phrase, count in common_phrases:
    print(f"{phrase}: {count}")


Data Loaded Successfully
Text data cleaned successfully


0              stocks that hit 52 week highs on friday
1           stocks that hit 52 week highs on wednesday
2                        71 biggest movers from friday
3         46 stocks moving in friday s mid day session
4    b of a securities maintains neutral on agilent...
Name: headline, dtype: object

Top 10 Common Bigrams in Headlines:
52 week: 51006
price target: 47274
benzinga top: 44259
are trading: 40497
stocks moving: 39978
moving in: 39743
mid day: 37324
shares of: 34841
market update: 33101
scheduled for: 32252
