In [None]:
# all necessary imports
import re

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [None]:
# Download only the NLTK resources this notebook uses, rather than the whole
# 'all' collection (every corpus and model NLTK distributes):
#   - 'vader_lexicon'      -> required by SentimentIntensityAnalyzer
#   - 'punkt' / 'punkt_tab' -> required by word_tokenize; older NLTK releases
#                              look up 'punkt', newer ones look up 'punkt_tab'
# Each resource is requested in its own call so that an id unknown to the
# installed NLTK version does not stop the remaining downloads.
for nltk_resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Load the raw analyst-ratings CSV into the notebook's working DataFrame.
raw_ratings_csv = '../data/raw_analyst_ratings.csv'
df = pd.read_csv(raw_ratings_csv)

In [None]:
# Preview the first five rows of the loaded data.
df.iloc[:5]

In [None]:
# Headline length: number of characters in each individual headline.
# `len(df['headline'])` is the row count of the whole column, which would be
# broadcast as the same constant to every row; the vectorised string accessor
# measures every headline separately (missing headlines stay NaN).
df['headline_len'] = df['headline'].str.len()

In [None]:
# Descriptive statistics of the 'headline_len' column, shown under a heading.
print("\n\tHeadline Stats:\n")
headline_stats = df.loc[:, 'headline_len'].describe()
headline_stats

In [None]:
# Articles Per Publisher: bar chart of the ten publishers with the most articles.
pub_counts = df['publisher'].value_counts().head(10)

print("\n\tArticles per publisher:\n")

plt.figure(figsize=(5, 3))
pub_counts.plot(kind='bar')
plt.title("Top 10 publishers ")
# The x label used to be set twice in a row; only the last call had any
# effect, so the single effective label is kept.
plt.xlabel("Publishers")
plt.ylabel("Number of Articles")
# Render explicitly, as the other plotting cells in this notebook do, so the
# cell does not echo the repr of the last matplotlib call.
plt.show()

In [None]:
# Parse the 'date' column into UTC timestamps; format='mixed' lets pandas
# infer the format of each value individually.
parsed_dates = pd.to_datetime(df['date'], format='mixed', utc=True)
df['date'] = parsed_dates

# Calendar features derived from the parsed timestamps.
df['day_of_week'] = parsed_dates.dt.day_name()
df['monthly'] = parsed_dates.dt.month_name()
df['yearly'] = parsed_dates.dt.year

In [None]:
# Number of articles per weekday name, month name and year
# (value_counts orders each result by descending frequency).
pub_week_days = df['day_of_week'].value_counts()
pub_month = df['monthly'].value_counts()
pub_year = df['yearly'].value_counts()

# Print every table under its own heading.
for heading, counts in (
    ("\nArticles per Day of the Week:", pub_week_days),
    ("\nArticles per Month:", pub_month),
    ("\nArticles per Year:", pub_year),
):
    print(heading)
    print(counts)

In [None]:
# sentiment analysis
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for one headline."""
    return sia.polarity_scores(text)['compound']


def _sentiment_label(score):
    """Label a compound score by its sign: 'positive', 'negative' or 'neutral'."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


df['sentiment'] = df['headline'].apply(_compound_score)
df['sentiment_category'] = df['sentiment'].apply(_sentiment_label)

In [None]:
# Show the first ten headlines next to their assigned sentiment category.
df.loc[:, ['headline', 'sentiment_category']].head(10)

In [None]:
print('\n\tVisualizing The sentiment result\n')

# Count plot of how many headlines fall into each sentiment category,
# drawn on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='sentiment_category', ax=ax)
ax.set_title('Sentiment Analysis of Headlines')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Tokenise every headline; `tokens` holds one list of tokens per headline.
tokens = [word_tokenize(headline) for headline in df['headline']]

# Flatten the per-headline lists into a single token stream for unigram counts.
flat_tokens = [token for headline_tokens in tokens for token in headline_tokens]

# Create a frequency distribution of the tokens
freq_dist = FreqDist(flat_tokens)

print("\n\tMost Common Tokens:")
for word, frequency in freq_dist.most_common(10):
    print(f"{word}: {frequency}")

# Identify bigrams and trigrams *within* each headline. Building them from the
# flattened stream would also pair the last token(s) of one headline with the
# first token(s) of the next one, counting n-grams that never occur in any
# headline. Headlines shorter than the n-gram size simply contribute nothing.
bigrams = (
    gram for headline_tokens in tokens for gram in nltk.bigrams(headline_tokens)
)
trigrams = (
    gram for headline_tokens in tokens for gram in nltk.trigrams(headline_tokens)
)

# Frequency distributions for bigrams and trigrams (this consumes the generators).
bigram_freq_dist = FreqDist(bigrams)
trigram_freq_dist = FreqDist(trigrams)

# Print the most common bigrams and trigrams
print("\n\tMost Common Bigrams:")
for bigram, frequency in bigram_freq_dist.most_common(10):
    print(f"{bigram}: {frequency}")

print("\n\tMost Common Trigrams:")
for trigram, frequency in trigram_freq_dist.most_common(10):
    print(f"{trigram}: {frequency}")


In [None]:

def _plot_article_counts(counts, title, xlabel, figsize):
    """Draw `counts` as a bar chart on a new figure of size `figsize` and show it."""
    plt.figure(figsize=figsize)
    counts.plot(kind='bar')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Number of Articles')
    plt.show()


df['date'] = pd.to_datetime(df['date'], format='mixed', utc=True)

# Numeric calendar components of every publication timestamp.
timestamps = df['date']
df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.day
df['weekday'] = timestamps.dt.weekday

# Articles published per year
articles_per_year = df.groupby('year')['headline'].count()
_plot_article_counts(articles_per_year, 'Articles Published per Year', 'Year', (5, 3))

# Articles published per month
articles_per_month = df.groupby('month')['headline'].count()
_plot_article_counts(articles_per_month, 'Articles Published per Month', 'Month', (10, 6))

# Articles published per day of the week
articles_per_weekday = df.groupby('weekday')['headline'].count()
_plot_article_counts(
    articles_per_weekday,
    'Articles Published per Day of the Week',
    'Day of the Week',
    (10, 6),
)

In [None]:
# Most article publishers: the five publishers with the highest article counts.
top_publishers = df['publisher'].value_counts().head(5)

# Draw the counts as a bar chart on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(5, 3))
top_publishers.plot.bar(ax=ax)
ax.set_title('Top 5 most Article Publishers ')
ax.set_xlabel('Publishers')
ax.set_ylabel('number of articles')
plt.show()

In [None]:
# Sentiment breakdown for the publishers with the most articles.
# Taking `.head(50)` of the grouped counts kept an arbitrary alphabetical slice
# of (publisher, category) rows and could cut the last publisher's categories
# in half; selecting whole publishers by article volume avoids both problems.
top_publisher_names = df['publisher'].value_counts().head(10).index

publisher_sentiment = (
    df[df['publisher'].isin(top_publisher_names)]
    .groupby('publisher')['sentiment_category']
    .value_counts()
)

# DataFrame.plot opens its own figure unless it is handed an axes object, so a
# preceding bare plt.figure(...) would stay empty and its figsize would be
# ignored. Create the axes explicitly and pass it in instead.
fig, ax = plt.subplots(figsize=(10, 6))
# fill_value=0 draws a zero-height bar for a category a publisher never used.
publisher_sentiment.unstack(fill_value=0).plot(kind='bar', ax=ax)
ax.set_title('sentiment Analysis by publishers Publishers ')
ax.set_xlabel('Publishers')
ax.set_ylabel('number of articles')
plt.show()

In [None]:

# Email domains from publisher names: everything after the first '@'.
# The vectorised `.str.extract` yields NaN both for publishers without an '@'
# and for missing publisher values (which would raise TypeError inside
# `re.search`); those rows are dropped. The result stays a plain list.
publisher_domains = (
    df['publisher']
    .str.extract(r'@(.*)', expand=False)
    .dropna()
    .tolist()
)

# Count the frequency of each domain
domain_counts = pd.Series(publisher_domains).value_counts()

# top 5 domains with the most articles
top_domains = domain_counts.head(5)

# Plot the number of articles for the top 5 domains
if top_domains.empty:
    # Nothing to draw when no publisher value contains an '@'.
    print("No email domains found in publisher names.")
else:
    plt.figure(figsize=(10, 6))
    top_domains.plot(kind='bar')
    plt.title('Top 5 Domains with the Most Articles')
    plt.xlabel('Domain')
    plt.ylabel('Number of Articles')
    plt.show()
