This notebook loads the "US Economic News" dataset, runs an initial exploratory data analysis (EDA), and cleans the data for later use.

In [18]:
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt

## 1: EDA and Preprocessing

Initial EDA

In [2]:
# Load the raw CSV; the file is decoded as ISO-8859-1.
df_news = pd.read_csv('US-Economic-News.csv', sep=',', encoding='ISO-8859-1')

# Show the column index and the (rows, columns) shape, separated by a blank line.
print(df_news.columns, df_news.shape, sep='\n\n')


Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'positivity', 'positivity:confidence', 'relevance',
       'relevance:confidence', 'articleid', 'date', 'headline',
       'positivity_gold', 'relevance_gold', 'text'],
      dtype='object')

(8000, 15)


In [None]:
# Preview the first 20 rows of the raw frame.
df_news.iloc[:20]

In [4]:
# Profile the raw frame, export the report as an HTML file, then render it
# inline as the cell's last expression.
profile = ProfileReport(df=df_news, title="News Profile Report")
profile.to_file('your_report.html')
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
# Build the working frame `df` from the raw data, indexed by '_unit_id'.
# set_index/drop return new frames, so `df_news` itself is left untouched and
# this cell can be re-run safely.
df = (
    df_news
    .set_index('_unit_id')
    .drop(columns=[
        # noted by the author as 100% missing
        'positivity_gold', 'relevance_gold',
        # noted by the author as holding a constant value
        '_golden', '_unit_state', '_trusted_judgments',
    ])
)


In [14]:
# Profile the cleaned frame `df`; the report renders inline because it is the
# cell's last expression.
profile = ProfileReport(df, title='report')
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [25]:
# NOTE(review): the original note read "Positivity and Positivity Correlation
# have high correlation" — presumably this refers to the 'positivity' and
# 'positivity:confidence' columns; confirm against the profiling report.
# 'positivity' has missing values that need to be dealt with, so inspect how it
# relates to 'relevance' and how it is distributed before choosing an
# imputation method.

# Mean positivity per relevance class (displayed in a later cell).
positivity_relevance_relation = df.groupby('relevance')['positivity'].mean().reset_index()

# Apply the style BEFORE creating the figure: style settings are read when the
# figure and axes are created, so a figure created first would keep the
# previously active figure-level settings.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(10, 6))

# Histogram of the non-missing scores only.
ax.hist(df['positivity'].dropna(), bins=10, edgecolor='black')

# Labels
ax.set_title("Distribution of positivity scores")
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")
plt.show()

  plt.show()


In [24]:

# Up to five article texts at each extreme of the positivity score.
# Rows with a missing score never match, because NaN compares unequal.
high_positivity_examples = df.loc[df['positivity'].eq(df['positivity'].max()), 'text'].head()
low_positivity_examples = df.loc[df['positivity'].eq(df['positivity'].min()), 'text'].head()

# Show the per-relevance summary together with both samples as one tuple.
positivity_relevance_relation, high_positivity_examples, low_positivity_examples

(  relevance  positivity
 0        no         NaN
 1  not sure         NaN
 2       yes    4.985211,
 _unit_id
 842615927    DETROIT -- The economy is so good in Michigan ...
 842616831    What a ride! With the economy behaving in almo...
 Name: text, dtype: object,
 _unit_id
 842613518    Author: Ben Levisohn</br></br>Bond yields hit ...
 842613738    Credit-ratings firms say they plan to delay is...
 842613823    The U.S. dollar declined against major foreign...
 842613833    NEW YORK -- Currency traders may be in for a b...
 842613886    When stocks collapsed in a free fall last May,...
 Name: text, dtype: object)

In [None]:
# Mean 'positivity' for each 'relevance' class, as a two-column frame
# ('relevance', 'positivity').
positivity_relevance_relation = df.groupby('relevance', as_index=False)['positivity'].mean()

In [None]:
# Number of articles (non-null 'articleid' values) in each relevance class.
df_news.groupby('relevance')['articleid'].count()

In [None]:
# Distinct values in the raw '_trusted_judgments' column.
pd.unique(df_news['_trusted_judgments'])


In [None]:
# Number of raw rows whose 'positivity' score is missing.
int(df_news['positivity'].isna().sum())

In [None]:
# Total number of rows in the raw frame.
df_news.shape[0]