In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os, sys
import re
import json
import glob
import datetime
from collections import Counter

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from wordcloud import WordCloud

In [3]:
os.chdir('..')

In [4]:
from src.loader import NewsDataLoader
import src.countries_region as countries_region 

In [5]:
from src.config import cfg

Output File: news_output.csv
Path: data
News Data: rating.csv
Traffic Data: trafiic.csv
Domain Location Data: domains_location.csv


In [6]:
import src.utils as utils

In [7]:
news_data_loader = NewsDataLoader(cfg.path)

In [8]:
# get news csv file
news = news_data_loader.get_news()

# get traffic csv file
traffic = news_data_loader.get_traffic()

# get domain location data csv file
domain = news_data_loader.get_domain_location()

In [9]:
df = pd.read_csv(news)
traffic_df = pd.read_csv(traffic)
domain_df = pd.read_csv(domain)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58356 entries, 0 to 58355
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   article_id       58356 non-null  int64 
 1   source_id        17771 non-null  object
 2   source_name      58356 non-null  object
 3   author           56193 non-null  object
 4   title            58356 non-null  object
 5   description      58346 non-null  object
 6   url              58356 non-null  object
 7   url_to_image     54905 non-null  object
 8   published_at     58356 non-null  object
 9   content          58356 non-null  object
 10  category         58335 non-null  object
 11  article          58356 non-null  object
 12  title_sentiment  58356 non-null  object
dtypes: int64(1), object(12)
memory usage: 5.8+ MB


In [11]:
traffic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   GlobalRank      1000000 non-null  int64 
 1   TldRank         1000000 non-null  int64 
 2   Domain          1000000 non-null  object
 3   TLD             1000000 non-null  object
 4   RefSubNets      1000000 non-null  int64 
 5   RefIPs          1000000 non-null  int64 
 6   IDN_Domain      1000000 non-null  object
 7   IDN_TLD         1000000 non-null  object
 8   PrevGlobalRank  1000000 non-null  int64 
 9   PrevTldRank     1000000 non-null  int64 
 10  PrevRefSubNets  1000000 non-null  int64 
 11  PrevRefIPs      1000000 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 91.6+ MB


In [12]:
domain_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37802 entries, 0 to 37801
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   SourceCommonName  37802 non-null  object
 1   location          37802 non-null  object
 2   Country           37801 non-null  object
dtypes: object(3)
memory usage: 886.1+ KB


In [13]:
df.isna().sum()

article_id             0
source_id          40585
source_name            0
author              2163
title                  0
description           10
url                    0
url_to_image        3451
published_at           0
content                0
category              21
article                0
title_sentiment        0
dtype: int64

In [14]:
domain_df.isna().sum()

SourceCommonName    0
location            0
Country             1
dtype: int64

In [15]:

traffic_df.isna().sum()

GlobalRank        0
TldRank           0
Domain            0
TLD               0
RefSubNets        0
RefIPs            0
IDN_Domain        0
IDN_TLD           0
PrevGlobalRank    0
PrevTldRank       0
PrevRefSubNets    0
PrevRefIPs        0
dtype: int64

In [16]:
df = df.drop_duplicates()
df = df.dropna()

traffic_df = traffic_df.drop_duplicates()
traffic_df = traffic_df.dropna()

domain_df = domain_df.drop_duplicates()
domain_df = domain_df.dropna()

In [17]:
df.isna().sum()

article_id         0
source_id          0
source_name        0
author             0
title              0
description        0
url                0
url_to_image       0
published_at       0
content            0
category           0
article            0
title_sentiment    0
dtype: int64

In [18]:
traffic_df.isna().sum()

GlobalRank        0
TldRank           0
Domain            0
TLD               0
RefSubNets        0
RefIPs            0
IDN_Domain        0
IDN_TLD           0
PrevGlobalRank    0
PrevTldRank       0
PrevRefSubNets    0
PrevRefIPs        0
dtype: int64

In [19]:
domain_df.isna().sum()

SourceCommonName    0
location            0
Country             0
dtype: int64

In [20]:
df.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,article,title_sentiment
2,81694,time,Time,Christina Larson / AP,amphibians are the world’s most vulnerable spe...,"The world’s frogs, salamanders, newts, and oth...",https://time.com/6320467/amphibians-most-vulne...,https://api.time.com/wp-content/uploads/2023/1...,2023-10-04 17:36:18.000000,"The worlds frogs, salamanders, newts and other...",Madagascar,"the world’s frogs, salamanders, newts and othe...",Negative
5,81707,al-jazeera-english,Al Jazeera English,Al Jazeera,unga calls for humanitarian truce in israel-ha...,The resolution passed with 120 votes in favour...,https://www.aljazeera.com/news/2023/10/27/unga...,https://www.aljazeera.com/wp-content/uploads/2...,2023-10-27 22:16:05.000000,The United Nations General Assembly has passed...,Madagascar,the resolution passed with 120 votes in favour...,Neutral
6,81717,bbc-news,BBC News,https://www.facebook.com/bbcnews,malawi heatwave warning issued as temperatures...,"Temperatures could soar to 44C, the weather bu...",https://www.bbc.co.uk/news/world-africa-67086254,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-10-12 10:40:48.000000,Malawians have been warned of an extreme heatw...,Madagascar,malawians have been warned of an extreme heatw...,Neutral
11,81739,rt,RT,RT,russian marine scientists to conduct african r...,Russian scientists from VNIRO will conduct res...,https://www.rt.com/africa/585847-russian-scien...,https://mf.b37mrtl.ru/files/2023.10/article/65...,2023-10-26 11:04:42.000000,Scientists from the All-Russian Research Insti...,Madagascar,scientists from the all-russian research insti...,Neutral
16,81755,al-jazeera-english,Al Jazeera English,Al Jazeera,‘kids whose insides are destroyed’: the return...,Mauritius grapples with a drug epidemic of unp...,https://www.aljazeera.com/features/2023/10/4/k...,https://www.aljazeera.com/wp-content/uploads/2...,2023-10-04 14:43:28.000000,"Port Louis, Mauritius Until early 2022, Gaetan...",Madagascar,the island nation is grappling with a drug epi...,Negative


In [21]:
domain_df.head()

Unnamed: 0,SourceCommonName,location,Country
0,00221.info,SG,Senegal
1,01net.com,FR,France
2,01net.it,IT,Italy
3,0223.com.ar,AR,Argentina
4,022china.com,CH,China


In [22]:
traffic_df.head()

Unnamed: 0,GlobalRank,TldRank,Domain,TLD,RefSubNets,RefIPs,IDN_Domain,IDN_TLD,PrevGlobalRank,PrevTldRank,PrevRefSubNets,PrevRefIPs
0,1,1,google.com,com,471274,2151358,google.com,com,1,1,471248,2147402
1,2,2,facebook.com,com,465424,2250631,facebook.com,com,2,2,465297,2247135
2,3,3,youtube.com,com,420579,1852995,youtube.com,com,3,3,420483,1848940
3,4,4,twitter.com,com,404985,1757127,twitter.com,com,4,4,404998,1753939
4,5,5,instagram.com,com,364746,1598085,instagram.com,com,5,5,364618,1594892


In [23]:
df["title_sentiment"].value_counts()

Neutral     10187
Negative     4912
Positive     1701
Name: title_sentiment, dtype: int64

In [24]:
df.info()
# traffic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16800 entries, 2 to 58349
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   article_id       16800 non-null  int64 
 1   source_id        16800 non-null  object
 2   source_name      16800 non-null  object
 3   author           16800 non-null  object
 4   title            16800 non-null  object
 5   description      16800 non-null  object
 6   url              16800 non-null  object
 7   url_to_image     16800 non-null  object
 8   published_at     16800 non-null  object
 9   content          16800 non-null  object
 10  category         16800 non-null  object
 11  article          16800 non-null  object
 12  title_sentiment  16800 non-null  object
dtypes: int64(1), object(12)
memory usage: 1.8+ MB


In [25]:
# Count the number of articles per source_name
article_counts = df['source_name'].value_counts()

# Get the top 10 websites
top_10_websites = article_counts.head(10)

# Get the bottom 10 websites
bottom_10_websites = article_counts.tail(10)

In [26]:
print("Top 10 websites with the largest count of news articles:")
print(top_10_websites)
print("\nBottom 10 websites with the smallest count of news articles:")
print(bottom_10_websites)

Top 10 websites with the largest count of news articles:
The Times of India    6987
Business Insider      1968
ABC News              1946
BBC News              1939
Al Jazeera English    1587
RT                    1086
Time                   578
Wired                  252
CNN                    252
The Verge              205
Name: source_name, dtype: int64

Bottom 10 websites with the smallest count of news articles:
The Times of India    6987
Business Insider      1968
ABC News              1946
BBC News              1939
Al Jazeera English    1587
RT                    1086
Time                   578
Wired                  252
CNN                    252
The Verge              205
Name: source_name, dtype: int64


In [27]:
# url_list = df['url']
# utils.break_combined_urls(url_list)

In [28]:
sorted_df = traffic_df.sort_values('GlobalRank')

# Display the top 10 websites
top_10_websites = sorted_df['Domain'].head(10)

bottom_10_websites = sorted_df['Domain'].tail(10)

print("Top 10 websites with the highest numbers of visitor traffic:\n")
print(top_10_websites)

print("\n\nBottom 10 websites visitor traffic: \n")
print(bottom_10_websites)

Top 10 websites with the highest numbers of visitor traffic:

0              google.com
1            facebook.com
2             youtube.com
3             twitter.com
4           instagram.com
5            linkedin.com
6               apple.com
7           microsoft.com
8    googletagmanager.com
9           wikipedia.org
Name: Domain, dtype: object


Bottom 10 websites visitor traffic: 

999990              eiretrip.com
999991      exploring-africa.com
999992                  hmag.com
999993            irishcycle.com
999994           keith-baker.com
999995                kireie.com
999996               mt-lock.com
999997             pinkwater.com
999998            soderhomes.com
999999    toyotamusicfactory.com
Name: Domain, dtype: object


In [29]:
# Countries with the highest number of news media organisations (represented by domains in the data)
top10 = domain_df['Country'].value_counts().head(10)

bottom10 = domain_df['Country'].value_counts().tail(10)

print("Top 10 countries with the highest number of news media organisations:\n")
print(top10)
print("\n\nBottom 10 countries with the lowest number of news media organisations:\n")
print(bottom10)


Top 10 countries with the highest number of news media organisations:

United States     14111
United Kingdom     1946
Italy              1804
France             1039
Russia             1020
Canada              886
Germany             884
China               779
Turkey              725
India               686
Name: Country, dtype: int64


Bottom 10 countries with the lowest number of news media organisations:

Greenland         1
Guernsey          1
Isle of Man       1
Cook Islands      1
Guinea-Bissau     1
Micronesia        1
Aruba             1
American Samoa    1
Guadeloupe        1
Saint Helena      1
Name: Country, dtype: int64


In [30]:
category = df['category'].value_counts()
countries = domain_df['Country'].unique()
# Countries that have many articles written about them
country_categories = category[category.index.isin(countries)]
print("Top 10\n")
print(country_categories.head(10))
print("\nBottom 10\n")
print(country_categories.tail(10))

Top 10

India          867
Ukraine        344
Australia      293
Israel         224
Egypt          166
Mexico         138
China          136
Pakistan       136
New Zealand    131
Afghanistan    116
Name: category, dtype: int64

Bottom 10

Montserrat       2
Burkina Faso     2
Guernsey         2
Seychelles       2
Costa Rica       2
Martinique       1
Eritrea          1
Liechtenstein    1
Belize           1
Tajikistan       1
Name: category, dtype: int64


In [51]:
# Websites that reported (the news content) about Africa
region = countries_region.get_region()

df['Region'] = df['category'].map(region)

df[df['Region'] == 'Africa']['source_name'].value_counts()


The Times of India    194
BBC News              136
RT                    129
Al Jazeera English    116
ABC News              105
CNN                    29
Business Insider       21
Time                   20
Wired                   2
The Verge               1
Name: source_name, dtype: int64

In [52]:
# Websites that reported (the news content) about Europe

df[df['Region'] == 'Europe']['source_name'].value_counts()

RT                    246
The Times of India    231
ABC News              198
Business Insider      189
Al Jazeera English    178
BBC News              113
Time                   30
CNN                    23
Wired                   9
The Verge               4
Name: source_name, dtype: int64

In [53]:
# Websites that reported (the news content) about Middle East and Asia

df[df['Region'] == 'Asia']['source_name'].value_counts()

The Times of India    1379
Al Jazeera English     372
ABC News               241
RT                     201
Business Insider       164
BBC News               143
Time                    83
CNN                     29
Wired                    7
The Verge                3
Name: source_name, dtype: int64

In [67]:
# Websites that reported (the news content) about China

df[df['Region'] == 'China']['source_name'].value_counts()

The Times of India    63
RT                    20
Al Jazeera English    17
Business Insider      15
Time                   9
ABC News               6
BBC News               4
CNN                    2
Name: source_name, dtype: int64

In [74]:
# Websites that reported (the news content) about US

df[df['category'] == 'America']['source_name'].value_counts()

Business Insider      50
ABC News              25
The Times of India    15
Time                  12
BBC News               9
Al Jazeera English     3
The Verge              2
RT                     2
CNN                    1
Name: source_name, dtype: int64

In [35]:
#Websites with the highest count of positive sentiment news articles
df[df['title_sentiment'] == 'Positive']['source_name'].value_counts().head(10)

The Times of India    1055
Business Insider       222
ABC News               133
BBC News                64
Wired                   63
The Verge               57
Time                    32
Al Jazeera English      30
RT                      24
CNN                     21
Name: source_name, dtype: int64

In [36]:
#Websites with the highest count of negative sentiment news articles
df[df['title_sentiment'] == 'Negative']['source_name'].value_counts().head(10)


The Times of India    1130
Business Insider       869
BBC News               791
ABC News               693
Al Jazeera English     665
RT                     390
Time                   180
CNN                     98
Wired                   62
The Verge               34
Name: source_name, dtype: int64

In [37]:
#Websites with the highest count of neutral sentiment news articles
df[df['title_sentiment'] == 'Neutral']['source_name'].value_counts().head(10)

The Times of India    4802
ABC News              1120
BBC News              1084
Al Jazeera English     892
Business Insider       877
RT                     672
Time                   366
CNN                    133
Wired                  127
The Verge              114
Name: source_name, dtype: int64