#### 1.0 Setting up the environment

In [1]:
#set up
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import texthero as hero
from PIL import Image

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

import datetime
import nltk
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer

#### 2.0 Loading the Datasets

In [3]:
#reads the datasets
aljazeera = pd.read_csv('n_aljazeera.csv')
bbc = pd.read_csv('n_bbc.csv')
bloomberg = pd.read_csv('n_bloomberg.csv')
conversation_dw = pd.read_csv('n_conversation-dw.csv')
citizen = pd.read_csv('n_citizen.csv')
cgtn_cnbc = pd.read_csv('n_ctgn-cnbc.csv')
east_african = pd.read_csv('n_east_african.csv')
france24 = pd.read_csv('n_france.csv')
guardian = pd.read_csv('n_guardian.csv')
k24 = pd.read_csv('n_k24.csv')
nation = pd.read_csv('n_nation.csv')
nytimes_independent = pd.read_csv('n_nytimes-independent.csv')
people_daily = pd.read_csv('n_pd.csv')
star = pd.read_csv('n_star.csv')
standard = pd.read_csv('n_standard.csv')
observer = pd.read_csv('n_observer.csv')
tribune = pd.read_csv('n_tribune.csv')
independent_uganda = pd.read_csv('n_independent-uganda.csv')
kbc = pd.read_csv('n_kbc.csv')
new_vision = pd.read_csv('n_new_vision.csv')
monitor = pd.read_csv('new_monitor.csv')
capital = pd.read_csv('n_capital-news.csv')

#### 2.1 Preliminary Checks

In [4]:
#Combining the datasets
data = pd.concat([aljazeera, bbc,bloomberg,conversation_dw,cgtn_cnbc, citizen,east_african,france24,guardian,k24,
                 nation,nytimes_independent,people_daily,star,standard, observer, tribune, monitor,
                  independent_uganda,new_vision,kbc,capital])
data.shape

(621, 8)

In [5]:
#check the first 5 rows of the dataframe
data.head()

Unnamed: 0,link,news_country,newspaper_name,text,title,date,keywords,summary
0,https://www.aljazeera.com/videos/2020/04/10/ke...,Qatar,aljazeera,Kenya locust threat: Fears second wave will be...,Kenya locust threat: Fears second wave will be...,10/04/2020,"['threat', 'resources', 'kenya', 'wave', 'harv...",Kenya locust threat: Fears second wave will be...
1,https://www.aljazeera.com/news/2020/1/25/east-...,Qatar,aljazeera,Billions of locusts swarming through East Afri...,East Africa locust outbreak sparks calls for i...,25/01/2020,"['africa', 'outbreak', 'region', 'kenya', 'dro...",Billions of locusts swarming through East Afri...
2,https://www.aljazeera.com/news/2020/4/13/locus...,Qatar,aljazeera,The UN says locusts in Ethiopia have damaged 2...,Locust invasion creates food crisis for 1 mill...,13/04/2020,"['creates', 'ethiopia', 'sudan', 'region', 'li...",The UN says locusts in Ethiopia have damaged 2...
3,https://www.aljazeera.com/gallery/2020/1/20/in...,Qatar,aljazeera,A serious outbreak of locusts is spreading in ...,In Pictures: Locust outbreak spreads across Ea...,20/01/2020,"['africa', 'reported', 'outbreak', 'sudan', 'p...",A serious outbreak of locusts is spreading in ...
4,https://www.aljazeera.com/gallery/2020/2/18/in...,Qatar,aljazeera,Uganda has scrambled to respond to the arrival...,In Pictures: Desert locusts swarm parts of Eas...,18/02/2020,"['africa', 'york', 'million', 'outbreak', 'ken...",Uganda has scrambled to respond to the arrival...


In [6]:
#change name of newspaper_name column
data = data.rename(columns={'newspaper_name':'news_website'})
data.columns

Index(['link', 'news_country', 'news_website', 'text', 'title', 'date',
       'keywords', 'summary'],
      dtype='object')

In [7]:
#Check the data types of the columns
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621 entries, 0 to 23
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   link          621 non-null    object
 1   news_country  621 non-null    object
 2   news_website  621 non-null    object
 3   text          621 non-null    object
 4   title         621 non-null    object
 5   date          565 non-null    object
 6   keywords      577 non-null    object
 7   summary       577 non-null    object
dtypes: object(8)
memory usage: 43.7+ KB


In [8]:
#Check missing values
data.isnull().sum()

link             0
news_country     0
news_website     0
text             0
title            0
date            56
keywords        44
summary         44
dtype: int64

#### 3.0 Preprocessing

##### 3.1 Cleaning the Text Column

In [9]:
#using texthero for preprocessing
data['clean_text'] = hero.clean(data['text'])
data.head()

Unnamed: 0,link,news_country,news_website,text,title,date,keywords,summary,clean_text
0,https://www.aljazeera.com/videos/2020/04/10/ke...,Qatar,aljazeera,Kenya locust threat: Fears second wave will be...,Kenya locust threat: Fears second wave will be...,10/04/2020,"['threat', 'resources', 'kenya', 'wave', 'harv...",Kenya locust threat: Fears second wave will be...,kenya locust threat fears second wave worse ha...
1,https://www.aljazeera.com/news/2020/1/25/east-...,Qatar,aljazeera,Billions of locusts swarming through East Afri...,East Africa locust outbreak sparks calls for i...,25/01/2020,"['africa', 'outbreak', 'region', 'kenya', 'dro...",Billions of locusts swarming through East Afri...,billions locusts swarming east africa could pr...
2,https://www.aljazeera.com/news/2020/4/13/locus...,Qatar,aljazeera,The UN says locusts in Ethiopia have damaged 2...,Locust invasion creates food crisis for 1 mill...,13/04/2020,"['creates', 'ethiopia', 'sudan', 'region', 'li...",The UN says locusts in Ethiopia have damaged 2...,un says locusts ethiopia damaged hectares crop...
3,https://www.aljazeera.com/gallery/2020/1/20/in...,Qatar,aljazeera,A serious outbreak of locusts is spreading in ...,In Pictures: Locust outbreak spreads across Ea...,20/01/2020,"['africa', 'reported', 'outbreak', 'sudan', 'p...",A serious outbreak of locusts is spreading in ...,serious outbreak locusts spreading parts east ...
4,https://www.aljazeera.com/gallery/2020/2/18/in...,Qatar,aljazeera,Uganda has scrambled to respond to the arrival...,In Pictures: Desert locusts swarm parts of Eas...,18/02/2020,"['africa', 'york', 'million', 'outbreak', 'ken...",Uganda has scrambled to respond to the arrival...,uganda scrambled respond arrival biggest locus...


In [10]:
data.shape

(621, 9)

##### 3.2 Cleaning Title Column

In [11]:
#using texthero for preprocessing
data['clean_title'] = hero.clean(data['title'])
data.head()

Unnamed: 0,link,news_country,news_website,text,title,date,keywords,summary,clean_text,clean_title
0,https://www.aljazeera.com/videos/2020/04/10/ke...,Qatar,aljazeera,Kenya locust threat: Fears second wave will be...,Kenya locust threat: Fears second wave will be...,10/04/2020,"['threat', 'resources', 'kenya', 'wave', 'harv...",Kenya locust threat: Fears second wave will be...,kenya locust threat fears second wave worse ha...,kenya locust threat fears second wave worse ha...
1,https://www.aljazeera.com/news/2020/1/25/east-...,Qatar,aljazeera,Billions of locusts swarming through East Afri...,East Africa locust outbreak sparks calls for i...,25/01/2020,"['africa', 'outbreak', 'region', 'kenya', 'dro...",Billions of locusts swarming through East Afri...,billions locusts swarming east africa could pr...,east africa locust outbreak sparks calls inter...
2,https://www.aljazeera.com/news/2020/4/13/locus...,Qatar,aljazeera,The UN says locusts in Ethiopia have damaged 2...,Locust invasion creates food crisis for 1 mill...,13/04/2020,"['creates', 'ethiopia', 'sudan', 'region', 'li...",The UN says locusts in Ethiopia have damaged 2...,un says locusts ethiopia damaged hectares crop...,locust invasion creates food crisis million et...
3,https://www.aljazeera.com/gallery/2020/1/20/in...,Qatar,aljazeera,A serious outbreak of locusts is spreading in ...,In Pictures: Locust outbreak spreads across Ea...,20/01/2020,"['africa', 'reported', 'outbreak', 'sudan', 'p...",A serious outbreak of locusts is spreading in ...,serious outbreak locusts spreading parts east ...,pictures locust outbreak spreads across east a...
4,https://www.aljazeera.com/gallery/2020/2/18/in...,Qatar,aljazeera,Uganda has scrambled to respond to the arrival...,In Pictures: Desert locusts swarm parts of Eas...,18/02/2020,"['africa', 'york', 'million', 'outbreak', 'ken...",Uganda has scrambled to respond to the arrival...,uganda scrambled respond arrival biggest locus...,pictures desert locusts swarm parts east africa


##### 3.3 Formating Date column

In [12]:
data['date'].unique()

array(['10/04/2020', '25/01/2020', '13/04/2020', '20/01/2020',
       '18/02/2020', '03/04/2020', '09/04/2020', '19/02/2020',
       '21/01/2020', '19/04/2020', '10/02/2020', nan, '22/03/2020',
       '10/09/2020', '05/08/2020', '21/05/2020', '08/04/2020',
       '13/05/2020', '2020-05-13 13:47:28+00:00',
       '2020-08-20 20:41:32+02:00', '2020-10-07 14:56:06+00:00',
       '2020-04-24 09:07:36+00:00', '2020-01-17 09:37:46+00:00',
       '2020-02-03 14:18:37+00:00', '2020-02-25 13:49:33+00:00',
       '2020-03-16 15:49:31+00:00', '18/05/2020', '20/02/2020',
       '28/02/2020', '23/03/2020', '24/02/2020', '04/04/2020',
       '19/05/2020', '02/02/2020', '19/06/2020', '31/12/2019',
       '01/02/2020', '15/08/2020', '14/05/2020', '26/07/2019',
       '14/02/2020', '11/02/2020', '09/02/2020 20:45', '10/02/2020 20:45',
       '19/02/2020 08:18', '20/01/2020 20:45', '10/05/2020 07:40',
       '31/01/2020 20:45', '19/08/2013 08:12', '05/07/2020 13:23',
       '05/07/2020 13:39', '05/07/20

In [13]:
# convert the 'Date' column to datetime format
data['date']= pd.to_datetime(data['date'],utc=True)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621 entries, 0 to 23
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   link          621 non-null    object             
 1   news_country  621 non-null    object             
 2   news_website  621 non-null    object             
 3   text          621 non-null    object             
 4   title         621 non-null    object             
 5   date          565 non-null    datetime64[ns, UTC]
 6   keywords      577 non-null    object             
 7   summary       577 non-null    object             
 8   clean_text    621 non-null    object             
 9   clean_title   621 non-null    object             
dtypes: datetime64[ns, UTC](1), object(9)
memory usage: 53.4+ KB


In [15]:
data['new_date'] = pd.to_datetime(data['date'].dt.strftime('%Y-%m-%d'))

In [16]:
data.head(2)

Unnamed: 0,link,news_country,news_website,text,title,date,keywords,summary,clean_text,clean_title,new_date
0,https://www.aljazeera.com/videos/2020/04/10/ke...,Qatar,aljazeera,Kenya locust threat: Fears second wave will be...,Kenya locust threat: Fears second wave will be...,2020-10-04 00:00:00+00:00,"['threat', 'resources', 'kenya', 'wave', 'harv...",Kenya locust threat: Fears second wave will be...,kenya locust threat fears second wave worse ha...,kenya locust threat fears second wave worse ha...,2020-10-04
1,https://www.aljazeera.com/news/2020/1/25/east-...,Qatar,aljazeera,Billions of locusts swarming through East Afri...,East Africa locust outbreak sparks calls for i...,2020-01-25 00:00:00+00:00,"['africa', 'outbreak', 'region', 'kenya', 'dro...",Billions of locusts swarming through East Afri...,billions locusts swarming east africa could pr...,east africa locust outbreak sparks calls inter...,2020-01-25


In [17]:
data['DayOfWeek'] = data['new_date'].dt.day_name()

In [18]:
data['month_published'] = data['new_date'].dt.month_name()

In [19]:
data['year_published'] = data['new_date'].dt.year

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621 entries, 0 to 23
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   link             621 non-null    object             
 1   news_country     621 non-null    object             
 2   news_website     621 non-null    object             
 3   text             621 non-null    object             
 4   title            621 non-null    object             
 5   date             565 non-null    datetime64[ns, UTC]
 6   keywords         577 non-null    object             
 7   summary          577 non-null    object             
 8   clean_text       621 non-null    object             
 9   clean_title      621 non-null    object             
 10  new_date         565 non-null    datetime64[ns]     
 11  DayOfWeek        565 non-null    object             
 12  month_published  565 non-null    object             
 13  year_published   565 

In [21]:
#Saves the csv file of the clean_data
data.to_csv('clean_data.csv', index=False)