# Google News scraping with the GNews package

## Environment set-up and getting the company sample from the sustainability reports

In [None]:
from gnews import GNews
from newspaper import Article
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import requests
import os
import spacy
from tqdm import tqdm

In [None]:
# Build the path from its components so the separator matches the host OS
# (a hard-coded '..\\data\\reports' only resolves on Windows).
path_reports = os.path.join('..', 'data', 'reports')

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the company order
# (and therefore any positional slice of it) is the same on every run.
sample = sorted(os.listdir(path_reports))

In [None]:
# Keep the part of each file name before the first '.' and store the result
# as a pandas Series, which is easier to manipulate than a plain list.
company_list = pd.Series([file_name.split('.')[0] for file_name in sample])

We plan to use these company names as search terms in Google News. Before doing so, we need to replace the ampersand character with '%26' so that the search engine reads it as a literal ampersand - this only applies to P&G and H&M.

In [None]:
# Search terms: the company names with '&' written as '%26'
search_list = company_list.str.replace('&', '%26')

I also adjust the company names here so that they can be used to exclude the companies' own publishers - see below. This is subject to change depending on which websites the companies use to share news.

In [None]:
# Publisher-matching names derived from the company names
publisher_list = (
    company_list.str.replace('-', ' ')  # hyphenated names become two words
    .str.replace('ford motor', 'ford')
    .str.replace('p&g', 'procter')
)

## Creating a search loop on Google News

First, I define the GNews object. We search English-language news published after 1 January 2021 (the date restriction is part of the search queries below). This limits the results; for further research it might be useful to extend the search to other languages and translate the articles.

In [None]:
# GNews client configured with language='en' (English-language news)
gn = GNews(language = 'en')

In [None]:
## creating a function to filter publishers so as to not include the companies themselves
def filter_publisher(publisher, df):
    """Drop the rows of ``df`` that were published by the company itself.

    A row is removed when ``publisher`` occurs as a substring of the
    lower-cased 'publisher_name' or 'publisher_link' value of that row.

    Parameters
    ----------
    publisher : str
        Text identifying the company; compared against lower-cased values,
        so it should be lower case itself.
    df : pandas.DataFrame
        Frame with string columns 'publisher_name' and 'publisher_link'.

    Returns
    -------
    pandas.DataFrame
        The remaining rows of ``df`` with their original index labels.
    """
    # Iterate over the values instead of ``df[col][i]`` for ``i in range(len(df))``:
    # the latter is a label lookup and fails unless the index is exactly 0..n-1.
    keep = [
        publisher not in name.lower() and publisher not in link.lower()
        for name, link in zip(df['publisher_name'], df['publisher_link'])
    ]
    # A boolean Series (rather than a bare list) keeps the columns intact even
    # when ``df`` is empty, where ``df[[]]`` would select zero columns.
    return df[pd.Series(keep, index=df.index, dtype=bool)]

In [None]:
# creating a function that keeps the rows whose column value matches a keyword
def filter_on_column(keywords, df, column):
    """Keep the rows of ``df`` whose ``column`` value matches any keyword.

    The keywords are joined with '|' into one regular expression, so each
    keyword is interpreted as a regex alternative and matched anywhere in
    the value.

    Parameters
    ----------
    keywords : iterable of str
        Keywords (regex fragments); duplicates are ignored.
    df : pandas.DataFrame
        Frame to filter.
    column : iterable of str
        One string per row of ``df``, in row order.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with their original index labels.
    """
    # The text is lower-cased before matching, so the pattern must be
    # case-insensitive too; otherwise keywords such as 'CO2' can never match.
    pattern = re.compile('|'.join(set(keywords)), flags=re.IGNORECASE)
    masking = [bool(pattern.search(string.lower())) for string in column]
    # A boolean Series keeps the columns intact even for an empty frame.
    return df[pd.Series(masking, index=df.index, dtype=bool)]

In [None]:
# creating a function that removes the rows whose column value matches a keyword
def filter_out_column(keywords, df, column):
    """Remove the rows of ``df`` whose ``column`` value matches any keyword.

    The keywords are joined with '|' into one regular expression, so each
    keyword is interpreted as a regex alternative and matched anywhere in
    the value.

    Parameters
    ----------
    keywords : iterable of str
        Keywords (regex fragments); duplicates are ignored.
    df : pandas.DataFrame
        Frame to filter.
    column : iterable of str
        One string per row of ``df``, in row order.

    Returns
    -------
    pandas.DataFrame
        The non-matching rows of ``df`` with their original index labels.
    """
    # The text is lower-cased before matching, so the pattern must be
    # case-insensitive too; otherwise keywords containing capitals
    # (e.g. 'ESG performance report') can never match.
    pattern = re.compile('|'.join(set(keywords)), flags=re.IGNORECASE)
    masking = [pattern.search(string.lower()) is None for string in column]
    # A boolean Series keeps the columns intact even for an empty frame.
    return df[pd.Series(masking, index=df.index, dtype=bool)]

In [None]:
# Three keyword groups are searched separately: Google News has a limit of 100
# articles and too many keywords cause the search function to act strangely.
keyword_groups = (
    'sustainability OR sustainable OR climate OR environment OR environmental OR pollution OR pollute OR emission OR solar OR recycle OR recycling',
    'emissions OR recycles OR recycled OR pollutes OR polluted OR polluting OR wind OR plastic OR deforestation OR greenhouse OR waste OR biodiversity OR renewable',
    'reusing OR reuse OR reused OR reuses OR reusable OR biodegradable OR circular OR CO2 OR ecology OR ecological OR ecosystem OR greenwash OR greenwashing',
)

# one frame per company, concatenated once at the end (concatenating inside
# the loop copies the accumulated frame on every iteration)
company_frames = []
for idx, company in enumerate(search_list[:30]):
    # generate search results - a list of dictionaries, one per article
    search = []
    for keywords in keyword_groups:
        search += gn.get_news(f'allintitle:{company} {keywords} after:2021-01-01') or []

    # a company without any hit would produce a frame without a 'publisher'
    # column and crash below, so skip it
    if not search:
        continue

    # create a temporary data frame from search results
    temp_df = pd.DataFrame.from_dict(search)

    # get publisher link and name from the publisher column
    temp_df['publisher_link'] = temp_df['publisher'].apply(lambda x: x['href'])
    temp_df['publisher_name'] = temp_df['publisher'].apply(lambda x: x['title'])
    temp_df = temp_df.drop(['description', 'publisher'], axis=1)

    # removing the publisher if the publisher is the company itself by checking
    # whether the company name is in the publisher name or link
    temp_df = filter_publisher(publisher_list[idx], temp_df)

    # remove any duplicates in temp_df; work on an explicit copy so the column
    # assignment below does not write into a filtered view
    temp_df = temp_df.drop_duplicates(subset=['title']).copy()

    # add the company name as a column
    temp_df['company'] = company_list[idx]

    company_frames.append(temp_df)

# add all temporary frames to our full df
full_df = pd.concat(company_frames) if company_frames else pd.DataFrame()

## Getting the full article texts and full article titles

In [None]:
%%capture
text_lst = []
title_lst = []
# Fetch every URL so that both lists line up row-for-row with full_df; iterating
# over a partial slice (e.g. [2000:]) yields shorter lists and the column
# assignments below then fail with a length mismatch.
for link in full_df['url']:
    try:
        article = gn.get_full_article(link)
        # read both attributes before appending, so a failure on either one
        # cannot leave the two lists with different lengths
        text, title = article.text, article.title
    except Exception:
        text = title = 'webscraping not possible'
    text_lst.append(text)
    title_lst.append(title)
full_df['text'] = text_lst
full_df['title_full'] = title_lst

In [None]:
# Load the saved scrape results from disk. The in-memory copy was overwritten
# immediately and raised a NameError whenever the scraping cells were skipped.
# df_complete = full_df.copy()
df_complete = pd.read_csv('df_full.csv')

In [None]:
# keep only the rows for which the webscraping was possible
scrape_succeeded = df_complete['title_full'].ne('webscraping not possible')
df_filtered = df_complete[scrape_succeeded]

In [None]:
# # creating a function to filter for sustainability topics based on keywords
# def filter_sustainability(keywords,df):
#     # creating a regex pattern
#     pattern = '|'.join(list(set(keywords)))
#     # creating the masking filter 
#     masking = []
#     for title in df['title_full']:
#         if re.search(pattern,title.lower()):
#             masking.append(True)
#         else: 
#             masking.append(False)
#     return df[masking]

In [None]:
# sustainability keywords used for topic detection in the titles
sust_keywords = [
    'biodiversity', 'climate', 'ecology', 'environment', 'emission',
    'pollution', 'sustainable', 'CO2', 'deforestation', 'greenhouse',
    'greenwash', 'COP2', 'pollutant', 'ecosystem', 'waste',
    'sustain', 'sustainability', 'solar', 'recycle', 'wind',
    'renewable', 'water', 'plastic', 'circular', 'biodegradable',
]
# reduce every keyword to its Porter stem
stemmer = PorterStemmer()
stemmed_sust = list(map(stemmer.stem, sust_keywords))

In [None]:
# keep only the articles whose full title matches at least one stemmed sustainability keyword
df_filtered =  filter_on_column(stemmed_sust, df_filtered,df_filtered['title_full'])

In [None]:
# removing empty text
# NOTE(review): astype(bool) turns empty strings into False, but a missing value (NaN)
# is presumably truthy here, so rows whose text is NaN would be kept - confirm this is intended
df_filtered = df_filtered[df_filtered['text'].astype(bool)]

In [None]:
# drop the 'title' column ('title_full' is the title column used from here on) and display the shape
df_filtered.drop('title', axis = 1, inplace = True)
df_filtered.shape

In [None]:
# renumber the rows 0..n-1; the hard-coded row labels in to_drop further down are applied to this frame
df_filtered.reset_index(inplace=True, drop=True)

In [None]:
# companies with fewer than 30 articles
article_counts = df_filtered['company'].value_counts()
article_counts[article_counts < 30]

Beiersdorf has very few articles, which is not ideal, but we can work with this for now. To expand the number, I can either include more keywords in the initial search or expand the time range.

# Data Cleaning and Imputation

In [None]:
# Report every row whose text cannot be split into sentences, so that it can
# be repaired or dropped by hand.
for i, text in df_filtered['text'].items():
    try:
        sent_tokenize(text)
    except Exception:
        # catch Exception rather than using a bare except, so that interrupting
        # the kernel still stops the loop
        print(f'error at row {i}')

In [None]:
# Replacement article URLs; the imputation loop below consumes them in order,
# one for each remaining row whose text cannot be sentence-tokenized.
new_links = ['https://global.chinadaily.com.cn/a/202205/30/WS6294181ba310fd2b29e5fad8.html','https://www.biobased-diesel.com/post/nasa-boeing-gather-data-to-aid-saf-adoption',
            'https://www.bluebiz.com/en/sustainability/innovation-hub/news/boeing-teams-up-with-mit-scientists/#:~:text=Boeing%20is%20partnering%20with%20scientists,the%20carbon%20emissions%20from%20aviation.','https://www.flyingmag.com/boeing-purchases-2-million-gallons-of-sustainable-aviation-fuel/',
            'https://www.upstreamonline.com/energy-transition/chevron-delta-and-google-collaborate-in-biojet-fuel-data-tracking-plan/2-1-1064719', 'https://techtalksummits.com/news/tech-news/cisco-hp-and-dell-chasing-the-huge-360-ecosystem-goal-with-complimentary-tactics',
            'https://euneighbourseast.eu/news/latest-news/ray-of-hope-eu-announces-donation-of-5700-solar-panels-to-ukraine/',
            'https://www.rttnews.com/3312069/nasa-google-team-up-to-help-local-governments-improve-tracking-air-pollution.aspx',
            'https://weibold.com/pyrum-to-recycle-end-of-life-tires-from-mercedes-benz-vehicles-in-future',
            'https://www.intelligentdatacentres.com/2022/11/11/airtrunk-and-clp-power-announce-innovative-renewable-energy-solution-in-hong-kong-for-microsoft/',
            'https://www.eaglevoice.com/news/kirkwood-to-receive-over-300-solar-panels-in-donation/','https://finance.yahoo.com/news/shells-cracker-plant-pollution-prompts-154630697.html',
            'https://lanxess.com/en/Media/Press-Releases/2023/01/LANXESS-and-TotalEnergies-to-cooperate-on-sustainable-styrene',
            'https://www.freightcarbonzero.com/fcz-companies/volvo-trucks/volvo-lng-trucks-assists-arla-foods-in-reducing-carbon-emissions/543.supplierarticle',
             'https://www.prnewswire.com/news-releases/volvo-trucks-showcases-new-zero-emissions-truck-301571323.html',
             'https://thehill.com/policy/equilibrium-sustainability/3819867-walmart-stores-in-6-states-no-longer-provide-single-use-bags-at-checkout-which-states-are-next/'
            ]
# presumably the row labels the replacement links above belong to - TODO confirm
#766,859,871,890,1055,1580,1773,2065,3132,3355,3652,4381,5011,5594,5603,5766
#2093,2304,2459,2694,2892,3646,3659,4992,5130,5741 filled in manually
# row labels of df_filtered that are removed entirely
to_drop = [62,234,286,446,696,904,918,932,1341,1565,1634,2440,2458,2514,2592,2640,2734,2829,3328,3411,3534,3617,3718,
          3783,3875,4205,4330,4397,4946,5580,5732,5761]
#606,62 require signup/buying subscription

In [None]:
# remove the rows whose labels are listed in to_drop
df_clean = df_filtered.drop(to_drop)

In [None]:
%%capture

# using the new links to get the missing text
# first collect the labels of the rows whose text cannot be sentence-tokenized
rows_missing_text = []
for i, text in df_clean['text'].items():
    try:
        sent_tokenize(text)
    except Exception:
        rows_missing_text.append(i)

# every broken row needs its own replacement link
if len(rows_missing_text) > len(new_links):
    raise IndexError(
        f'{len(rows_missing_text)} rows need new text but only {len(new_links)} links were provided'
    )

# the links are consumed in order, one per broken row
for i, link in zip(rows_missing_text, new_links):
    article = gn.get_full_article(link)
    # write through the DataFrame itself: the row yielded by iterrows() is a
    # copy, so assigning to it never reaches df_clean
    df_clean.at[i, 'text'] = article.text

I need to drop the duplicates in the entire dataframe and the publishers as well, instead of just running it on the temporary dataframe above. This is because certain companies may share certain articles.

In [None]:
# drop articles that share the same full title across the whole dataframe, then display the shape
df_clean.drop_duplicates(subset = ['title_full'], inplace = True)
df_clean.shape

In [None]:
# select the rows whose publisher name matches any entry of publisher_list, for manual inspection
checking = filter_on_column(publisher_list, df_clean, df_clean['publisher_name'])
#found Microsoft and Walmart corporate

In [None]:
# exclude the articles published by the companies' own outlets
corporate_publishers = ['Microsoft', 'Walmart Corporate']
df_clean = df_clean[~df_clean['publisher_name'].isin(corporate_publishers)]

In [None]:
# renumber the rows 0..n-1 after the removals
df_clean.reset_index(inplace=True, drop=True)

In [None]:
# df_clean.to_csv('df_clean.csv',index = False)

## Removing duplicate articles based on title similarity

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util, models
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# sentence-embedding model used to encode the article titles
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2') # this model was trained for paraphrasing so it should work quite well for this task

In [None]:
# define a function to remove duplicate articles based on title similarity
def remove_duplicates(df, threshold):
    """Remove articles whose title is a near-duplicate of an earlier title.

    The titles are embedded with the module-level ``model``. Going through
    the rows in order, every later row whose similarity to a kept row exceeds
    ``threshold`` is dropped; rows already dropped are not used as a
    reference for further comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'title_full'.
    threshold : float
        Similarity above which two titles count as duplicates.

    Returns
    -------
    pandas.DataFrame
        A new frame without the duplicate rows; index labels are kept.
    """
    titles = df['title_full'].tolist()
    # nothing to compare with fewer than two titles
    if len(titles) < 2:
        return df.copy()

    embeddings = model.encode(titles)
    duplicate_positions = set()
    for i in range(len(titles) - 1):
        if i in duplicate_positions:
            continue
        # One vectorised call compares title i with all later titles.
        # NOTE(review): the helper ``get_similarity`` used before is not
        # defined in this notebook; cosine similarity (already imported from
        # scikit-learn) is assumed to be the intended measure - confirm.
        similarities = cosine_similarity(embeddings[i:i + 1], embeddings[i + 1:])[0]
        duplicate_positions.update(
            i + 1 + int(offset) for offset in (similarities > threshold).nonzero()[0]
        )

    # select by position, so the result does not depend on df having a
    # 0..n-1 index
    keep_positions = [pos for pos in range(len(titles)) if pos not in duplicate_positions]
    return df.iloc[keep_positions]

In [None]:
# drop near-duplicate titles using a similarity threshold of 0.8, then display the shape
df_nodup = remove_duplicates(df_clean, 0.8)
df_nodup.shape

## Using Spacy to check for only named entities as companies

In [None]:
# spaCy pipeline 'en_core_web_trf'; its entities (doc.ents) are used for the title check below
nlp = spacy.load('en_core_web_trf')

In [None]:
# checking for companies, for which the name could be ambiguous
checklist = ['apple','shell'] 

In [None]:
# collect the labels of the rows whose title does not mention the company as a named entity
ner_filter = []
for i, row in tqdm(df_nodup.iterrows(), total=df_nodup.shape[0]):
    title = row['title_full']
    company_raw = row['company']
    # only the ambiguous company names need the entity check
    if company_raw not in checklist:
        continue

    company = company_raw.replace('-', ' ')
    if company == 'mcdonald':
        company = "mcdonald's"

    # lower-cased text of all entities recognised in the title
    doc = nlp(title)
    entity_text = ' '.join(str(entity).lower() for entity in doc.ents)
    if company not in entity_text:
        ner_filter.append(i)

In [None]:
# we find there is one missing value for the title full - removed
# (the first line only selects the affected rows; its result is not stored)
df_nodup[df_nodup['title_full'].isnull()]
df_nodup.dropna(subset = ['title_full'], inplace = True)

In [None]:
# remove the rows flagged by the entity check (labels collected in ner_filter)
df_nodup.drop(ner_filter, inplace = True)

In [None]:
# display the shape after the entity-based filtering
df_nodup.shape

## Filtering out company communication keywords

Some articles merely summarize a company's sustainability report. We do not want these, because we separate substantial from symbolic actions based on the document type (report vs. news). We therefore filter out articles whose titles contain the keywords below.

In [None]:
# Keywords that mark company communication. They are matched against
# lower-cased titles, so every keyword must be lower case itself
# ('ESG performance report' could never match).
communication_keywords = ['annual report','progress report', 'sustainability report','impact report',
                          'financial report','esg performance report','environmental report',
                         'head of sustainability', 'sustainability head', 'chief of sustainability']

In [None]:
# remove the articles whose full title matches any company-communication keyword
df_nodup = filter_out_column(communication_keywords, df_nodup, df_nodup['title_full'])

In [None]:
# renumber the rows 0..n-1 after the filtering
df_nodup.reset_index(inplace = True, drop = True)

## Only keeping articles published from 2021 onwards and only keeping McDonald's or Ronald McDonald in the dataset

In [None]:
# continue from the saved file 'df_clean.csv' instead of the in-memory df_nodup
# df_clean = df_nodup.copy()
df_clean = pd.read_csv('df_clean.csv')

In [None]:
# parse the publication timestamps, then derive the publication year from them
published = pd.to_datetime(df_clean['published date'])
df_clean['published date'] = published
df_clean['published_year'] = published.dt.year

In [None]:
# keep only the articles published in 2021 or later
df_clean = df_clean[df_clean['published_year']>2020]

In [None]:
# spellings that identify the restaurant chain (straight and curly apostrophe)
mcdonalds_spellings = ("mcdonald's", "ronald mcdonald", "mcdonald’s")

# labels of the 'mcdonald' rows whose title contains none of these spellings
non_mcdonalds = []
for i, row in df_clean[df_clean['company'] == 'mcdonald'].iterrows():
    lowered_title = row['title_full'].lower()
    if not any(spelling in lowered_title for spelling in mcdonalds_spellings):
        non_mcdonalds.append(i)

len(non_mcdonalds)

In [None]:
# remove the 'mcdonald' rows whose title does not refer to McDonald's, then display the shape
df_clean.drop(non_mcdonalds, inplace = True)
df_clean.shape

## Visualizations included in the thesis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _save_bar_chart(counts, color, title, xlabel, file_name, rotate_labels=False):
    """Draw ``counts`` as a bar chart, save it to ``file_name`` and show it."""
    plt.figure(figsize=(10,5))
    sns.barplot(x=counts.index, y=counts.values, color=color)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Number of Articles')
    if rotate_labels:
        plt.xticks(rotation=90) # Rotate x-axis labels by 90 degrees
    plt.savefig(file_name, dpi=300)
    plt.show()


# Number of articles per publication year and per company
df_by_year = df_clean.groupby('published_year')['text'].count()
df_by_company = df_clean.groupby('company')['text'].count()

# Bar chart of the number of articles per year
_save_bar_chart(df_by_year, 'skyblue', 'Number of Articles per Year', 'Year',
                'articles_per_year.png')

# Bar chart of the number of articles per company
_save_bar_chart(df_by_company, 'salmon', 'Number of Articles per Company', 'Company',
                'articles_per_company.png', rotate_labels=True)

## Creating and cleaning the article sentence dataframe

In [None]:
# collect one record per sentence and build the dataframe once at the end;
# concatenating a one-row frame per sentence copies the whole frame every time
sentence_rows = []

# loop over each row in the original dataframe and split the text into sentences
for i, row in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    company = row['company']

    # use the sentence tokenizer to split the text into sentences
    for token in sent_tokenize(row['text']):
        # a blank line inside a tokenized sentence separates two sentences
        for sentence in token.split('\n\n'):
            sentence_rows.append(
                {
                    'doc_type': 'news',
                    'company': company,
                    # dealing with new lines inside the text: collapse all whitespace
                    'sentence': ' '.join(sentence.split()),
                }
            )

# passing the columns explicitly keeps them present even without any sentence
df_article = pd.DataFrame(sentence_rows, columns=['doc_type', 'company', 'sentence'])

## Remove non-ASCII values and other things

In [None]:
# remove non-ASCII characters since BERT can't read those
# (curly apostrophes are first turned into straight ones so they are not lost)
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column, which is not guaranteed to modify df_article itself.
df_article['sentence'] = (
    df_article['sentence']
    .str.replace('’', "'")
    .replace(r'[^\x00-\x7F]+', '', regex=True)
)

In [None]:
# drop double quotes and repair the company names damaged by the non-ASCII removal
sentences = df_article["sentence"].str.replace('"','')
sentences = sentences.str.replace('Nestl ', 'Nestle ')
sentences = sentences.str.replace('Mondel z', 'Mondelez')

# remove most hyperlinks and square brackets
# original: http\S+|\[.\]:?|www\S+|\w+/\S+|\w+-\w+-\S+
sentences = sentences.replace(r'http\S+|\[.\]:?|www\S+|\w+/\S+|\w+-\w+-\S+|\[|\]','',regex = True)
# remove whitespaces at the end and beginning of a sentence
sentences = sentences.replace(r'^\s+|\s+$','',regex=True)
# replace double spaces with one space
sentences = sentences.replace(r'\s{2,}',' ',regex=True)

df_article["sentence"] = sentences
# number of whitespace-separated words per sentence
df_article["word count"] = [len(words) for words in sentences.str.split()]

In [None]:
# keep the sentences with more than 5 and fewer than 100 words
word_counts = df_article["word count"]
df_article = df_article[(word_counts > 5) & (word_counts < 100)]

In [None]:
# dropping any duplicate sentences - surprisingly there are a lot of them
# (only the 'sentence' column is compared, whichever company the sentence belongs to)
df_article.drop_duplicates(subset = ['sentence'], inplace = True)

In [None]:
# display the shape after dropping the duplicate sentences
df_article.shape

In [None]:
# define a function to check if a sentence is comprised of more than half uppercase characters
def is_mostly_uppercase(sentence):
    """Return True when more than half of the characters are upper case.

    All characters count towards the total, including spaces, digits and
    punctuation. An empty string is never mostly upper case.
    """
    # guard against dividing by zero for an empty sentence
    if not sentence:
        return False
    uppercase_count = sum(1 for c in sentence if c.isupper())
    return uppercase_count / len(sentence) > 0.5

# apply the function to the 'sentence' column and keep only the rows where it returns False
df_article = df_article[~df_article['sentence'].apply(is_mostly_uppercase)]

# display the resulting dataframe
df_article

In [None]:
# renumber the rows 0..n-1 after the filtering
df_article.reset_index(inplace = True, drop = True)

In [None]:
# NOTE(review): path_data was never defined in this notebook, so the save
# raised a NameError. The parent folder of the reports directory ('../data')
# is assumed to be the output location - confirm.
path_data = os.path.join('..', 'data')
df_article.to_csv(os.path.join(path_data, 'article_sentences_gnews.csv'), index = False)